SNAP Library 2.2, Developer Reference  2014-03-11 19:15:55
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
unicode.cpp
Go to the documentation of this file.
00001 // Unicode.cpp : Defines the entry point for the console application.
00002 //
00003 
00005 // Includes
00006 //#include "unicode.h"
00007 
00008 //-----------------------------------------------------------------------------
00009 // Private declarations of this module
00010 //-----------------------------------------------------------------------------
00011 
00012 namespace {
00013 
00014 class TVectorBuilder2
00015 {
00016 public:
00017         TIntV v;
00018         TVectorBuilder2(int i) { v.Add(i); }
00019         operator TIntV() const { return v; }
00020         TVectorBuilder2& operator ,(int i) { v.Add(i); return *this; }
00021 };
00022 
00023 class TVectorBuilder
00024 {
00025 public:
00026         operator TIntV() const { return TIntV(); }
00027         TVectorBuilder2 operator ,(int i) { return TVectorBuilder2(i); }
00028 };
00029 
00030 TVectorBuilder VB;
00031 
00032 TStr CombinePath(const TStr& s, const TStr& t)
00033 {
00034         int n = s.Len(); if (n <= 0) return t;
00035         if (s[n - 1] == '\\' || s[n - 1] == '/' || s[n - 1] == ':') return s + t;
00036         return s + "\\" + t;
00037 }
00038 
00039 void AssertEq(const TIntV& v1, const TIntV& v2, const TStr& explanation, FILE *f)
00040 {
00041         const int n = v1.Len();
00042         bool ok = (n == v2.Len());
00043         if (ok) for (int i = 0; i < n && ok; i++) ok = ok && (v1[i] == v2[i]);
00044         if (! ok)
00045         {
00046                 if (! f) f = stderr;
00047                 fprintf(f, "%s: [", explanation.CStr());
00048                 for (int i = 0; i < v1.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(v1[i]));
00049                 fprintf(f, "] != [");
00050                 for (int i = 0; i < v2.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(v2[i]));
00051                 fprintf(f, "]\n");
00052                 Fail;
00053         }
00054 }
00055 
00056 };
00057 
00058 //-----------------------------------------------------------------------------
00059 // TUniCodec -- miscellaneous declarations
00060 //-----------------------------------------------------------------------------
00061 
00062 uint TUniCodec::GetRndUint(TRnd& rnd)
00063 {
00064         uint u = rnd.GetUniDevUInt(256) & 0xff;
00065         u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
00066         u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
00067         u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
00068         return u;
00069 }
00070 
00071 uint TUniCodec::GetRndUint(TRnd& rnd, uint minVal, uint maxVal)
00072 {
00073         if (minVal == TUInt::Mn && maxVal == TUInt::Mx) return GetRndUint(rnd);
00074         uint range = maxVal - minVal + 1;
00075         if (range > (uint(1) << (8 * sizeof(uint) - 1)))
00076                 while (true) { uint u = GetRndUint(rnd); if (u < range) return minVal + u; }
00077         uint mask = 1;
00078         while (mask < range) mask <<= 1;
00079         mask -= 1;
00080         while (true) { uint u = GetRndUint(rnd) & mask; if (u < range) return minVal + u; }
00081 }
00082 
00083 bool TUniCodec::IsMachineLittleEndian()
00084 {
00085         static bool isLE, initialized = false;
00086         if (initialized) return isLE;
00087         int i = 1;
00088         if(*(char *)&i == 1) isLE = true;
00089         else isLE = false;
00090 
00091         initialized = true;
00092         return isLE;
00093 }
00094 
00095 //-----------------------------------------------------------------------------
00096 // TUniCodec -- UTF-8 test driver
00097 //-----------------------------------------------------------------------------
00098 
00099 void TUniCodec::TestUtf8(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest, FILE *f)
00100 {
00101         TIntV dest;
00102         if (f) {
00103                 fprintf(f, "Settings: %s  %s  %s   replacementChar = %x\n",
00104                         (errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"),
00105                         (strict ? "STRICT" : ""), (skipBom ? "skipBom" : ""), uint(replacementChar));
00106                 fprintf(f, "src: "); for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %02x" : " %x"), uint(src[i])); }
00107         try
00108         {
00109                 size_t retVal = (decode ? DecodeUtf8(src, 0, src.Len(), dest, true) : EncodeUtf8(src, 0, src.Len(), dest, true));
00110                 if (f) {
00111                         fprintf(f, "\n -> dest:    "); for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" :  " %02x"), uint(dest[i]));
00112                         fprintf(f, "\n    expDest  "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" :  " %02x"), uint(expectedDest[i]));
00113                         fprintf(f, "\n    retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal)); }
00114                 if (retVal != expectedRetVal)
00115                         printf("!!!");
00116                 IAssert(retVal == expectedRetVal); IAssert(! expectedThrow);
00117                 if (dest.Len() != expectedDest.Len())
00118                         printf("!!!");
00119                 IAssert(dest.Len() == expectedDest.Len());
00120                 for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == expectedDest[i]);
00121         }
00122         catch (TUnicodeException e)
00123         {
00124                 if (f) {
00125                         fprintf(f, "\n -> expDest  "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, " %x", uint(expectedDest[i]));
00126                         fprintf(f, "\n    exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar)); }
00127                 IAssert(expectedThrow);
00128         }
00129 }
00130 
00131 // Generates a random UTF-8-encoded stream according to the specifications in 'testCaseDesc',
00132 // then calls TestUtf8 to make sure that DecodeUtf8 reacts as expected.
00133 void TUniCodec::TestDecodeUtf8(TRnd& rnd, const TStr& testCaseDesc)
00134 {
00135         TIntV src; TIntV expectedDest; int expectedRetVal = 0;
00136         bool expectedAbort = false;
00137         FILE *f = 0; // stderr
00138         // testCaseDesc should consist of pairs or triples of characters, 'cd[e]', where:
00139         // - 'c' defines the range from which the codepoint should be taken ('A'..'H', 'X'..'Z');
00140         // - 'd' defines how many bytes the codepoint should be encoded with ('1'..'6');
00141         // - 'e' defines how many bytes will be removed from the end of the encoded sequence for this codepoint.
00142         //   (absent = 0, 'a' = 1, 'b' = 2 and so on).
00143         for (int i = 0; i < testCaseDesc.Len(); )
00144         {
00145                 IAssert(i + 2 <= testCaseDesc.Len());
00146                 const char c = testCaseDesc[i], d = testCaseDesc[i + 1]; i += 2;
00147                 uint cp = 0; int nBytes = -1, minBytes = -1; bool eighties = false;
00148                 IAssert('1' <= d && d <= '6'); nBytes = d - '0';
00149                 if (c == 'A') { cp = GetRndUint(rnd, 0u, 0x7fu); minBytes = 1; } // 1 byte
00150                 else if (c == 'B') { cp = GetRndUint(rnd, 0x80u, 0x7ffu); minBytes = 2; } // 2 bytes
00151                 else if (c == 'C') { cp = GetRndUint(rnd, 0x800u, 0xffffu); minBytes = 3; } // 3 bytes
00152                 else if (c == 'D') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); minBytes = 4; } // 4 bytes, valid Unicode
00153                 else if (c == 'E') { cp = GetRndUint(rnd, 0x110000u, 0x1fffffu); minBytes = 4; } // 4 bytes, invalid Unicode
00154                 else if (c == 'F') { cp = GetRndUint(rnd, 0x200000u, 0x3ffffffu); minBytes = 5; } // 5 bytes
00155                 else if (c == 'G') { cp = GetRndUint(rnd, 0x4000000u, 0x7fffffffu); minBytes = 6; } // 6 bytes, 31 bits
00156                 else if (c == 'H') { cp = GetRndUint(rnd, 0x80000000u, 0xffffffffu); minBytes = 6; } // 6 bytes, 32 bits
00157                 else if (c == 'X') { cp = 0xfffe; minBytes = 3; }
00158                 else if (c == 'Y') { cp = 0xfeff; minBytes = 3; }
00159                 else if (c == 'Z') { eighties = true; minBytes = 1; } // insert several random 10xxxxxx bytes (= 0x80 | random(0..0x3f))
00160                 else Fail;
00161                 IAssert(nBytes >= minBytes);
00162                 // Process 'e'.
00163                 int nToDel = 0;
00164                 if (i < testCaseDesc.Len()) {
00165                         const char e = testCaseDesc[i];
00166                         if (e >= 'a' && e <= 'e') { i += 1; nToDel = e - 'a' + 1; }}
00167                 IAssert(nToDel < nBytes);
00168                 // Will an error occur during the decoding of this codepoint?
00169                 bool errHere = false;
00170                 if (eighties) errHere = true;
00171                 else if (nToDel > 0) errHere = true;
00172                 else if (strict && (cp >= 0x10ffff || nBytes > minBytes)) errHere = true;
00173                 // Update 'expectedDest' and 'expetedRetVal'.
00174                 if (! expectedAbort) {
00175                         if (! errHere) {
00176                                 if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { }
00177                                 else { expectedDest.Add(cp); expectedRetVal += 1; } }
00178                         else if (errorHandling == uehReplace) {
00179                                 if (eighties) for (int j = 0; j < nBytes; j++) expectedDest.Add(replacementChar);
00180                                 else expectedDest.Add(replacementChar); }
00181                         if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true; }
00182                 // Update 'src'.
00183                 if (eighties) for (int j = 0; j < nBytes; j++) src.Add(GetRndUint(rnd, 0x80, 0xff));
00184                 else if (nBytes == 1) src.Add(cp);
00185                 else {
00186                         int mask = (1 << nBytes) - 1; mask <<= (8 - nBytes);
00187                         src.Add(mask | (uint(cp) >> (6 * (nBytes - 1))));
00188                         for (int j = 1; j < nBytes - nToDel; j++) src.Add(0x80 | ((cp >> (6 * (nBytes - j - 1))) & _0011_1111)); }
00189         }
00190         if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr());
00191         TestUtf8(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, f);
00192 }
00193 
00194 void TUniCodec::TestUtf8()
00195 {
00196         TIntV utf8ReplCh; EncodeUtf8((TVectorBuilder(), replacementChar).v, 0, 1, utf8ReplCh, true);
00197         for (int skipBom_ = 0; skipBom_ < 2; skipBom_++)
00198         for (int strict_ = 0; strict_ < 2; strict_++)
00199         for (int errMode_ = 0; errMode_ < 4; errMode_++)
00200         {
00201                 strict = (strict_ == 1); errorHandling = TUnicodeErrorHandling(errMode_); skipBom = (skipBom_ == 1);
00202                 TRnd rnd = TRnd(123);
00203                 // Test DecodeUtf8 on various random UTF-8-encoded sequences.
00204                 for (int i = 0; i < 10; i++)
00205                 {
00206                         TestDecodeUtf8(rnd, "X3A1A2A3A4A5A6B2B3B4B5B6C3C4C5C6D4D5D6E5E6F6G6");
00207                         TestDecodeUtf8(rnd, "X3A5dA6d");
00208                         TestDecodeUtf8(rnd, "X3A1B2C3D4E4F5A1G6H6Y3X3A1");
00209                         TestDecodeUtf8(rnd, "X3A1B2C3D4E4F5A2G6H6Y3X3A1");
00210                         TestDecodeUtf8(rnd, "Y3A1B2C3D4E4F5A1G6H6Y3X3A1");
00211                         TestDecodeUtf8(rnd, "A1B2C3D4E4F5A1G6H6Y3X3A1");
00212                         TestDecodeUtf8(rnd, "G6A1A1D4E4A1B2");
00213                         TestDecodeUtf8(rnd, "D4A1A1C3A1B2A1B2");
00214                         TestDecodeUtf8(rnd, "D4A1A1C3A1B2A1B2D4a");
00215                         TestDecodeUtf8(rnd, "X3A1B2C3D5E4F5A1G6H6Y3X3A1");
00216                         TestDecodeUtf8(rnd, "X3A1B2C3D4E5F5A1G6H6Y3X3A1");
00217                         TestDecodeUtf8(rnd, "X3A1B2C3D4aE4F5A1G6H6Y3X3A1");
00218                         TestDecodeUtf8(rnd, "X3A1B2C3D4bE4F5A1G6H6Y3X3A1");
00219                         TestDecodeUtf8(rnd, "X3A2aA3aA4aA5aA6aB2aB3aB4aB5aB6aC3aC4aC5aC6aD4aD5aD6aE5aE6aF6aG6a");
00220                         TestDecodeUtf8(rnd, "X3A3bA4bA5bA6aB3bB4bB5bB6bC3bC4bC5bC6bD4bD5bD6bE5bE6bF6bG6b");
00221                         TestDecodeUtf8(rnd, "X3A4cA5cA6cB4cB5cB6cC4cC5cC6cD4cD5cD6cE5cE6cF6cG6c");
00222                         TestDecodeUtf8(rnd, "X3A5dA6dB5dB6dC5dC6dD5dD6dE5dE6dF6dG6d");
00223                         TestDecodeUtf8(rnd, "X3A6eB6eC6eD6eE6eF6eG6e");
00224                 }
00225                 // Test both DecodeUtf8 and EncodeUtf8 systematically on various characters
00226                 // close to powers of 2.
00227                 TIntV src, expectedDest, src2;
00228                 expectedDest.Gen(1); src.Reserve(6); src2.Gen(1);
00229                 for (int pow = 8; pow <= 32; pow++)
00230                 {
00231                         uint uFrom, uTo;
00232                         if (pow == 8) uFrom = 0, uTo = 1u << pow;
00233                         else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx;
00234                         else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8);
00235                         printf("%u..%u          \r", uFrom, uTo);
00236                         for (uint u = uFrom; ; u++)
00237                         {
00238                                 int nBytes = 0;
00239                                 if (u < (1u << 7)) nBytes = 1;
00240                                 else if (u < (1u << 11)) nBytes = 2;
00241                                 else if (u < (1u << 16)) nBytes = 3;
00242                                 else if (u < (1u << 21)) nBytes = 4;
00243                                 else if (u < (1u << 26)) nBytes = 5;
00244                                 else nBytes = 6;
00245                                 src.Gen(6, nBytes);
00246                                 if (nBytes == 1) src[0] = u;
00247                                 else {
00248                                         src[0] = (((1 << nBytes) - 1) << (8 - nBytes)) | (u >> (6 * (nBytes - 1)));
00249                                         for (int i = 1; i < nBytes; i++) src[i] = 0x80 | ((u >> (6 * (nBytes - i - 1))) & _0011_1111); }
00250                                 bool err = (strict && u > 0x10ffff);
00251                                 expectedDest.Reserve(1, 0);
00252                                 if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar);
00253                                 else if (! err) expectedDest.Add(u);
00254                                 int erv = (err ? 0 : 1);
00255                                 if (skipBom && (u == 0xfeff || u == 0xfffe)) expectedDest.Clr(), erv = 0;
00256                                 TestUtf8(true, erv, (err && errorHandling == uehThrow), src, expectedDest, 0);
00257                                 // We can also test the UTF-8 encoder.
00258                                 src2[0] = u;
00259                                 if (err) {
00260                                         if (errorHandling == uehReplace) src = utf8ReplCh;
00261                                         else src.Clr(false); }
00262                                 TestUtf8(false, (err ? 0 : 1), (err && errorHandling == uehThrow), src2, src, 0);
00263                                 //
00264                                 if (u == uTo) break;
00265                         }
00266                 }
00267         }
00268 }
00269 
00270 //-----------------------------------------------------------------------------
00271 // TUniCodec -- UTF-16 test driver
00272 //-----------------------------------------------------------------------------
00273 
00274 void TUniCodec::WordsToBytes(const TIntV& src, TIntV& dest)
00275 {
00276         dest.Clr();
00277         bool isLE = IsMachineLittleEndian();
00278         for (int i = 0; i < src.Len(); i++) {
00279                 int c = src[i] & 0xffff;
00280                 if (isLE) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); }
00281                 else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); } }
00282 }
00283 
00284 void TUniCodec::TestUtf16(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest,
00285         const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom,
00286         FILE *f)
00287 {
00288         TIntV srcBytes, expectedDestBytes;
00289         WordsToBytes(src, srcBytes); WordsToBytes(expectedDest, expectedDestBytes);
00290         TIntV dest;
00291         if (f) {
00292                 fprintf(f, "Settings: %s  %s  %s  %s  %s replacementChar = %x  \n",
00293                         (errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"),
00294                         (strict ? "STRICT" : ""), (decode ? (skipBom ? "skipBom" : "") : (insertBom ? "insrtBom" : "")),
00295                         (bomHandling == bomAllowed ? "bomAllowed" : bomHandling == bomRequired ? "bomRequired" : "bomIgnored"),
00296                         (defaultByteOrder == boBigEndian ? "boBigEndian" : defaultByteOrder == boLittleEndian ? "boLittleEndian" : "boMachineEndian"),
00297                         uint(replacementChar));
00298                 fprintf(f, "src: "); for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %04x" : " %x"), uint(src[i])); }
00299         for (int useBytes = 0; useBytes < 2; useBytes++)
00300         {
00301                 const char *fmt = (useBytes ? " %02x" : " %04x");
00302                 try
00303                 {
00304                         dest.Clr();
00305                         size_t retVal;
00306                         if (! useBytes) {
00307                                 if (decode) retVal = DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder);
00308                                 else retVal = EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder); }
00309                         else {
00310                                 if (decode) retVal = DecodeUtf16FromBytes(srcBytes, 0, srcBytes.Len(), dest, true, bomHandling, defaultByteOrder);
00311                                 else retVal = EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder); }
00312                         const TIntV& ed = (useBytes && ! decode ? expectedDestBytes : expectedDest);
00313                         if (f) {
00314                                 fprintf(f, "\n -> dest:    "); for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" :  fmt), uint(dest[i]));
00315                                 fprintf(f, "\n    expDest  "); for (int i = 0; i < ed.Len(); i++) fprintf(f, (decode ? " %x" :  fmt), uint(ed[i]));
00316                                 fprintf(f, "\n    retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal)); }
00317                         bool ok = true;
00318                         if (retVal != expectedRetVal) ok = false;
00319                         if (dest.Len() != ed.Len()) ok = false;
00320                         if (ok) for (int i = 0; i < dest.Len(); i++) if (dest[i] != ed[i]) ok = false;
00321                         if (! ok)
00322                         {
00323                                 printf("!!!\n");
00324                         }
00325                         IAssert(retVal == expectedRetVal); IAssert(! expectedThrow);
00326                         IAssert(dest.Len() == ed.Len());
00327                         for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == ed[i]);
00328                 }
00329                 catch (TUnicodeException e)
00330                 {
00331                         if (f) {
00332                                 fprintf(f, "\n -> expDest  "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(expectedDest[i]));
00333                                 fprintf(f, "\n    exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar)); }
00334                         IAssert(expectedThrow);
00335                 }
00336         }
00337 }
00338 
00339 // Generates a random UTF-16-encoded stream according to the specifications in 'testCaseDesc',
00340 // then calls TestUtf16 to make sure that DecodeUtf16 reacts as expected.
00341 void TUniCodec::TestDecodeUtf16(TRnd& rnd, const TStr& testCaseDesc,
00342         const TUtf16BomHandling bomHandling,
00343         const TUniByteOrder defaultByteOrder,
00344         const bool insertBom)
00345 {
00346         TIntV src; TIntV expectedDest; int expectedRetVal = 0;
00347         bool expectedAbort = false;
00348         FILE *f = 0;
00349         bool isMachineLe = IsMachineLittleEndian();
00350         bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe));
00351         bool swap = (isMachineLe != isDefaultLe);
00352         if (insertBom) {
00353                 src.Add(swap ? 0xfffe : 0xfeff);
00354                 if (! skipBom) { expectedRetVal += 1; expectedDest.Add(0xfeff); } }
00355         else if (bomHandling == bomRequired) {
00356                 expectedAbort = true; expectedRetVal = -1; }
00357         // testCaseDesc should consist single characters or pairs of characters, 'c[e]', where:
00358         // - 'c' defines the range from which the codepoint should be taken ('A'..'E', 'X'..'Y');
00359         // - 'e' defines how many words will be removed from the end of the encoded sequence for this codepoint.
00360         //   (absent = 0, 'a' = 1).
00361         for (int i = 0; i < testCaseDesc.Len(); )
00362         {
00363                 const char c = testCaseDesc[i++];
00364                 uint cp = 0; int nWords = -1;
00365                 if (c == 'X' || c == 'Y') IAssert(i > 1); // if you want a BOM at the beginning of your data, use insertBom -- if we permit X and Y here, predicting the expectedDest and expectedRetVal gets more complicated
00366                 if (c == 'A') { cp = GetRndUint(rnd, 0u, Utf16FirstSurrogate - 1); nWords = 1; } // characters below the first surrogate range
00367                 else if (c == 'B') { cp = GetRndUint(rnd, Utf16FirstSurrogate, Utf16FirstSurrogate + 1023); nWords = 1; } // the first surrogate range
00368                 else if (c == 'C') { cp = GetRndUint(rnd, Utf16SecondSurrogate, Utf16SecondSurrogate + 1023); nWords = 1; } // the second surrogate range
00369                 else if (c == 'D') { do { cp = GetRndUint(rnd, Utf16SecondSurrogate + 1024, 0xffffu); } while (cp == 0xfffe || cp == 0xfeff); nWords = 1; } // above the second surrogate range, but still in the BMP
00370                 else if (c == 'E') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); nWords = 2; } // above the BMP, but still within the range for UTF-16
00371                 else if (c == 'X') { cp = 0xfffe; nWords = 1; }
00372                 else if (c == 'Y') { cp = 0xfeff; nWords = 1; }
00373                 else Fail;
00374                 if (c == 'B' && i < testCaseDesc.Len()) IAssert(testCaseDesc[i] != 'C');
00375                 // Process 'e'.
00376                 int nToDel = 0;
00377                 if (i < testCaseDesc.Len()) {
00378                         const char e = testCaseDesc[i];
00379                         if (e >= 'a') { i += 1; nToDel = 1; }}
00380                 IAssert((nWords == 1 && nToDel == 0) || (nWords == 2 && (nToDel == 0 || nToDel == 1)));
00381                 if (nWords == 2 && nToDel == 1 && i < testCaseDesc.Len()) IAssert(testCaseDesc[i] != 'C');
00382                 // Will an error occur during the decoding of this codepoint?
00383                 bool errHere = false;
00384                 if (Utf16FirstSurrogate <= cp && cp <= Utf16FirstSurrogate + 1023) errHere = true;
00385                 else if (cp > 0x10ffff) { Fail; errHere = true; }
00386                 else if (nToDel > 0) errHere = true;
00387                 else if (strict && (Utf16SecondSurrogate <= cp && cp <= Utf16SecondSurrogate + 1023)) errHere = true;
00388                 // Update 'expectedDest' and 'expectedRetVal'.
00389                 if (! expectedAbort) {
00390                         if (! errHere) {
00391                                 if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { }
00392                                 else { expectedDest.Add(cp); expectedRetVal += 1; } }
00393                         else if (errorHandling == uehReplace) {
00394                                 expectedDest.Add(replacementChar); }
00395                         if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true; }
00396                 // Update 'src'.
00397                 if (nWords == 1) src.Add(swap ? SwapBytes(cp) : cp);
00398                 else {
00399                         int c1 = ((cp - 0x10000) >> 10) & 1023; c1 += Utf16FirstSurrogate;
00400                         int c2 = (cp - 0x10000) & 1023; c2 += Utf16SecondSurrogate;
00401                         src.Add(swap ? SwapBytes(c1) : c1);
00402                         if (nToDel == 0) src.Add(swap ? SwapBytes(c2) : c2); }
00403         }
00404         if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr());
00405         TestUtf16(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, bomHandling, defaultByteOrder, false, f);
00406 }
00407 
00408 void TUniCodec::TestUtf16()
00409 {
00410         TIntV utf16ReplCh; utf16ReplCh.Add(replacementChar);
00411         for (int skipBom_ = 0; skipBom_ < 2; skipBom_++)
00412         for (int strict_ = 0; strict_ < 2; strict_++)
00413         for (int errMode_ = 0; errMode_ < 4; errMode_++)
00414         for (int bomHandling_ = 0; bomHandling_ < 3; bomHandling_++)
00415         for (int byteOrder_ = 0; byteOrder_ < 3; byteOrder_++)
00416         for (int insertBom_ = 0; insertBom_ < 2; insertBom_++)
00417         {
00418                 strict = (strict_ == 1); errorHandling = TUnicodeErrorHandling(errMode_); skipBom = (skipBom_ == 1);
00419                 bool insertBom = (insertBom_ == 1);
00420                 TUniByteOrder byteOrder = (TUniByteOrder) byteOrder_;
00421                 TUtf16BomHandling bomHandling = (TUtf16BomHandling) bomHandling_;
00422                 TRnd rnd = TRnd(123);
00423                 // Test DecodeUtf16 on various random UTF-16-encoded sequences.
00424                 for (int i = 0; i < 10; i++)
00425                 {
00426                         TestDecodeUtf16(rnd, "A", bomHandling, byteOrder, insertBom);
00427                         TestDecodeUtf16(rnd, "AAA", bomHandling, byteOrder, insertBom);
00428                         TestDecodeUtf16(rnd, "B", bomHandling, byteOrder, insertBom);
00429                         TestDecodeUtf16(rnd, "DDAADADAAADDDAA", bomHandling, byteOrder, insertBom);
00430                         TestDecodeUtf16(rnd, "DEEEDAAEEDADEEAAEEADEEDDAA", bomHandling, byteOrder, insertBom);
00431                         TestDecodeUtf16(rnd, "DEaEaEDAAEaEDADEaEAAEEADEEDDAA", bomHandling, byteOrder, insertBom);
00432                         TestDecodeUtf16(rnd, "CABDEBACCEaB", bomHandling, byteOrder, insertBom);
00433                         TestDecodeUtf16(rnd, "EaEEEEaBBACABXABYXXEaYDDXBDCEA", bomHandling, byteOrder, insertBom);
00434                         TestDecodeUtf16(rnd, "EaEEEEaBDCAAXADYXXEaYDDXDCEA", bomHandling, byteOrder, insertBom);
00435                 }
00436                 //continue;
00437                 // Test both DecodeUtf16 and EncodeUtf16 systematically on various characters
00438                 // close to powers of 2.
00439                 TIntV src, expectedDest, src2;
00440                 expectedDest.Gen(1); src.Reserve(6); src2.Gen(1);
00441                 for (int pow = 8; pow <= 32; pow++)
00442                 {
00443                         uint uFrom, uTo;
00444                         if (pow == 8) uFrom = 0, uTo = 1u << pow;
00445                         else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx;
00446                         else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8);
00447                         printf("%u..%u          \r", uFrom, uTo);
00448                         for (uint u = uFrom; ; u++)
00449                         {
00450                                 int nWords = 0;
00451                                 if (u < 0x10000) nWords = 1;
00452                                 else nWords = 2;
00453                                 bool isMachineLe = IsMachineLittleEndian(), isDestLe = (byteOrder == boLittleEndian || (byteOrder == boMachineEndian && isMachineLe));
00454                                 bool swap = (isMachineLe != isDestLe);
00455                                 bool err = (u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023) || (strict && Utf16SecondSurrogate <= u && u <= Utf16SecondSurrogate + 1023);
00456                                 src.Gen(3, (err ? 0 : nWords) + (insertBom ? 1 : 0));
00457                                 if (insertBom) src[0] = (swap ? 0xfffe : 0xfeff);
00458                                 if (! ((u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023)))
00459                                 {
00460                                         // Try to encode 'u' and see if it gets decoded correctly.
00461                                         if (nWords == 1) src[insertBom ? 1 : 0] = (swap ? SwapBytes(u) : u);
00462                                         else {
00463                                                 int u1 = Utf16FirstSurrogate + (((u - 0x10000) >> 10) & 1023);
00464                                                 int u2 = Utf16SecondSurrogate + ((u - 0x10000) & 1023);
00465                                                 src[insertBom ? 1 : 0] = (swap ? SwapBytes(u1) : u1);
00466                                                 src[insertBom ? 2 : 1] = (swap ? SwapBytes(u2) : u2); }
00467                                         if (! ((u == 0xfffe || u == 0xfeff) && bomHandling == bomAllowed && ! insertBom)) // this will just create a mess when decoding
00468                                         {
00469                                                 expectedDest.Reserve(2, 0);
00470                                                 if (insertBom && ! skipBom) expectedDest.Add(0xfeff);
00471                                                 if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar);
00472                                                 else if (! err) expectedDest.Add(u);
00473                                                 int erv = (err ? 0 : expectedDest.Len());
00474                                                 if (skipBom && (u == 0xfeff || u == 0xfffe) && ! insertBom) expectedDest.Clr(), erv = 0;
00475                                                 bool errD = err;
00476                                                 if (bomHandling == bomRequired && ! insertBom) {
00477                                                         expectedDest.Clr(false);
00478                                                         if (u == 0xfeff || u == 0xfffe) { erv = (skipBom ? 0 : 1); if (! skipBom) expectedDest.Add(0xfeff); }
00479                                                         else { erv = -1; errD = true;
00480                                                                 /*if (errorHandling == uehReplace) expectedDest.Add(replacementChar);*/ }}
00481                                                 TestUtf16(true, erv, (errD && errorHandling == uehThrow), src, expectedDest, bomHandling, byteOrder, insertBom, 0);
00482                                         }
00483                                 }
00484                                 // We can also test the UTF-16 encoder.
00485                                 src2[0] = u;
00486                                 if (err) {
00487                                         src.Clr(false); if (insertBom) src.Add(swap ? 0xfffe : 0xfeff);
00488                                         if (errorHandling == uehReplace) {
00489                                                 src.Add(swap ? SwapBytes(replacementChar) : replacementChar);
00490                                                 /*if (byteOrder == boBigEndian || (byteOrder == boMachineEndian && ! TUniCodec::IsMachineLittleEndian())) { src.Add((replacementChar >> 8) & 0xff); src.Add(replacementChar & 0xff); }
00491                                                 else { src.Add(replacementChar & 0xff); src.Add((replacementChar >> 8) & 0xff); } */
00492                                         }}
00493                                 TestUtf16(false, (err ? 0 : 1) + (insertBom ? 1 : 0), (err && errorHandling == uehThrow), src2, src, bomHandling, byteOrder, insertBom, 0);
00494                                 //
00495                                 if (u == uTo) break;
00496                         }
00497                 }
00498         }
00499 }
00500 
00501 //-----------------------------------------------------------------------------
00502 // TUniCaseFolding
00503 //-----------------------------------------------------------------------------
00504 
00505 void TUniCaseFolding::LoadTxt(const TStr& fileName)
00506 {
00507         Clr();
00508         TUniChDb::TUcdFileReader reader; reader.Open(fileName);
00509         TStrV fields;
00510         while (reader.GetNextLine(fields))
00511         {
00512                 int cp = reader.ParseCodePoint(fields[0]);
00513                 const TStr status = fields[1], mapsTo = fields[2];
00514                 if (status == "C" || status == "S" || status == "T") {
00515                         TIntH &dest = (status == "C" ? cfCommon : status == "S" ? cfSimple : cfTurkic);
00516                         IAssert(! dest.IsKey(cp));
00517                         int cp2 = reader.ParseCodePoint(mapsTo);
00518                         dest.AddDat(cp, cp2); }
00519                 else if (status == "F") {
00520                         TIntIntVH &dest = cfFull;
00521                         IAssert(! dest.IsKey(cp));
00522                         TIntV cps; reader.ParseCodePointList(mapsTo, cps); IAssert(cps.Len() > 0);
00523                         dest.AddDat(cp, cps); }
00524                 else
00525                         FailR(status.CStr());
00526         }
00527         printf("TUniCaseFolding(\"%s\"): %d common, %d simple, %d full, %d Turkic.\n",
00528                 fileName.CStr(), cfCommon.Len(), cfSimple.Len(), cfFull.Len(), cfTurkic.Len());
00529 }
00530 
00531 void TUniCaseFolding::Test(const TIntV& src, const TIntV& expectedDest, const bool full, const bool turkic, FILE *f)
00532 {
00533         fprintf(f, "TUniCaseFolding(%s%s): ", (full ? "full" : "simple"), (turkic ? ", turkic" : ""));
00534         for (int i = 0; i < src.Len(); i++) fprintf(f, " %04x", int(src[i]));
00535         TIntV dest; Fold(src, 0, src.Len(), dest, true, full, turkic);
00536         fprintf(f, "\n  -> ");
00537         for (int i = 0; i < dest.Len(); i++) fprintf(f, " %04x", int(dest[i]));
00538         fprintf(f, "\n");
00539         IAssert(dest.Len() == expectedDest.Len());
00540         for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == expectedDest[i]);
00541 }
00542 
00543 /*
00544 void TUniCaseFolding::Test(const TIntV& src, FILE *f) {
00545         Test(src, false, false, f); Test(src, false, true, f);
00546         Test(src, true, false, f); Test(src, true, true, f); }
00547 */
00548 
00549 void TUniCaseFolding::Test()
00550 {
00551         FILE *f = stderr;
00552         TVectorBuilder VB;
00553         // simple
00554         Test((VB, 0x41, 0x62, 0x49, 0x43, 0xdf), (VB, 0x61, 0x62, 0x69, 0x63, 0xdf), false, false, f);
00555         // simple + turkic
00556         Test((VB, 0x41, 0x62, 0x49, 0x43, 0xdf), (VB, 0x61, 0x62, 0x131, 0x63, 0xdf), false, true, f);
00557         // full
00558         Test((VB, 0x41, 0x62, 0x49, 0x43, 0xdf), (VB, 0x61, 0x62, 0x69, 0x63, 0x73, 0x73), true, false, f);
00559         // full + turkic
00560         Test((VB, 0x41, 0x62, 0x49, 0x43, 0xdf), (VB, 0x61, 0x62, 0x131, 0x63, 0x73, 0x73), true, true, f);
00561 }
00562 
00563 //-----------------------------------------------------------------------------
00564 // TUniChInfo
00565 //-----------------------------------------------------------------------------
00566 
00567 // UAX #14
      // Definitions of the static line-break class codes of TUniChInfo.  Each
      // constant is the value GetLineBreakCode returns for the two characters
      // passed to it (XX, SA, NU, IS and QU respectively).
00568 const ushort TUniChInfo::LineBreak_Unknown = TUniChInfo::GetLineBreakCode('X', 'X');
00569 const ushort TUniChInfo::LineBreak_ComplexContext = TUniChInfo::GetLineBreakCode('S', 'A');
00570 const ushort TUniChInfo::LineBreak_Numeric = TUniChInfo::GetLineBreakCode('N', 'U');
00571 const ushort TUniChInfo::LineBreak_InfixNumeric = TUniChInfo::GetLineBreakCode('I', 'S');
00572 const ushort TUniChInfo::LineBreak_Quotation = TUniChInfo::GetLineBreakCode('Q', 'U');
00573 
00574 //-----------------------------------------------------------------------------
00575 // TUniChDb -- word breaking
00576 //-----------------------------------------------------------------------------
00577 
00578 // Test driver for WbFind*NonIgnored.
      // For every window [srcIdx, srcIdx + srcLen) visited by the loops below it
      // computes, by brute force, the expected answers for each position in the
      // window and asserts that WbFindNextNonIgnored, WbFindCurOrNextNonIgnored
      // and WbFindPrevNonIgnored agree with them.
00579 void TUniChDb::TestWbFindNonIgnored(const TIntV& src) const
00580 {
00581         int n = src.Len();
              // isIgnored[i] caches IsWbIgnored(src[i]) for the whole input.
00582         TBoolV isIgnored; isIgnored.Gen(n);
00583         for (int i = 0; i < n; i++) isIgnored[i] = IsWbIgnored(src[i]);
              // Expected answers, indexed by the offset inside the current window.
00584         TIntV prevNonIgnored, nextNonIgnored, curOrNextNonIgnored;
00585         prevNonIgnored.Gen(n); nextNonIgnored.Gen(n); curOrNextNonIgnored.Gen(n);
              // Debug output is disabled; point f at stderr to dump the tables below.
00586         FILE *f = 0; // stderr;
              // NOTE(review): because srcLen < n - srcIdx, a window never reaches the
              // last element of src (srcIdx + srcLen < n) -- confirm this is intended.
00587         for (int srcIdx = 0; srcIdx < n; srcIdx++) for (int srcLen = 1; srcLen < n - srcIdx; srcLen++)
00588         {
                      // Forward pass: prevNonIgnored[i] is the absolute index of the closest
                      // non-ignored character strictly before offset i in the window, or -1.
00589                 int prev = -1;
00590                 for (int i = 0; i < srcLen; i++) {
00591                         prevNonIgnored[i] = prev;
00592                         if (! isIgnored[srcIdx + i]) prev = srcIdx + i; }
                      // Backward pass: nextNonIgnored[i] is the closest non-ignored character
                      // strictly after offset i, curOrNextNonIgnored[i] the closest one at or
                      // after offset i; both default to the window end (srcIdx + srcLen).
00593                 int next = srcIdx + srcLen;
00594                 for (int i = srcLen - 1; i >= 0; i--) {
00595                         nextNonIgnored[i] = next;
00596                         if (! isIgnored[srcIdx + i]) next = srcIdx + i;
00597                         curOrNextNonIgnored[i] = next; }
00598                 if (f) {
00599                         fprintf(f, "\nIndex:     "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", srcIdx + i);
00600                         fprintf(f, "\nNonIgn:    "); for (int i = 0; i < srcLen; i++) fprintf(f, " %s", (isIgnored[srcIdx + i] ? " ." : " Y"));
00601                         fprintf(f, "\nPrevNI:    "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(prevNonIgnored[i]));
00602                         fprintf(f, "\nNextNI:    "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(nextNonIgnored[i]));
00603                         fprintf(f, "\nCurNextNI: "); for (int i = 0; i < srcLen; i++) fprintf(f, " %2d", int(curOrNextNonIgnored[i]));
00604                         fprintf(f, "\n"); }
                      // Compare the three search functions with the expected tables.
00605                 for (int i = 0; i < srcLen; i++)
00606                 {
00607                         size_t s;
00608                         s = size_t(srcIdx + i); WbFindNextNonIgnored(src, s, size_t(srcIdx + srcLen));
00609                         IAssert(s == size_t(nextNonIgnored[i]));
00610                         s = size_t(srcIdx + i); WbFindCurOrNextNonIgnored(src, s, size_t(srcIdx + srcLen));
00611                         IAssert(s == size_t(curOrNextNonIgnored[i]));
00612                         s = size_t(srcIdx + i); bool ok = WbFindPrevNonIgnored(src, size_t(srcIdx), s);
                              // When nothing precedes the position, the search must report
                              // failure and leave s at the start of the window.
00613                         if (prevNonIgnored[i] < 0) { IAssert(! ok); IAssert(s == size_t(srcIdx)); }
00614                         else { IAssert(ok); IAssert(s == size_t(prevNonIgnored[i])); }
00615                 }
00616         }
00617 }
00618 
00619 void TUniChDb::TestWbFindNonIgnored() const
00620 {
00621         TIntV chIgnored, chNonIgnored;
00622         FILE *f = 0; // stderr;
00623         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) {
00624                 const int cp = h.GetKey(i); const TUniChInfo& ci = h[i];
00625                 if (f) fprintf(f, "%04x: flags %08x props %08x %08x script \"%s\"\n", cp,
00626                         ci.flags, ci.properties, ci.propertiesX, GetScriptName(ci.script).CStr());
00627                 (IsWbIgnored(h[i]) ? chIgnored : chNonIgnored).Add(h.GetKey(i));
00628         }
00629         chIgnored.Sort(); chNonIgnored.Sort();
00630         printf("TUniChDb::TestWbNonIgnored: %d ignored, %d nonignored chars.\n", chIgnored.Len(), chNonIgnored.Len());
00631         TRnd rnd = TRnd(123);
00632         for (int iter = 0; iter <= 50; iter++)
00633         {
00634                 int percIgnored = 2 * iter;
00635                 for (int n = 0; n <= 20; n++)
00636                 {
00637                         // Prepare a random sequence of 'n' codepoints.
00638                         TIntV v; v.Gen(n);
00639                         for (int i = 0; i < n; i++) {
00640                                 TIntV& chars = (rnd.GetUniDevInt(100) < percIgnored) ? chIgnored : chNonIgnored;
00641                                 int j = rnd.GetUniDevInt(chars.Len());
00642                                 v.Add(chars[j]); }
00643                         // Run the tests with this sequence.
00644                         TestWbFindNonIgnored(v);
00645                 }
00646         }
00647 }
00648 
00649 void TUniChDb::TestFindNextWordOrSentenceBoundary(const TStr& basePath, bool sentence)
00650 {
00651         TUcdFileReader reader; TStrV fields;
00652         reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), (sentence ? GetSentenceBreakTestFn() : GetWordBreakTestFn())));
00653         int nLines = 0; TRnd rnd = TRnd(123);
00654         while (reader.GetNextLine(fields))
00655         {
00656                 nLines += 1;
00657                 IAssert(fields.Len() == 1);
00658                 TStrV parts; fields[0].SplitOnWs(parts);
00659                 const int n = parts.Len(); IAssert((n % 2) == 1);
00660                 TIntV chars; TBoolV isBreak, isPredicted, isPredicted2;
00661                 // Each line is a sequence of codepoints, with a \times or \div in between each
00662                 // pair of codepoints (as well as at the beginning and the end of the sequence) to
00663                 // indicate whether a boundary exists there or not.
00664                 for (int i = 0; i < n; i++)
00665                 {
00666                         const TStr& s = parts[i];
00667                         if ((i % 2) == 0) {
00668                                 if (s == "\xc3\x97") // multiplication sign (U+00D7) in UTF-8
00669                                         isBreak.Add(false);
00670                                 else if (s == "\xc3\xb7") // division sign (U+00F7) in UTF-8
00671                                         isBreak.Add(true);
00672                                 else FailR(s.CStr()); }
00673                         else chars.Add(reader.ParseCodePoint(s));
00674                 }
00675                 const int m = n / 2; IAssert(chars.Len() == m); IAssert(isBreak.Len() == m + 1);
00676                 IAssert(isBreak[0]); IAssert(isBreak[m]);
00677                 isPredicted.Gen(m + 1); isPredicted.PutAll(false);
00678                 if (AlwaysFalse()) { printf("%3d", nLines); for (int i = 0; i < m; i++) printf(" %04x", int(chars[i])); printf("\n"); }
00679                 // We'll insert a few random characters at the beginning of the sequence
00680                 // so that srcPos doesn't always begin at 0.
00681                 for (int nBefore = 0; nBefore < 5; nBefore++)
00682                 {
00683                         TIntV chars2; for (int i = 0; i < nBefore; i++) chars2.Add(0, rnd.GetUniDevInt(0x10ffff + 1));
00684                         chars2.AddV(chars);
00685                         // Use FindNextBoundary to find all the word boundaries.
00686                         size_t position = (nBefore > 0 ? nBefore - 1 : nBefore); size_t prevPosition = position;
00687                         while (sentence ? FindNextSentenceBoundary(chars2, nBefore, m, position) : FindNextWordBoundary(chars2, nBefore, m, position))
00688                         {
00689                                 IAssert(prevPosition < position);
00690                                 IAssert(position <= size_t(nBefore + m));
00691                                 isPredicted[int(position) - nBefore] = true;
00692                                 prevPosition = position;
00693                         }
00694                         IAssert(position == size_t(nBefore + m));
00695                         if (sentence) FindSentenceBoundaries(chars2, nBefore, m, isPredicted2);
00696                         else FindWordBoundaries(chars2, nBefore, m, isPredicted2);
00697                         IAssert(isPredicted2.Len() == m + 1);
00698                         bool ok = true;
00699                         // If we start at 0, the word boundary at the beginning of the sequence was
00700                         // not found explicitly, so we'll add it now.
00701                         if (nBefore == 0) isPredicted[0] = true;
00702                         // Compare the predicted and the true boundaries.
00703                         for (int i = 0; i <= m; i++) {
00704                                 if (isBreak[i] != isPredicted[i]) ok = false;
00705                                 IAssert(isPredicted2[i] == isPredicted[i]); }
00706                         FILE *f = stderr;
00707                         if (! ok)
00708                         {
00709                                 fprintf(f, "\nError in line %d:\n", nLines);
00710                                 fprintf(f, "True:      ");
00711                                 for (int i = 0; i <= m; i++) {
00712                                         fprintf(f, "%s ", (isBreak[i] ? "|" : "."));
00713                                         if (i < m) fprintf(f, "%04x ", int(chars[i + nBefore])); }
00714                                 fprintf(f, "\nPredicted: ");
00715                                 for (int i = 0; i <= m; i++) {
00716                                         fprintf(f, "%s ", (isPredicted[i] ? "|" : "."));
00717                                         if (i < m) {
00718                                                 const int cp = chars[i + nBefore];
00719                                                 TStr s = sentence ? TUniChInfo::GetSbFlagsStr(GetSbFlags(cp)) : TUniChInfo::GetWbFlagsStr(GetWbFlags(cp));
00720                                                 if (IsWbIgnored(cp)) s = "*" + s;
00721                                                 fprintf(f, "%4s ", s.CStr()); }}
00722                                 fprintf(f, "\n");
00723                                 Fail;
00724                         }
00725                         // Test FindNextBoundary if we start in the middle of the sequence,
00726                         // i.e. not at an existing boundary.
00727                         for (int i = 0; i < m; i++) {
00728                                 position = i + nBefore; bool ok = sentence ? FindNextSentenceBoundary(chars2, nBefore, m, position) : FindNextWordBoundary(chars2, nBefore, m, position);
00729                                 IAssert(ok); // at the very least, there should be the 'boundary' at nBefore + m
00730                                 IAssert(size_t(i + nBefore) < position); IAssert(position <= size_t(nBefore + m));
00731                                 position -= nBefore;
00732                                 for (int j = i + 1; j < int(position); j++)
00733                                         IAssert(! isBreak[j]);
00734                                 IAssert(isBreak[int(position)]); }
00735                 }
00736         }
00737         reader.Close();
00738         printf("TUniChDb::TestFindNext%sBoundary: %d lines processed.\n", (sentence ? "Sentence" : "Word"), nLines);
00739 }
00740 
00741 //-----------------------------------------------------------------------------
00742 // TUniChDb -- composition and decomposition
00743 //-----------------------------------------------------------------------------
00744 
00745 void TUniChDb::TestComposition(const TStr& basePath)
00746 {
              // Checks Decompose and DecomposeAndCompose against the normalization
              // test file found under 'basePath'.  A line with a single field must be
              // an "@Part..." header; every other line must have six fields, the last
              // one empty, and the first five are parsed as code point lists c1..c5.
              // Afterwards every code point in 'h' that did not occur as c1 in part 1
              // is checked to be unchanged by all four normalization forms.
00747         TUcdFileReader reader; TStrV fields; int nLines = 0;
00748         reader.Open(CombinePath(basePath, GetNormalizationTestFn()));
00749         bool inPart1 = false; TIntH testedInPart1;
00750         while (reader.GetNextLine(fields))
00751         {
00752                 nLines += 1;
00753                 if (fields.Len() == 1) {
00754                         IAssert(fields[0].IsPrefix("@Part"));
00755                         inPart1 = (fields[0] == "@Part1"); continue; }
00756                 IAssert(fields.Len() == 6);
00757                 IAssert(fields[5].Len() == 0);
00758                 TIntV c1, c2, c3, c4, c5;
00759                 reader.ParseCodePointList(fields[0], c1);
00760                 reader.ParseCodePointList(fields[1], c2);
00761                 reader.ParseCodePointList(fields[2], c3);
00762                 reader.ParseCodePointList(fields[3], c4);
00763                 reader.ParseCodePointList(fields[4], c5);
00764                 TIntV v;
                      // Helper macros.  Each one normalizes 'operand' into the local vector
                      // 'v' and compares it with 'cmpWith' through AssertEq; the message
                      // names the check and the value of the 'nLines' that is in scope at
                      // the point of use.  The last argument of Decompose and
                      // DecomposeAndCompose is false for NFD/NFC and true for NFKD/NFKC.
00765 #define AssE_(v1, v2, expl) AssertEq(v1, v2, TStr(expl) + " (line " + TInt::GetStr(nLines) + ")", 0)
00766 #define NFC_(cmpWith, operand) DecomposeAndCompose(operand, 0, operand.Len(), v, false); AssE_(cmpWith, v, #cmpWith " == NFC(" #operand ")")
00767 #define NFD_(cmpWith, operand) Decompose(operand, 0, operand.Len(), v, false); AssE_(cmpWith, v, #cmpWith " == NFD(" #operand ")")
00768 #define NFKC_(cmpWith, operand) DecomposeAndCompose(operand, 0, operand.Len(), v, true); AssE_(cmpWith, v, #cmpWith " == NFKC(" #operand ")")
00769 #define NFKD_(cmpWith, operand) Decompose(operand, 0, operand.Len(), v, true); AssE_(cmpWith, v, #cmpWith " == NFKD(" #operand ")")
00770                 // NFD:
00771                 NFD_(c3, c1);   // c3 == NFD(c1)
00772                 NFD_(c3, c2);   // c3 == NFD(c2)
00773                 NFD_(c3, c3);   // c3 == NFD(c3)
00774                 NFD_(c5, c4);   // c5 == NFD(c4)
00775                 NFD_(c5, c5);   // c5 == NFD(c5)
00776                 // NFC:
00777                 NFC_(c2, c1);   // c2 == NFC(c1)
00778                 NFC_(c2, c2);   // c2 == NFC(c2)
00779                 NFC_(c2, c3);   // c2 == NFC(c3)
00780                 NFC_(c4, c4);   // c4 == NFC(c4)
00781                 NFC_(c4, c5);   // c4 == NFC(c5)
00782                 // NFKD:
00783                 NFKD_(c5, c1);   // c5 == NFKD(c1)
00784                 NFKD_(c5, c2);   // c5 == NFKD(c2)
00785                 NFKD_(c5, c3);   // c5 == NFKD(c3)
00786                 NFKD_(c5, c4);   // c5 == NFKD(c4)
00787                 NFKD_(c5, c5);   // c5 == NFKD(c5)
00788                 // NFKC:
00789                 NFKC_(c4, c1);   // c4 == NFKC(c1)
00790                 NFKC_(c4, c2);   // c4 == NFKC(c2)
00791                 NFKC_(c4, c3);   // c4 == NFKC(c3)
00792                 NFKC_(c4, c4);   // c4 == NFKC(c4)
00793                 NFKC_(c4, c5);   // c4 == NFKC(c5)
00794                 //
                      // Remember which single code points part 1 has covered.
00795                 if (inPart1) {
00796                         IAssert(c1.Len() == 1);
00797                         testedInPart1.AddKey(c1[0]); }
00798         }
00799         reader.Close();
00800         // Test other individual codepoints that were not mentioned in part 1.
00801         int nOther = 0;
00802         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
00803         {
                      // The local nLines (-1) shadows the line counter, so the macros
                      // report "line -1" for these checks; the outer counter is untouched.
00804                 const int cp = h.GetKey(i), nLines = -1;
00805                 if (testedInPart1.IsKey(cp)) continue;
                      // 'v' is the output vector the macros write to.
00806                 TIntV x, v; x.Add(cp);
00807                 NFC_(x, x);    // x == NFC(x)
00808                 NFD_(x, x);    // x == NFD(x)
00809                 NFKC_(x, x);   // x == NFKC(x)
00810                 NFKD_(x, x);   // x == NFKD(x)
00811                 nOther += 1;
00812         }
00813 #undef AssE_
00814 #undef NFC_
00815 #undef NFD_
00816 #undef NFKC_
00817 #undef NFKD_
00818         printf("TUniChDb::TestComposition: %d lines processed + %d other individual codepoints.\n", nLines, nOther);
00819 }
00820 
00821 //-----------------------------------------------------------------------------
00822 // TUniChDb -- case conversion tests
00823 //-----------------------------------------------------------------------------
00824 
00825 void TUniChDb::TestCaseConversion(const TStr& source, const TStr& trueLc,
00826                                                                   const TStr& trueTc, const TStr& trueUc,
00827                                                                   bool turkic, bool lithuanian)
00828 {
00829         TIntV src;
00830         TUcdFileReader::ParseCodePointList(source, src);
00831         FILE *f = stderr;
00832         for (int i = 0; i < 3; i++)
00833         {
00834                 TCaseConversion how = (i == 0) ? ccLower : (i == 1) ? ccTitle : ccUpper;
00835                 const TStr &trueDestS = (how == ccLower ? trueLc : how == ccTitle ? trueTc : trueUc);
00836                 TIntV trueDest; TUcdFileReader::ParseCodePointList(trueDestS, trueDest);
00837                 TIntV dest;
00838                 GetCaseConverted(src, 0, src.Len(), dest, true, how, turkic, lithuanian);
00839                 bool ok = (dest.Len() == trueDest.Len());
00840                 if (ok) for (int i = 0; i < dest.Len() && ok; i++) ok = ok && (dest[i] == trueDest[i]);
00841                 if (ok) continue;
00842                 fprintf(f, "%s(", (how == ccLower ? "toLowercase" : how == ccTitle ? "toTitlecase" : "toUppercase"));
00843                 for (int i = 0; i < src.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(src[i]));
00844                 fprintf(f, ")\nCorrect:   (");
00845                 for (int i = 0; i < trueDest.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(trueDest[i]));
00846                 fprintf(f, ")\nOur output:(");
00847                 for (int i = 0; i < dest.Len(); i++) fprintf(f, "%s%04x", (i == 0 ? "" : " "), int(dest[i]));
00848                 fprintf(f, ")\n");
00849                 IAssert(ok);
00850         }
00851 }
00852 
00853 void TUniChDb::TestCaseConversions()
00854 {
              // Runs TestCaseConversion on a set of hand-written cases.  Every case is
              // built by concatenating the named code point strings declared below
              // (each is four hex digits followed by a space) and lists, in order:
              // the source, the expected lowercase, titlecase and uppercase forms,
              // and the 'turkic' and 'lithuanian' flags.
00855         // Because no thorough case-conversion test files have been provided as part
00856         // of the Unicode standard, we'll have to test things on a few test cases of our own.
00857         // - First, test some unconditional special mappings, such as 'ss', 'ffl', 'dz', etc.
00858         const TStr F = "0046 ", L = "004C ", S = "0053 ", T = "0054 ", W = "0057 ";
00859         const TStr f = "0066 ", l = "006c ", s = "0073 ", t = "0074 ", w = "0077 ";
00860         const TStr ss = "00df ", ffl = "fb04 ", longs = "017f ", longst = "fb05 ", wRing = "1e98 ", Ring = "030a ";
00861         const TStr DZ = "01c4 ", Dz = "01c5 ", dz = "01c6 ";
00862         const TStr space = "0020 ", Grave = "0300 ";
00863         TestCaseConversion(
00864                 F + L + s + t + space + Dz + w + T + ss + wRing + space + longs + DZ + space + dz + longst,  // source
00865                 f + l + s + t + space + dz + w + t + ss + wRing + space + longs + dz + space + dz + longst,  // lowercase
00866                 F + l + s + t + space + Dz + w + t + ss + wRing + space + S + dz + space + Dz + longst,      // titlecase
00867                 F + L + S + T + space + DZ + W + T + S + S + W + Ring + space + S + DZ + space + DZ + S + T, // uppercase
00868                 false, false);
00869         // - Dotted I, dotless i, etc., but with turkic == false.
00870         const TStr I = "0049 ", J = "004a ", i = "0069 ", j = "006a ", iDotless = "0131 ", IDot = "0130 ", DotA = "0307 ";
00871         TestCaseConversion(
00872                 s + I + t + i + w + iDotless + f + IDot + l + space + iDotless + DotA + f + I + DotA + s, // source
00873                 s + i + t + i + w + iDotless + f + i + DotA + l + space + iDotless + DotA + f + i + DotA + s, // lowercase
00874                 S + i + t + i + w + iDotless + f + i + DotA + l + space + I + DotA + f + i + DotA + s, // titlecase
00875                 S + I + T + I + W + I + F + IDot + L + space + I + DotA + F + I + DotA + S, // uppercase
00876                 false, false);
00877         // - Sigma (final vs. non-final forms).
00878         const TStr Sigma = "03a3 ", sigma = "03c3 ", fsigma = "03c2 ";
00879         TestCaseConversion(
00880                 Sigma + s + space + s + Sigma  + space + s + Sigma + s + space + Sigma + S + Sigma  + space + Sigma, // source
00881                 sigma + s + space + s + fsigma + space + s + sigma + s + space + sigma + s + fsigma + space + sigma, // lowercase
00882                 Sigma + s + space + S + fsigma + space + S + sigma + s + space + Sigma + s + fsigma + space + Sigma, // titlecase
00883                 Sigma + S + space + S + Sigma  + space + S + Sigma + S + space + Sigma + S + Sigma  + space + Sigma, // uppercase
00884                 false, false);
00885         TestCaseConversion(
00886                 sigma + s + space + s + sigma  + space + s + sigma + s + space + sigma + S + sigma  + space + sigma, // source
00887                 sigma + s + space + s + sigma  + space + s + sigma + s + space + sigma + s + sigma  + space + sigma, // lowercase
00888                 Sigma + s + space + S + sigma  + space + S + sigma + s + space + Sigma + s + sigma  + space + Sigma, // titlecase
00889                 Sigma + S + space + S + Sigma  + space + S + Sigma + S + space + Sigma + S + Sigma  + space + Sigma, // uppercase
00890                 false, false);
00891         TestCaseConversion(
00892                 fsigma + s + space + s + fsigma + space + s + fsigma + s + space + fsigma + S + fsigma  + space + fsigma, // source
00893                 fsigma + s + space + s + fsigma + space + s + fsigma + s + space + fsigma + s + fsigma  + space + fsigma, // lowercase
00894                 Sigma  + s + space + S + fsigma + space + S + fsigma + s + space + Sigma  + s + fsigma  + space + Sigma, // titlecase
00895                 Sigma  + S + space + S + Sigma  + space + S + Sigma  + S + space + Sigma  + S + Sigma   + space + Sigma, // uppercase
00896                 false, false);
00897         const TStr nonSA = "0315 0321 0322 "; // characters that are neither ccStarter nor ccAbove
00898         // Special case mappings for Turkic languages:
00899         // - After_I
00900         TestCaseConversion(
00901                 s + I + t + i + w + iDotless + f + IDot + l + space + iDotless + DotA + f + I + DotA + J + DotA + I + Grave + DotA + I + DotA + DotA + I + nonSA + DotA + s, // source
00902                 s + iDotless + t + i + w + iDotless + f + i + l + space + iDotless + DotA + f + i + j + DotA + iDotless + Grave + DotA + i + DotA + i + nonSA + s, // lowercase
00903                 S + iDotless + t + i + w + iDotless + f + i + l + space + I + DotA + f + i + j + DotA + iDotless + Grave + DotA + i + DotA + i + nonSA + s, // titlecase
00904                 S + I + T + IDot + W + I + F + IDot + L + space + I + DotA + F + I + DotA + J + DotA + I + Grave + DotA + I + DotA + DotA + I + nonSA + DotA + S, // uppercase
00905                 true, false); // turkic
00906         // - Not_Before_Dot
00907         TestCaseConversion(
00908                 I + Grave + t + I + DotA + f + I + nonSA + DotA + j + space + I + nonSA + DotA + space + I + Grave + t, // source
00909                 iDotless + Grave + t + i + f + i + nonSA + j + space + i + nonSA + space + iDotless + Grave + t, // lowercase
00910                 I + Grave + t + i + f + i + nonSA + j + space + I + nonSA + DotA + space + I + Grave + t, // titlecase
00911                 I + Grave + T + I + DotA + F + I + nonSA + DotA + J + space + I + nonSA + DotA + space + I + Grave + T, // uppercase
00912                 true, false); // turkic
00913         // Special case mappings for Lithuanian:
00914         // - After_Soft_Dotted  [note: I + DotA turns into i + DotA + DotA when lowercasing due to More_Above]
00915         TestCaseConversion(
00916                 i + DotA + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + I + DotA + t + DotA + i + DotA + Grave, // source
00917                 i + DotA + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + i + DotA + DotA + t + DotA + i + DotA + Grave, // lowercase
00918                 I + t + i + Grave + DotA + f + i + DotA + DotA + f + i + nonSA + DotA + i + DotA + DotA + t + DotA + i + DotA + Grave, // titlecase
00919                 I + T + I + Grave + DotA + F + I + DotA + F + I + nonSA + I + DotA + T + DotA + I + Grave, // uppercase
00920                 false, true); // lithuanian
00921         // - More_Above  [note: j + DotA turns into just J when uppercasing due to After_Soft_Dotted]
00922         TestCaseConversion(
00923                 J +        Grave + space + J +        nonSA + DotA + space + j + Grave + space + j + DotA + space + J + nonSA + J +        nonSA + Grave + space + j + nonSA, // source
00924                 j + DotA + Grave + space + j + DotA + nonSA + DotA + space + j + Grave + space + j + DotA + space + j + nonSA + j + DotA + nonSA + Grave + space + j + nonSA, // lowercase
00925                 J +        Grave + space + J +        nonSA + DotA + space + J + Grave + space + J +        space + J + nonSA + j + DotA + nonSA + Grave + space + J + nonSA, // titlecase
00926                 J +        Grave + space + J +        nonSA + DotA + space + J + Grave + space + J +        space + J + nonSA + J +        nonSA + Grave + space + J + nonSA, // uppercase
00927                 false, true); // lithuanian
00928         // SoftDotted [^ Starter Above]* 0307   --(uc,tc)-->  without 0307
00929         // SoftDotted [^ Starter Above]* 0307   --(
00930         //TestCaseConversion("", "", "", "", false, false);
00931 }
00932 
00933 //-----------------------------------------------------------------------------
00934 // TUniChDb -- initialization from the text files
00935 //-----------------------------------------------------------------------------
00936 
// Records the decomposition mapping given in 's' for the character 'ci'.
// An empty 's' leaves 'ci' untouched.  If 's' starts with a "<...>" tag, the
// compatibility-decomposition flag is set on 'ci' and the tag is stripped.
// The remaining code point list is appended to 'decompositions', followed by
// a -1 terminator; 'ci.decompOffset' receives the index of its first element.
00937 void TUniChDb::LoadTxt_ProcessDecomposition(TUniChInfo& ci, TStr s)
00938 {
00939         if (s.Empty()) { return; }
00940         if (s[0] == '<') {
00941                 const int tagEnd = s.SearchCh('>'); IAssert(tagEnd > 0);
00942                 s = s.GetSubStr(tagEnd + 1, s.Len() - 1); s.ToTrunc();
00943                 ci.flags |= ucfCompatibilityDecomposition; }
00944         TIntV codePoints; TUcdFileReader::ParseCodePointList(s, codePoints);
00945         IAssert(codePoints.Len() > 0);
00946         ci.decompOffset = decompositions.Len();
00947         decompositions.AddV(codePoints); decompositions.Add(-1);
00948 }
00949 
// Reads the property list file (GetPropListFn() under 'basePath') and sets the
// named property on every code point of each listed range.  A property name
// maps either to a TUniChProperties bit or to a TUniChPropertiesX bit; an
// unrecognized name triggers FailR.  Code points not yet in 'h' are added and
// get their category through the TSubcatHelper.  Each property is asserted
// not to be set already.  Prints the number of lines and code points handled.
00950 void TUniChDb::InitPropList(const TStr& basePath)
00951 {
00952         TUcdFileReader reader; TStrV fields; int cpCount = 0, lineCount = 0;
00953         reader.Open(CombinePath(basePath, GetPropListFn()));
00954         TSubcatHelper helper(*this);
00955         while (reader.GetNextLine(fields))
00956         {
00957                 IAssert(fields.Len() == 2);
00958                 int from, to; reader.ParseCodePointRange(fields[0], from, to);
00959                 TStr name = fields[1];
                      // At most one of 'prop' / 'propx' becomes nonzero below.
00960                 TUniChProperties prop = TUniChProperties(0); TUniChPropertiesX propx = TUniChPropertiesX(0);
00961                 if (name == "White_Space") prop = ucfPrWhiteSpace;
00962                 else if (name == "Bidi_Control") prop = ucfPrBidiControl;
00963                 else if (name == "Join_Control") prop = ucfPrJoinControl;
00964                 else if (name == "Dash") prop = ucfPrDash;
00965                 else if (name == "Hyphen") prop = ucfPrHyphen;
00966                 else if (name == "Quotation_Mark") prop = ucfPrQuotationMark;
00967                 else if (name == "Terminal_Punctuation") prop = ucfPrTerminalPunctuation;
00968                 else if (name == "Other_Math") propx = ucfPxOtherMath;
00969                 else if (name == "Hex_Digit") prop = ucfPrHexDigit;
00970                 else if (name == "ASCII_Hex_Digit") prop = ucfPrAsciiHexDigit;
00971                 else if (name == "Other_Alphabetic") propx = ucfPxOtherAlphabetic;
00972                 else if (name == "Ideographic") prop = ucfPrIdeographic;
00973                 else if (name == "Diacritic") prop = ucfPrDiacritic;
00974                 else if (name == "Extender") prop = ucfPrExtender;
00975                 else if (name == "Other_Lowercase") propx = ucfPxOtherLowercase;
00976                 else if (name == "Other_Uppercase") propx = ucfPxOtherUppercase;
00977                 else if (name == "Noncharacter_Code_Point") prop = ucfPrNoncharacterCodePoint;
00978                 else if (name == "Other_Grapheme_Extend") propx = ucfPxOtherGraphemeExtend;
00979                 else if (name == "IDS_Binary_Operator") propx = ucfPxIdsBinaryOperator;
00980                 else if (name == "IDS_Trinary_Operator") propx = ucfPxIdsTrinaryOperator;
00981                 else if (name == "Radical") propx = ucfPxRadical;
00982                 else if (name == "Unified_Ideograph") propx = ucfPxUnifiedIdeograph;
00983                 else if (name == "Other_Default_Ignorable_Code_Point") propx = ucfPxOtherDefaultIgnorableCodePoint;
00984                 else if (name == "Deprecated") prop = ucfPrDeprecated;
00985                 else if (name == "Soft_Dotted") prop = ucfPrSoftDotted;
00986                 else if (name == "Logical_Order_Exception") prop = ucfPrLogicalOrderException;
00987                 else if (name == "Other_ID_Start") propx = ucfPxOtherIdStart;
00988                 else if (name == "Other_ID_Continue") propx = ucfPxOtherIdContinue;
00989                 else if (name == "STerm") prop = ucfPrSTerm;
00990                 else if (name == "Variation_Selector") prop = ucfPrVariationSelector;
00991                 else if (name == "Pattern_White_Space") prop = ucfPrPatternWhiteSpace;
00992                 else if (name == "Pattern_Syntax") prop = ucfPrPatternSyntax;
00993                 else FailR(name.CStr());
00994                 helper.ProcessComment(reader);
00995                 for (int cp = from; cp <= to; cp++) {
                              // Newly added code points get their category from the helper.
00996                         int keyId = h.GetKeyId(cp); if (keyId < 0) { keyId = h.AddKey(cp); helper.SetCat(cp); }
00997                         TUniChInfo &info = h[keyId]; helper.TestCat(cp);
00998                         if (prop) { IAssert(! info.IsProperty(prop)); info.SetProperty(prop); }
00999                         if (propx) { IAssert(! info.IsPropertyX(propx)); info.SetPropertyX(propx); }
01000                         cpCount++; }
01001                 lineCount++;
01002         }
01003         reader.Close();
01004         printf("TUniChDb::InitPropList: %d lines, %d code points.\n", lineCount, cpCount);
01005 }
01006 
// Reads the derived core properties file (GetDerivedCorePropsFn() under
// 'basePath') and sets the matching Dcp flag on every code point of each
// listed range.  "Grapheme_Link" lines are skipped entirely; any other
// unrecognized property name triggers FailR.  Each flag is asserted not to be
// set already.  Prints the number of lines and code points handled.
01007 void TUniChDb::InitDerivedCoreProperties(const TStr& basePath)
01008 {
01009         TUcdFileReader reader; TStrV fields; int cpCount = 0, lineCount = 0;
01010         reader.Open(CombinePath(basePath, GetDerivedCorePropsFn()));
01011         TSubcatHelper helper(*this);
01012         while (reader.GetNextLine(fields))
01013         {
01014                 IAssert(fields.Len() == 2);
01015                 int from, to; reader.ParseCodePointRange(fields[0], from, to);
01016                 TStr name = fields[1];
01017                 if (name == "Grapheme_Link") continue; // this flag is deprecated; test for combClass == Virama instead
                      // The initial value is only a placeholder; every accepted name overwrites it.
01018                 TUniChFlags flag = ucfCompatibilityDecomposition;
01019                 if (name == "Math") flag = ucfDcpMath;
01020                 else if (name == "Alphabetic") flag = ucfDcpAlphabetic;
01021                 else if (name == "Lowercase") flag = ucfDcpLowercase;
01022                 else if (name == "Uppercase") flag = ucfDcpUppercase;
01023                 else if (name == "ID_Start") flag = ucfDcpIdStart;
01024                 else if (name == "ID_Continue") flag = ucfDcpIdContinue;
01025                 else if (name == "XID_Start") flag = ucfDcpXidStart;
01026                 else if (name == "XID_Continue") flag = ucfDcpXidContinue;
01027                 else if (name == "Default_Ignorable_Code_Point") flag = ucfDcpDefaultIgnorableCodePoint;
01028                 else if (name == "Grapheme_Extend") flag = ucfDcpGraphemeExtend;
01029                 else if (name == "Grapheme_Base") flag = ucfDcpGraphemeBase;
01030                 else FailR(name.CStr());
01031                 // Code points that are new to the hash table also need a category.
01032                 // It is expected to be given in the line's comment, e.g. "# Cf       SOFT HYPHEN".
01033                 helper.ProcessComment(reader);
01034                 //
01035                 for (int cp = from; cp <= to; cp++) {
01036                         int keyId = h.GetKeyId(cp); if (keyId < 0) { keyId = h.AddKey(cp); helper.SetCat(cp); }
01037                         helper.TestCat(cp);
01038                         TUniChInfo &info = h[keyId]; IAssert(! info.IsDcpFlag(flag));
01039                         info.SetDcpFlag(flag); cpCount++; }
01040                 lineCount++;
01041         }
01042         reader.Close();
01043         printf("TUniChDb::InitDerivedCoreProperties: %d lines, %d code points.\n", lineCount, cpCount);
01044 }
01045 
// Resets the line-break class of every character in 'h' to LineBreak_Unknown,
// then reads the line-break file (GetLineBreakFn() under 'basePath') and stores
// the two-letter class code of each listed range.  Lines whose code maps to
// LineBreak_Unknown are skipped and not counted.  Code points missing from 'h'
// are added with a warning, since no category is available for them here.
01046 void TUniChDb::InitLineBreaks(const TStr& basePath)
01047 {
01048         // Start from a clean slate: every character is 'unknown'.
01049         const ushort lbUnknown = TUniChInfo::LineBreak_Unknown;
01050         for (int keyId = h.FFirstKeyId(); h.FNextKeyId(keyId); ) { h[keyId].lineBreak = lbUnknown; }
01051         // Now process the line-break file.
01052         TUcdFileReader reader; TStrV fields;
01053         reader.Open(CombinePath(basePath, GetLineBreakFn()));
01054         int lineCount = 0, cpCount = 0;
01055         while (reader.GetNextLine(fields))
01056         {
01057                 IAssert(fields.Len() == 2);
01058                 int from, to; reader.ParseCodePointRange(fields[0], from, to);
01059                 TStr code = fields[1]; IAssert(code.Len() == 2);
01060                 const ushort lbClass = TUniChInfo::GetLineBreakCode(code[0], code[1]);
01061                 if (lbClass == lbUnknown) { continue; }
01062                 for (int cp = from; cp <= to; cp++) {
01063                         int keyId = h.GetKeyId(cp); if (keyId < 0) { keyId = h.AddKey(cp);
01064                                 printf("TUniChDb::InitLineBreaks: warning, adding codepoint %d, its category will remain unknown.\n", cp); }
                              // Each code point may receive a class only once.
01065                         IAssert(h[keyId].lineBreak == lbUnknown);
01066                         h[keyId].lineBreak = lbClass; cpCount++; }
01067                 lineCount++;
01068         }
01069         reader.Close();
01070         printf("TUniChDb::InitLineBreaks: %d lines, %d codepoints processed (excluding \'xx\' values).\n", lineCount, cpCount);
01071 }
01072 
// Reads the scripts file (GetScriptsFn() under 'basePath') and stores, for every
// code point of each listed range, the key id of its script name in 'scripts'.
// The value kept in 'scripts' for a name is incremented once per line that
// mentions it.  Code points missing from 'h' are added and get their category
// through the TSubcatHelper.  After reading, the entry for the unknown-script
// name (GetScriptNameUnknown()) is set to 0, and the number of scripts is printed.
01073 void TUniChDb::InitScripts(const TStr& basePath)
01074 {
01075         TUcdFileReader reader; TStrV fields;
01076         reader.Open(CombinePath(basePath, GetScriptsFn()));
01077         TSubcatHelper helper(*this);
01078         while (reader.GetNextLine(fields))
01079         {
                      // Validate the field count before indexing fields[1], as the other loaders in this file do.
01080                 IAssert(fields.Len() == 2);
01081                 int from, to; reader.ParseCodePointRange(fields[0], from, to);
01082                 TStr scriptName = fields[1]; int scriptNo = scripts.GetKeyId(scriptName);
01083                 if (scriptNo < 0) { scriptNo = scripts.AddKey(scriptName); scripts[scriptNo] = 0; }
01084                 IAssert(scriptNo >= 0 && scriptNo < SCHAR_MAX); // because TUniChInfo.script is a signed char
01085                 scripts[scriptNo] += 1;
01086                 helper.ProcessComment(reader);
01087                 for (int cp = from; cp <= to; cp++) {
01088                         int i = h.GetKeyId(cp); if (i < 0) { i = h.AddKey(cp); helper.SetCat(cp); }
01089                         helper.TestCat(cp);
01090                         TUniChInfo &ci = h[i]; ci.script = scriptNo; }
01091         }
01092         reader.Close();
01093         scripts.AddDat(GetScriptNameUnknown()) = 0;
01094         printf("TUniChDb::InitScripts: %d scripts: ", scripts.Len());
              // The per-script listing is disabled; only the count and a newline are printed.
01095         if (AlwaysFalse()) for (int i = scripts.FFirstKeyId(); scripts.FNextKeyId(i); )
01096                 printf("  %d:%s (%d)", i, scripts.GetKey(i).CStr(), int(scripts[i]));
01097         printf("\n");
01098 }
01099 
// Recomputes the word-boundary (Wb) and sentence-boundary (Sb) flags of every
// character in 'h' from data that is already loaded: sub-category, script,
// line-break class and property flags.  It then reads the word-break and
// sentence-break property files from the auxiliary directory under 'basePath'
// and calls Fail if, for any code point, the flags derived here differ from
// the ones listed in those files.
// The Katakana and Hiragana script names must already be known (asserted).
01100 void TUniChDb::InitWordAndSentenceBoundaryFlags(const TStr& basePath)
01101 {
01102         // UAX #29, sec. 4.1 and 5.1.
01103         // Note: these flags can also be initialized from auxiliary\\WordBreakProperty.txt.
01104         int katakana = GetScriptByName(GetScriptNameKatakana()); IAssert(katakana >= 0);
01105         int hiragana = GetScriptByName(GetScriptNameHiragana()); IAssert(hiragana >= 0);
01106         // Clear any existing word-boundary flags and initialize them again.
01107         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
01108         {
01109                 const int cp = h.GetKey(i); TUniChInfo& ci = h[i];
01110                 ci.ClrWbAndSbFlags();
01111                 // Word-boundary flags.
01112                 if (ci.subCat  == ucOtherFormat && cp != 0x200c && cp != 0x200d) ci.SetWbFlag(ucfWbFormat);
01113                 if (ci.script == katakana) ci.SetWbFlag(ucfWbKatakana);
01114                 if (ci.lineBreak == TUniChInfo::LineBreak_InfixNumeric && cp != 0x3a) ci.SetWbFlag(ucfWbMidNum);
01115                 if (ci.lineBreak == TUniChInfo::LineBreak_Numeric) ci.SetWbFlag(ucfWbNumeric);
01116                 if (ci.subCat == ucPunctuationConnector) ci.SetWbFlag(ucfWbExtendNumLet);
01117                 // Sentence-boundary flags.  Some are identical to some word-boundary flags.
                      // The order below matters: Sp tests Sep, OLetter tests Lower/Upper, and
                      // Close tests ATerm/STerm, all of which are set by earlier statements.
01118                 if (cp == 0xa || cp == 0xd || cp == 0x85 || cp == 0x2028 || cp == 0x2029) ci.SetSbFlag(ucfSbSep);
01119                 if (ci.subCat == ucOtherFormat && cp != 0x200c && cp != 0x200d) ci.SetSbFlag(ucfSbFormat);
01120                 if (ci.IsWhiteSpace() && ! ci.IsSbFlag(ucfSbSep) && cp != 0xa0) ci.SetSbFlag(ucfSbSp);
01121                 if (ci.IsLowercase() && ! ci.IsGraphemeExtend()) ci.SetSbFlag(ucfSbLower);
01122                 if (ci.IsUppercase() || ci.subCat == ucLetterTitlecase) ci.SetSbFlag(ucfSbUpper);
01123                 if ((ci.IsAlphabetic() || cp == 0xa0 || cp == 0x5f3) && ! ci.IsSbFlag(ucfSbLower) && ! ci.IsSbFlag(ucfSbUpper) && ! ci.IsGraphemeExtend()) ci.SetSbFlag(ucfSbOLetter);
01124                 if (ci.lineBreak == TUniChInfo::LineBreak_Numeric) ci.SetSbFlag(ucfSbNumeric);
01125                 if (cp == 0x2e) ci.SetSbFlag(ucfSbATerm);
01126                 // Note: UAX #29 says that if the property STerm = true, then the character should belong to the STerm class for
01127                 // the purposes of sentence-boundary detection.  Now in PropList.txt there is no doubt that 002E has the STerm
01128                 // property; thus, it should also belong to the STerm sentence-boundary class.  However, in
01129                 // SentenceBreakProperty.txt, 002E is only listed in the ATerm class, but not in the STerm class.
01130                 if (ci.IsSTerminal() && cp != 0x2e) ci.SetSbFlag(ucfSbSTerm);
01131                 if ((ci.subCat == ucPunctuationOpen || ci.subCat == ucPunctuationClose || ci.lineBreak == TUniChInfo::LineBreak_Quotation) && cp != 0x5f3 && ! ci.IsSbFlag(ucfSbATerm) && ! ci.IsSbFlag(ucfSbSTerm)) ci.SetSbFlag(ucfSbClose);
01132         }
01133         // Some additional characters for Katakana and MidLetter.
              // NOTE(review): h.GetDat() is called without checking that these code points
              // are keys of 'h'; presumably they are always present after loading -- confirm.
01134         TIntV v = (VB, 0x3031, 0x3032, 0x3033, 0x3034, 0x3035, 0x309b, 0x309c, 0x30a0, 0x30fc, 0xff70, 0xff9e, 0xff9f);
01135         for (int i = 0; i < v.Len(); i++) h.GetDat(v[i]).SetWbFlag(ucfWbKatakana);
01136         v = (VB, 0x27, 0xb7, 0x5f4, 0x2019, 0x2027, 0x3a);
01137         for (int i = 0; i < v.Len(); i++) h.GetDat(v[i]).SetWbFlag(ucfWbMidLetter);
01138         // WbALetter depends on Katakana, so it cannot be initialized earlier.
01139         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
01140         {
01141                 const int cp = h.GetKey(i); TUniChInfo& ci = h[i];
01142                 if ((ci.IsAlphabetic() || cp == 0x5f3) && ! ci.IsIdeographic() && ! ci.IsWbFlag(ucfWbKatakana) && ci.lineBreak != TUniChInfo::LineBreak_ComplexContext && ci.script != hiragana && ! ci.IsGraphemeExtend())
01143                         ci.SetWbFlag(ucfWbALetter);
01144         }
01145         // An alternative is to extract the flags from WordBreakProperty.txt.
01146         // The results should be the same.
              // 'hh' collects, per code point, the OR of all word-break flags listed in the file.
01147         {TUcdFileReader reader; TStrV fields;
01148         reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), GetWordBreakPropertyFn()));
01149         THash<TInt, TInt> hh;
01150         while (reader.GetNextLine(fields))
01151         {
01152                 IAssert(fields.Len() == 2);
01153                 int from, to; reader.ParseCodePointRange(fields[0], from, to);
01154                 TStr s = fields[1];
                      // Placeholder initial value; every accepted name below overwrites it.
01155                 TUniChFlags flag = ucfCompatibilityDecomposition;
01156                 if (s == "Format") flag = ucfWbFormat;
01157                 else if (s == "Katakana") flag = ucfWbKatakana;
01158                 else if (s == "ALetter") flag = ucfWbALetter;
01159                 else if (s == "MidLetter") flag = ucfWbMidLetter;
01160                 else if (s == "MidNum") flag = ucfWbMidNum;
01161                 else if (s == "Numeric") flag = ucfWbNumeric;
01162                 else if (s == "ExtendNumLet") flag = ucfWbExtendNumLet;
01163                 else FailR(s.CStr());
01164                 for (int c = from; c <= to; c++) {
01165                         int i = hh.GetKeyId(c); if (i < 0) hh.AddDat(c, flag);
01166                         else hh[i].Val |= flag; }
01167         }
01168         reader.Close();
              // Compare over the union of code points known to 'h' and to the file.
01169         TIntV cps; for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) cps.Add(h.GetKey(i));
01170         for (int i = hh.FFirstKeyId(); hh.FNextKeyId(i); ) cps.Add(hh.GetKey(i));
01171         cps.Sort(); cps.Merge();
01172         for (int i = 0; i < cps.Len(); i++)
01173         {
01174                 int cp = cps[i];
01175                 int flags1 = 0; if (h.IsKey(cp)) flags1 = h.GetDat(cp).GetWbFlags();
01176                 int flags2 = 0; if (hh.IsKey(cp)) flags2 = hh.GetDat(cp);
                      // NOTE(review): the ucfSbSep bit is masked out of both word-boundary flag sets
                      // before comparing; the reason is not visible in this function -- confirm.
01177                 flags1 &= ~ucfSbSep; flags2 &= ~ucfSbSep;
01178                 if (flags1 != flags2) {
01179                         printf("cp = %04x: flags1 = %08x flags2 = %08x xor = %08x\n", cp, flags1, flags2, flags1 ^ flags2);
01180                         Fail; }
01181         }}
01182         // Likewise, for sentence boundary flags we have SentenceBreakProperty.txt.
01183         {TUcdFileReader reader; TStrV fields;
01184         reader.Open(CombinePath(CombinePath(basePath, GetAuxiliaryDir()), GetSentenceBreakPropertyFn()));
01185         THash<TInt, TInt> hh;
01186         while (reader.GetNextLine(fields))
01187         {
01188                 IAssert(fields.Len() == 2);
01189                 int from, to; reader.ParseCodePointRange(fields[0], from, to);
01190                 TStr s = fields[1];
                      // Placeholder initial value; every accepted name below overwrites it.
01191                 TUniChFlags flag = ucfCompatibilityDecomposition;
01192                 if (s == "Sep") flag = ucfSbSep;
01193                 else if (s == "Format") flag = ucfSbFormat;
01194                 else if (s == "Sp") flag = ucfSbSp;
01195                 else if (s == "Lower") flag = ucfSbLower;
01196                 else if (s == "Upper") flag = ucfSbUpper;
01197                 else if (s == "OLetter") flag = ucfSbOLetter;
01198                 else if (s == "Numeric") flag = ucfSbNumeric;
01199                 else if (s == "ATerm") flag = ucfSbATerm;
01200                 else if (s == "STerm") flag = ucfSbSTerm;
01201                 else if (s == "Close") flag = ucfSbClose;
01202                 else FailR(s.CStr());
01203                 for (int c = from; c <= to; c++) {
01204                         int i = hh.GetKeyId(c); if (i < 0) hh.AddDat(c, flag);
01205                         else hh[i].Val |= flag; }
01206         }
01207         reader.Close();
              // Same union-and-compare check as for the word-boundary flags above.
01208         TIntV cps; for (int i = h.FFirstKeyId(); h.FNextKeyId(i); ) cps.Add(h.GetKey(i));
01209         for (int i = hh.FFirstKeyId(); hh.FNextKeyId(i); ) cps.Add(hh.GetKey(i));
01210         cps.Sort(); cps.Merge();
01211         for (int i = 0; i < cps.Len(); i++)
01212         {
01213                 int cp = cps[i];
01214                 int flags1 = 0; if (h.IsKey(cp)) flags1 = h.GetDat(cp).GetSbFlags();
01215                 int flags2 = 0; if (hh.IsKey(cp)) flags2 = hh.GetDat(cp);
01216                 if (flags1 != flags2) {
01217                         printf("cp = %04x: flags1 = %08x [%s] flags2 = %08x [%s] xor = %08x\n", cp,
01218                                 flags1, TUniChInfo::GetSbFlagsStr(flags1).CStr(),
01219                                 flags2, TUniChInfo::GetSbFlagsStr(flags2).CStr(),
01220                                 flags1 ^ flags2);
01221                         Fail; }
01222         }}
01223 }
01224 
// Reads the special casing file (GetSpecialCasingFn() under 'basePath') and
// stores the unconditional lower-, title- and upper-case mappings of each
// listed code point in specialCasingLower / specialCasingTitle /
// specialCasingUpper.  Lines that carry a non-empty condition field are skipped.
01225 void TUniChDb::InitSpecialCasing(const TStr& basePath)
01226 {
01227         TUcdFileReader reader; TStrV fields;
01228         reader.Open(CombinePath(basePath, GetSpecialCasingFn()));
01229         while (reader.GetNextLine(fields))
01230         {
                      // A line has 5 or 6 fields, the last of which is always empty.
01231                 const int nFields = fields.Len();
01232                 IAssert(nFields == 5 || nFields == 6); IAssert(fields.Last().Empty());
01233                 // Conditional mappings are not stored here -- they are hardcoded in the GetCaseConverted method.
01234                 TStr condition = "";
01235                 if (nFields == 6) { condition = fields[4]; }
01236                 condition.ToTrunc();
01237                 if (! condition.Empty()) { continue; }
01238                 // Unconditional mappings: fields 1, 2, 3 hold the lower, title and upper case forms.
01239                 const int cp = reader.ParseCodePoint(fields[0]); TIntV mapping;
01240                 reader.ParseCodePointList(fields[1], mapping); specialCasingLower.AddDat(cp, mapping);
01241                 reader.ParseCodePointList(fields[2], mapping); specialCasingTitle.AddDat(cp, mapping);
01242                 reader.ParseCodePointList(fields[3], mapping); specialCasingUpper.AddDat(cp, mapping);
01245         }
01246         reader.Close();
01247 }
01248 
// Builds the whole database from the Unicode text files found under 'basePath'.
// Steps, in order:
//   1. Clr(), re-create 'h', load the case folding table.
//   2. Read the main per-character file (GetUnicodeDataFn()): name, category,
//      combining class, decomposition and simple case mappings.
//   3. InitScripts, InitPropList, InitDerivedCoreProperties, InitLineBreaks,
//      InitSpecialCasing.
//   4. Mark composition exclusions (singleton and non-starter decompositions,
//      plus the explicit exclusion file) and build 'inverseDec', which maps a
//      pair of code points to the character that canonically decomposes to it.
//   5. InitWordAndSentenceBoundaryFlags.
//   6. Force Hangul syllables to be starters and fill in identity case mappings
//      and the unknown script wherever they are still missing.
01249 void TUniChDb::LoadTxt(const TStr& basePath)
01250 {
01251         Clr();
01252         // Set up a hash table with enough ports that there will be more or less no chains longer than 1 element.
01253         h = THash<TInt, TUniChInfo>(196613, true);
01254         //
01255         caseFolding.LoadTxt(CombinePath(basePath, GetCaseFoldingFn()));
01256         //
              // 'seen' guards against the same code point appearing on two lines.
01257         TUcdFileReader reader; TStrV fields; TIntH seen;
01258         reader.Open(CombinePath(basePath, GetUnicodeDataFn()));
01259         while (reader.GetNextLine(fields))
01260         {
01261                 // Codepoint.
01262                 int cp = reader.ParseCodePoint(fields[0]);
01263                 IAssert(! seen.IsKey(cp)); seen.AddKey(cp);
01264                 TUniChInfo& ci = h.AddDat(cp);
01265                 // Name.
01266                 ci.nameOffset = charNames.AddStr(fields[1]);
01267                 // Category.
                      // NOTE(review): 's' is a reference bound to fields[2], so each later
                      // 's = fields[k]' copies fields[k] into fields[2] instead of rebinding.
                      // This looks harmless because fields[2] is not read again in this iteration.
01268                 TStr& s = fields[2]; IAssert(s.Len() == 2);
01269                 ci.chCat = s[0]; ci.chSubCat = s[1];
01270                 // Canonical combining class.
01271                 s = fields[3]; IAssert(s.Len() > 0);
01272                 int i; bool ok = s.IsInt(true, TUCh::Mn, TUCh::Mx, i); IAssertR(ok, s);
01273                 ci.combClass = (uchar) i;
01274                 // Decomposition type and mapping.
01275                 LoadTxt_ProcessDecomposition(ci, fields[5]);
01276                 // Simple case mappings.
                      // An empty field is stored as -1 here and replaced by the identity mapping at the end of this function.
01277                 s = fields[12]; ci.simpleUpperCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
01278                 s = fields[13]; ci.simpleLowerCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
01279                 s = fields[14]; ci.simpleTitleCaseMapping = (! s.Empty() ? reader.ParseCodePoint(s) : -1);
01280                 //
01281                 ci.InitAfterLoad(); // initializes ci.cat, ci.subCat
01282         }
01283         reader.Close();
01284         //
01285         InitScripts(basePath);
01286         //
01287         InitPropList(basePath);
01288         InitDerivedCoreProperties(basePath);
01289         InitLineBreaks(basePath);
01290         InitSpecialCasing(basePath);
01291         // Process the composition exclusions (UAX #15, sec. 6).
01292         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
01293         {
01294                 TUniChInfo& ci = h[i];
01295                 int ofs = ci.decompOffset; if (ofs < 0) continue;
                      // Each decomposition in 'decompositions' ends with a negative terminator; count its length.
01296                 int n = 0; while (decompositions[ofs + n] >= 0) n++;
01297                 IAssert(n > 0);
01298                 // Singleton decompositions.
01299                 if (n == 1) { ci.flags |= ucfCompositionExclusion; continue; }
01300                 // Non-starter decompositions.
01301                 int cp1 = decompositions[ofs];
01302                 IAssert(h.IsKey(cp1));
01303                 uchar ccc = h.GetDat(cp1).combClass;
01304                 if (ccc != TUniChInfo::ccStarter) { ci.flags |= ucfCompositionExclusion; continue; }
01305         }
01306         // Process the composition exclusion table.
01307         reader.Open(CombinePath(basePath, GetCompositionExclusionsFn()));
01308         int nExclusionTable = 0;
01309         while (reader.GetNextLine(fields))
01310         {
01311                 IAssert(fields.Len() == 1);
01312                 int cp = reader.ParseCodePoint(fields[0]);
01313                 int i = h.GetKeyId(cp); IAssert(i >= 0);
01314                 h[i].flags |= ucfCompositionExclusion;
01315                 nExclusionTable++;
01316         }
01317         reader.Close();
01318         // Prepare the inverted index for composition pairs.
              // Only non-excluded, non-compatibility decompositions of exactly two code points are indexed.
01319         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
01320         {
01321                 int cp = h.GetKey(i);
01322                 TUniChInfo& ci = h[i];
01323                 int ofs = ci.decompOffset; if (ofs < 0) continue;
01324                 if (ci.IsCompositionExclusion()) continue;
01325                 if (ci.IsCompatibilityDecomposition()) continue;
01326                 int n = 0; while (decompositions[ofs + n] >= 0) n++;
01327                 if (n != 2) continue;
01328                 TIntPr pr = TIntPr(decompositions[ofs], decompositions[ofs + 1]);
01329                 IAssert(! inverseDec.IsKey(pr));
01330                 IAssert(ci.combClass == TUniChInfo::ccStarter);
01331                 inverseDec.AddDat(pr, cp);
01332         }
01333         printf("TUniChDb(%s): %d chars in h, %d in decomp inverse index; %d in decomp vector; %d in exclusion table\n",
01334                 basePath.CStr(), h.Len(), inverseDec.Len(), decompositions.Len(), nExclusionTable);
01335         // Before calling InitWordBoundaryFlags(), scripts must have been initialized, as well as
01336         // flags such as Alphabetic, Word_Break, and Grapheme_Extend.
01337         InitWordAndSentenceBoundaryFlags(basePath); // Note: scripts must have been initialized by this point.
01338         // Make sure that Hangul combined characters are treated as starters.
01339         for (int cp = HangulSBase; cp < HangulSBase + HangulSCount; cp++)
01340         {
01341                 int j = h.GetKeyId(cp); if (j < 0) continue;
01342                 TUniChInfo& ci = h[j];
01343                 if (ci.combClass == TUniChInfo::ccInvalid) ci.combClass = TUniChInfo::ccStarter;
01344                 IAssert(ci.combClass == TUniChInfo::ccStarter);
01345         }
01346         // There should be no more additions to 'h' beyond this point.
01347         const int oldHLen = h.Len();
01348         // Provide default (identity) case mappings if any were missing from UnicodeData.txt
01349         // (or if any entirely new characters were added later, e.g. while reading LineBreaks.txt).
              // NOTE(review): this local has the same name as the identifier assigned in
              // InitAfterLoad().  If that identifier is a class member, it is shadowed here
              // and LoadTxt() leaves the member itself unset -- confirm whether that is intended.
01350         int scriptUnknown = GetScriptByName(GetScriptNameUnknown());
01351         for (int i = h.FFirstKeyId(); h.FNextKeyId(i); )
01352         {
01353                 int cp = h.GetKey(i); TUniChInfo &ci = h[i];
01354                 if (ci.simpleLowerCaseMapping < 0) ci.simpleLowerCaseMapping = cp;
01355                 if (ci.simpleUpperCaseMapping < 0) ci.simpleUpperCaseMapping = cp;
01356                 if (ci.simpleTitleCaseMapping < 0) ci.simpleTitleCaseMapping = cp;
01357                 if (ci.script < 0) ci.script = scriptUnknown;
01358         }
01359         IAssert(h.Len() == oldHLen);
01360 }
01361 
// Writes the database in binary form to the file 'fnBinUcd' by passing a
// newly created file output stream to Save().
01362 void TUniChDb::SaveBin(const TStr& fnBinUcd) {
01364         PSOut binOut = TFOut::New(fnBinUcd);
01365         Save(*binOut);
01366 }
01367 
// Looks up the id of the unknown-script name and caches it in 'scriptUnknown';
// asserts that the name is registered.
01368 void TUniChDb::InitAfterLoad() {
01369         scriptUnknown = GetScriptByName(GetScriptNameUnknown());
01370         IAssert(scriptUnknown >= 0);
01371 }
01372 
01373 //-----------------------------------------------------------------------------
01374 // TUniChDb -- main test driver
01375 //-----------------------------------------------------------------------------
01376 
// Test driver: loads the database from the text files under 'basePath', saves
// it to the binary file (GetBinFn() under 'basePath'), re-creates this object
// in place, loads the binary file back, and then runs the case folding, word
// and sentence boundary, composition and case conversion tests.
01377 void TUniChDb::Test(const TStr& basePath)
01378 {
01379         TStr binFile = CombinePath(basePath, GetBinFn());
              // The leading 'true' forces this branch on every run; the existence check is never evaluated.
01380         if (true || ! TFile::Exists(binFile)) {
01382                 // Exercise LoadTxt, then Save.  The inner scope limits the lifetime of the output handle.
01383                 LoadTxt(basePath);
01385                 { PSOut out = TFOut::New(binFile); Save(*out); }
01387         }
01388         // Exercise Load on a freshly constructed object: destroy and re-create *this in place.
01389         this->~TUniChDb(); new(this) TUniChDb();
01391         { PSIn in = TFIn::New(binFile); Load(*in); }
01393         // Case folding.
01394         caseFolding.Test();
01395         // Word breaking.
01396         TestWbFindNonIgnored();
01397         // Boundary detection, once with 'true' and once with 'false'.
01398         TestFindNextWordOrSentenceBoundary(basePath, true);
01399         TestFindNextWordOrSentenceBoundary(basePath, false);
01400         // Composition and decomposition.
01401         TestComposition(basePath);
01402         // Case conversions.
01403         TestCaseConversions();
01404 }
01405 
01406 //-----------------------------------------------------------------------------
01407 // T8BitCodec -- a class for converting between 8-bit encodings and Unicode
01408 //-----------------------------------------------------------------------------
01409 
01410 //-----------------------------------------------------------------------------
01411 // ISO-8859-2
01412 //-----------------------------------------------------------------------------
01413 
// ISO-8859-2 decoding table: 6 rows of 16 entries.  Per the row labels, entry k
// holds the Unicode code point for byte value 0xa0 + k (bytes 0xa0..0xff).
// NOTE(review): the code that indexes this table is not in this part of the
// file -- confirm the offset convention there.
01414 const int TEncoding_ISO8859_2::toUnicodeTable[6 * 16] =
01415 {
01416         /* 0xa0 */ 0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7, 0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b,
01417         /* 0xb0 */ 0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7, 0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c,
01418         /* 0xc0 */ 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
01419         /* 0xd0 */ 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
01420         /* 0xe0 */ 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
01421         /* 0xf0 */ 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9
01422 };
01423 
// ISO-8859-2 encoding table, part 1: 14 rows of 16 entries.  Per the row labels,
// entry k corresponds to Unicode code point U+00a0 + k (U+00a0..U+017f) and holds
// the ISO-8859-2 byte value for it.
// NOTE(review): -1 presumably marks code points that have no ISO-8859-2
// equivalent -- confirm against the lookup code.
01424 const int TEncoding_ISO8859_2::fromUnicodeTable1[14 * 16] = {
01425         /* U+00a0 */ 0x00a0,     -1,     -1,     -1, 0x00a4,     -1,     -1, 0x00a7, 0x00a8,     -1,     -1,     -1,     -1, 0x00ad,     -1,     -1,
01426         /* U+00b0 */ 0x00b0,     -1,     -1,     -1, 0x00b4,     -1,     -1,     -1, 0x00b8,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01427         /* U+00c0 */     -1, 0x00c1, 0x00c2,     -1, 0x00c4,     -1,     -1, 0x00c7,     -1, 0x00c9,     -1, 0x00cb,     -1, 0x00cd, 0x00ce,     -1,
01428         /* U+00d0 */     -1,     -1,     -1, 0x00d3, 0x00d4,     -1, 0x00d6, 0x00d7,     -1,     -1, 0x00da,     -1, 0x00dc, 0x00dd,     -1, 0x00df,
01429         /* U+00e0 */     -1, 0x00e1, 0x00e2,     -1, 0x00e4,     -1,     -1, 0x00e7,     -1, 0x00e9,     -1, 0x00eb,     -1, 0x00ed, 0x00ee,     -1,
01430         /* U+00f0 */     -1,     -1,     -1, 0x00f3, 0x00f4,     -1, 0x00f6, 0x00f7,     -1,     -1, 0x00fa,     -1, 0x00fc, 0x00fd,     -1,     -1,
01431         /* U+0100 */     -1,     -1, 0x00c3, 0x00e3, 0x00a1, 0x00b1, 0x00c6, 0x00e6,     -1,     -1,     -1,     -1, 0x00c8, 0x00e8, 0x00cf, 0x00ef,
01432         /* U+0110 */ 0x00d0, 0x00f0,     -1,     -1,     -1,     -1,     -1,     -1, 0x00ca, 0x00ea, 0x00cc, 0x00ec,     -1,     -1,     -1,     -1,
01433         /* U+0120 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1 /* blank */,
01434         /* U+0130 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00c5, 0x00e5,     -1,     -1, 0x00a5, 0x00b5,     -1,
01435         /* U+0140 */     -1, 0x00a3, 0x00b3, 0x00d1, 0x00f1,     -1,     -1, 0x00d2, 0x00f2,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01436         /* U+0150 */ 0x00d5, 0x00f5,     -1,     -1, 0x00c0, 0x00e0,     -1,     -1, 0x00d8, 0x00f8, 0x00a6, 0x00b6,     -1,     -1, 0x00aa, 0x00ba,
01437         /* U+0160 */ 0x00a9, 0x00b9, 0x00de, 0x00fe, 0x00ab, 0x00bb,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00d9, 0x00f9,
01438         /* U+0170 */ 0x00db, 0x00fb,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00ac, 0x00bc, 0x00af, 0x00bf, 0x00ae, 0x00be,     -1
01439 };
01440 
// ISO-8859-2 encoding table, part 2: 2 rows of 16 entries.  Per the row labels,
// entry k corresponds to Unicode code point U+02c0 + k (U+02c0..U+02df) and holds
// the ISO-8859-2 byte value for it.
// NOTE(review): -1 presumably marks code points without an ISO-8859-2 equivalent -- confirm.
01441 const int TEncoding_ISO8859_2::fromUnicodeTable2[2 * 16] = {
01442         /* U+02c0 */     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00b7,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01443         /* U+02d0 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00a2, 0x00ff,     -1, 0x00b2,     -1, 0x00bd,     -1,     -1
01444 };
01445 
01446 //-----------------------------------------------------------------------------
01447 // ISO-8859-3
01448 //-----------------------------------------------------------------------------
01449 
// ISO-8859-3 decoding table: 6 rows of 16 entries.  Per the row labels, entry k
// holds the Unicode code point for byte value 0xa0 + k (bytes 0xa0..0xff).
// NOTE(review): the -1 entries (0xa5, 0xae, 0xbe, 0xc3, 0xd0, 0xe3, 0xf0)
// presumably mark byte values that ISO-8859-3 leaves unassigned -- confirm
// how the decoder treats them.
01450 const int TEncoding_ISO8859_3::toUnicodeTable[6 * 16] = {
01451         /* 0xa0 */ 0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4,     -1, 0x0124, 0x00a7, 0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad,     -1, 0x017b,
01452         /* 0xb0 */ 0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7, 0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd,     -1, 0x017c,
01453         /* 0xc0 */ 0x00c0, 0x00c1, 0x00c2,     -1, 0x00c4, 0x010a, 0x0108, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
01454         /* 0xd0 */     -1, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7, 0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df,
01455         /* 0xe0 */ 0x00e0, 0x00e1, 0x00e2,     -1, 0x00e4, 0x010b, 0x0109, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
01456         /* 0xf0 */     -1, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7, 0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9
01457 };
01458 
// Unicode -> ISO-8859-3 byte table for the code points U+00A0..U+017F
// (14 rows of 16, see the row labels). Entry i belongs to code point
// U+00A0 + i and holds the corresponding byte of the 8-bit encoding.
// -1 presumably means the code point cannot be represented -- confirm
// against the encoder, which is not visible here.
01459 const int TEncoding_ISO8859_3::fromUnicodeTable1[14 * 16] = {
01460         /* U+00a0 */ 0x00a0,     -1,     -1, 0x00a3, 0x00a4,     -1,     -1, 0x00a7, 0x00a8,     -1,     -1,     -1,     -1, 0x00ad,     -1,     -1,
01461         /* U+00b0 */ 0x00b0,     -1, 0x00b2, 0x00b3, 0x00b4, 0x00b5,     -1, 0x00b7, 0x00b8,     -1,     -1,     -1,     -1, 0x00bd,     -1,     -1,
01462         /* U+00c0 */ 0x00c0, 0x00c1, 0x00c2,     -1, 0x00c4,     -1,     -1, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
01463         /* U+00d0 */     -1, 0x00d1, 0x00d2, 0x00d3, 0x00d4,     -1, 0x00d6, 0x00d7,     -1, 0x00d9, 0x00da, 0x00db, 0x00dc,     -1,     -1, 0x00df,
01464         /* U+00e0 */ 0x00e0, 0x00e1, 0x00e2,     -1, 0x00e4,     -1,     -1, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
01465         /* U+00f0 */     -1, 0x00f1, 0x00f2, 0x00f3, 0x00f4,     -1, 0x00f6, 0x00f7,     -1, 0x00f9, 0x00fa, 0x00fb, 0x00fc,     -1,     -1,     -1,
01466         /* U+0100 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00c6, 0x00e6, 0x00c5, 0x00e5,     -1,     -1,     -1,     -1,
01467         /* U+0110 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00d8, 0x00f8, 0x00ab, 0x00bb,
01468         /* U+0120 */ 0x00d5, 0x00f5,     -1,     -1, 0x00a6, 0x00b6, 0x00a1, 0x00b1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01469         /* U+0130 */ 0x00a9, 0x00b9,     -1,     -1, 0x00ac, 0x00bc,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01470         /* U+0140 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1 /* blank */,
01471         /* U+0150 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00de, 0x00fe, 0x00aa, 0x00ba,
01472         /* U+0160 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00dd, 0x00fd,     -1,     -1,
01473         /* U+0170 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00af, 0x00bf,     -1,     -1,     -1,
01474 };
// Unicode -> ISO-8859-3 byte table for just two code points, U+02D8 and
// U+02D9 (the row label gives the first one). Unlike the other
// fromUnicodeTable2 arrays in this file it is not padded to 16-entry rows.
01475 const int TEncoding_ISO8859_3::fromUnicodeTable2[2] = {
01476         /* U+02d8 */ 0x00a2, 0x00ff
01477 };
01478 
01479 //-----------------------------------------------------------------------------
01480 // ISO-8859-4
01481 //-----------------------------------------------------------------------------
01482 
// ISO-8859-4 byte -> Unicode table for the bytes 0xA0..0xFF (six rows of 16,
// labelled by the first byte of each row). Entry i holds the Unicode code
// point for byte 0xA0 + i. Every entry in this table is populated.
01483 const int TEncoding_ISO8859_4::toUnicodeTable[6 * 16] = {
01484         /* 0xa0 */ 0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7, 0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af,
01485         /* 0xb0 */ 0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7, 0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b,
01486         /* 0xc0 */ 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a,
01487         /* 0xd0 */ 0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df,
01488         /* 0xe0 */ 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b,
01489         /* 0xf0 */ 0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9
01490 };
01491 
// Unicode -> ISO-8859-4 byte table for the code points U+00A0..U+017F
// (14 rows of 16, see the row labels). Entry i belongs to code point
// U+00A0 + i and holds the corresponding byte of the 8-bit encoding.
// -1 presumably means the code point cannot be represented -- confirm
// against the encoder, which is not visible here.
01492 const int TEncoding_ISO8859_4::fromUnicodeTable1[14 * 16] = {
01493         /* U+00a0 */ 0x00a0,     -1,     -1,     -1, 0x00a4,     -1,     -1, 0x00a7, 0x00a8,     -1,     -1,     -1,     -1, 0x00ad,     -1, 0x00af,
01494         /* U+00b0 */ 0x00b0,     -1,     -1,     -1, 0x00b4,     -1,     -1,     -1, 0x00b8,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01495         /* U+00c0 */     -1, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6,     -1,     -1, 0x00c9,     -1, 0x00cb,     -1, 0x00cd, 0x00ce,     -1,
01496         /* U+00d0 */     -1,     -1,     -1,     -1, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d8,     -1, 0x00da, 0x00db, 0x00dc,     -1,     -1, 0x00df,
01497         /* U+00e0 */     -1, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6,     -1,     -1, 0x00e9,     -1, 0x00eb,     -1, 0x00ed, 0x00ee,     -1,
01498         /* U+00f0 */     -1,     -1,     -1,     -1, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8,     -1, 0x00fa, 0x00fb, 0x00fc,     -1,     -1,     -1,
01499         /* U+0100 */ 0x00c0, 0x00e0,     -1,     -1, 0x00a1, 0x00b1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00c8, 0x00e8,     -1,     -1,
01500         /* U+0110 */ 0x00d0, 0x00f0, 0x00aa, 0x00ba,     -1,     -1, 0x00cc, 0x00ec, 0x00ca, 0x00ea,     -1,     -1,     -1,     -1,     -1,     -1,
01501         /* U+0120 */     -1,     -1, 0x00ab, 0x00bb,     -1,     -1,     -1,     -1, 0x00a5, 0x00b5, 0x00cf, 0x00ef,     -1,     -1, 0x00c7, 0x00e7,
01502         /* U+0130 */     -1,     -1,     -1,     -1,     -1,     -1, 0x00d3, 0x00f3, 0x00a2,     -1,     -1, 0x00a6, 0x00b6,     -1,     -1,     -1,
01503         /* U+0140 */     -1,     -1,     -1,     -1,     -1, 0x00d1, 0x00f1,     -1,     -1,     -1, 0x00bd, 0x00bf, 0x00d2, 0x00f2,     -1,     -1,
01504         /* U+0150 */     -1,     -1,     -1,     -1,     -1,     -1, 0x00a3, 0x00b3,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01505         /* U+0160 */ 0x00a9, 0x00b9,     -1,     -1,     -1,     -1, 0x00ac, 0x00bc, 0x00dd, 0x00fd, 0x00de, 0x00fe,     -1,     -1,     -1,     -1,
01506         /* U+0170 */     -1,     -1, 0x00d9, 0x00f9,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00ae, 0x00be,     -1,
01507 };
01508 
// Unicode -> ISO-8859-4 byte table for the 32 code points U+02C0..U+02DF
// (two rows of 16, see the row labels). Entry i belongs to code point
// U+02C0 + i and holds the byte value in the 8-bit encoding.
// -1 presumably means "no byte for this code point" -- confirm against the
// lookup code, which is outside this part of the file.
01509 const int TEncoding_ISO8859_4::fromUnicodeTable2[2 * 16] = {
01510         /* U+02c0 */     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00b7,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01511         /* U+02d0 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00ff,     -1, 0x00b2,     -1,     -1,     -1,     -1
01512 };
01513 
01514 //-----------------------------------------------------------------------------
01515 // CP 437
01516 //-----------------------------------------------------------------------------
01517 
// CP 437 byte -> Unicode table for the bytes 0x80..0xFF (eight rows of 16,
// labelled by the first byte of each row). Entry i holds the Unicode code
// point for byte 0x80 + i. Every entry in this table is populated.
01518 const int TEncoding_CP437::toUnicodeTable[8 * 16] = {
01519         /* 0x80 */ 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5,
01520         /* 0x90 */ 0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, 0x00ff, 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192,
01521         /* 0xa0 */ 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba, 0x00bf, 0x2310, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb,
01522         /* 0xb0 */ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510,
01523         /* 0xc0 */ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567,
01524         /* 0xd0 */ 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b, 0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580,
01525         /* 0xe0 */ 0x03b1, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4, 0x03a6, 0x0398, 0x03a9, 0x03b4, 0x221e, 0x03c6, 0x03b5, 0x2229,
01526         /* 0xf0 */ 0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248, 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0
01527 };
01528 
// Unicode -> CP 437 byte table for the code points U+00A0..U+00FF (six rows
// of 16, see the row labels). Entry i belongs to code point U+00A0 + i and
// holds the corresponding CP 437 byte.
// -1 presumably means the code point cannot be represented -- confirm
// against the encoder, which is not visible here.
01529 const int TEncoding_CP437::fromUnicodeTable1[6 * 16] = {
01530         /* U+00a0 */ 0x00ff, 0x00ad, 0x009b, 0x009c,     -1, 0x009d,     -1,     -1,     -1,     -1, 0x00a6, 0x00ae, 0x00aa,     -1,     -1,     -1,
01531         /* U+00b0 */ 0x00f8, 0x00f1, 0x00fd,     -1,     -1, 0x00e6,     -1, 0x00fa,     -1,     -1, 0x00a7, 0x00af, 0x00ac, 0x00ab,     -1, 0x00a8,
01532         /* U+00c0 */     -1,     -1,     -1,     -1, 0x008e, 0x008f, 0x0092, 0x0080,     -1, 0x0090,     -1,     -1,     -1,     -1,     -1,     -1,
01533         /* U+00d0 */     -1, 0x00a5,     -1,     -1,     -1,     -1, 0x0099,     -1,     -1,     -1,     -1,     -1, 0x009a,     -1,     -1, 0x00e1,
01534         /* U+00e0 */ 0x0085, 0x00a0, 0x0083,     -1, 0x0084, 0x0086, 0x0091, 0x0087, 0x008a, 0x0082, 0x0088, 0x0089, 0x008d, 0x00a1, 0x008c, 0x008b,
01535         /* U+00f0 */     -1, 0x00a4, 0x0095, 0x00a2, 0x0093,     -1, 0x0094, 0x00f6,     -1, 0x0097, 0x00a3, 0x0096, 0x0081,     -1,     -1, 0x0098,
01536 };
01537 
// Unicode -> CP 437 byte table for the code points U+0390..U+03CF (four rows
// of 16, see the row labels). Entry i belongs to code point U+0390 + i and
// holds the corresponding CP 437 byte.
// -1 presumably means the code point cannot be represented -- confirm
// against the encoder, which is not visible here.
01538 const int TEncoding_CP437::fromUnicodeTable2[4 * 16] = {
01539         /* U+0390 */     -1,     -1,     -1, 0x00e2,     -1,     -1,     -1,     -1, 0x00e9,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01540         /* U+03a0 */     -1,     -1,     -1, 0x00e4,     -1,     -1, 0x00e8,     -1,     -1, 0x00ea,     -1,     -1,     -1,     -1,     -1,     -1,
01541         /* U+03b0 */     -1, 0x00e0,     -1,     -1, 0x00eb, 0x00ee,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01542         /* U+03c0 */ 0x00e3,     -1,     -1, 0x00e5, 0x00e7,     -1, 0x00ed,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01543 };
01544 
// Unicode -> CP 437 byte table for the code points U+2210..U+226F (six rows
// of 16, see the row labels). Entry i belongs to code point U+2210 + i and
// holds the corresponding CP 437 byte. Rows tagged "blank" contain no
// mapping at all and only keep the index arithmetic contiguous.
// -1 presumably means the code point cannot be represented -- confirm
// against the encoder, which is not visible here.
01545 const int TEncoding_CP437::fromUnicodeTable3[6 * 16] = {
01546         /* U+2210 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00f9, 0x00fb,     -1,     -1,     -1, 0x00ec,     -1,
01547         /* U+2220 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00ef,     -1,     -1,     -1,     -1,     -1,     -1,
01548         /* U+2230 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1 /* blank */,
01549         /* U+2240 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00f7,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01550         /* U+2250 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1 /* blank */,
01551         /* U+2260 */     -1, 0x00f0,     -1,     -1, 0x00f3, 0x00f2,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01552 };
01553 
// Unicode -> CP 437 byte table for the code points U+2500..U+25AF (11 rows
// of 16, see the row labels). Entry i belongs to code point U+2500 + i and
// holds the corresponding CP 437 byte. Rows tagged "blank" contain no
// mapping at all and only keep the index arithmetic contiguous.
// -1 presumably means the code point cannot be represented -- confirm
// against the encoder, which is not visible here.
01554 const int TEncoding_CP437::fromUnicodeTable4[11 * 16] = {
01555         /* U+2500 */ 0x00c4,     -1, 0x00b3,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00da,     -1,     -1,     -1,
01556         /* U+2510 */ 0x00bf,     -1,     -1,     -1, 0x00c0,     -1,     -1,     -1, 0x00d9,     -1,     -1,     -1, 0x00c3,     -1,     -1,     -1,
01557         /* U+2520 */     -1,     -1,     -1,     -1, 0x00b4,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00c2,     -1,     -1,     -1,
01558         /* U+2530 */     -1,     -1,     -1,     -1, 0x00c1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00c5,     -1,     -1,     -1,
01559         /* U+2540 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1 /* blank */,
01560         /* U+2550 */ 0x00cd, 0x00ba, 0x00d5, 0x00d6, 0x00c9, 0x00b8, 0x00b7, 0x00bb, 0x00d4, 0x00d3, 0x00c8, 0x00be, 0x00bd, 0x00bc, 0x00c6, 0x00c7,
01561         /* U+2560 */ 0x00cc, 0x00b5, 0x00b6, 0x00b9, 0x00d1, 0x00d2, 0x00cb, 0x00cf, 0x00d0, 0x00ca, 0x00d8, 0x00d7, 0x00ce,     -1,     -1,     -1,
01562         /* U+2570 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1 /* blank */,
01563         /* U+2580 */ 0x00df,     -1,     -1,     -1, 0x00dc,     -1,     -1,     -1, 0x00db,     -1,     -1,     -1, 0x00dd,     -1,     -1,     -1,
01564         /* U+2590 */ 0x00de, 0x00b0, 0x00b1, 0x00b2,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01565         /* U+25a0 */ 0x00fe,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1
01566 };
01567 //      /* U+0190 */     -1,     -1, 0x009f,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01568 //      /* U+2070 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00fc,
01569 //      /* U+20a0 */     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x009e,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01570 //      /* U+2310 */ 0x00a9,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01571 //      /* U+2320 */ 0x00f4, 0x00f5,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01572 
01573 //-----------------------------------------------------------------------------
01574 // CP 852
01575 //-----------------------------------------------------------------------------
01576 
// CP 852 byte -> Unicode table for the bytes 0x80..0xFF (eight rows of 16,
// labelled by the first byte of each row). Entry i holds the Unicode code
// point for byte 0x80 + i. Every entry in this table is populated.
01577 const int TEncoding_CP852::toUnicodeTable[8 * 16] = {
01578         /* 0x80 */ 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x016f, 0x0107, 0x00e7, 0x0142, 0x00eb, 0x0150, 0x0151, 0x00ee, 0x0179, 0x00c4, 0x0106,
01579         /* 0x90 */ 0x00c9, 0x0139, 0x013a, 0x00f4, 0x00f6, 0x013d, 0x013e, 0x015a, 0x015b, 0x00d6, 0x00dc, 0x0164, 0x0165, 0x0141, 0x00d7, 0x010d,
01580         /* 0xa0 */ 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x0104, 0x0105, 0x017d, 0x017e, 0x0118, 0x0119, 0x00ac, 0x017a, 0x010c, 0x015f, 0x00ab, 0x00bb,
01581         /* 0xb0 */ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00c1, 0x00c2, 0x011a, 0x015e, 0x2563, 0x2551, 0x2557, 0x255d, 0x017b, 0x017c, 0x2510,
01582         /* 0xc0 */ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x0102, 0x0103, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,
01583         /* 0xd0 */ 0x0111, 0x0110, 0x010e, 0x00cb, 0x010f, 0x0147, 0x00cd, 0x00ce, 0x011b, 0x2518, 0x250c, 0x2588, 0x2584, 0x0162, 0x016e, 0x2580,
01584         /* 0xe0 */ 0x00d3, 0x00df, 0x00d4, 0x0143, 0x0144, 0x0148, 0x0160, 0x0161, 0x0154, 0x00da, 0x0155, 0x0170, 0x00fd, 0x00dd, 0x0163, 0x00b4,
01585         /* 0xf0 */ 0x00ad, 0x02dd, 0x02db, 0x02c7, 0x02d8, 0x00a7, 0x00f7, 0x00b8, 0x00b0, 0x00a8, 0x02d9, 0x0171, 0x0158, 0x0159, 0x25a0, 0x00a0
01586 };
01587 
// Unicode -> CP 852 byte table for the code points U+00A0..U+017F (14 rows
// of 16, see the row labels). Entry i belongs to code point U+00A0 + i and
// holds the corresponding CP 852 byte. The row tagged "blank" contains no
// mapping at all and only keeps the index arithmetic contiguous.
// -1 presumably means the code point cannot be represented -- confirm
// against the encoder, which is not visible here.
01588 const int TEncoding_CP852::fromUnicodeTable1[14 * 16] = {
01589         /* U+00a0 */ 0x00ff,     -1,     -1,     -1, 0x00cf,     -1,     -1, 0x00f5, 0x00f9,     -1,     -1, 0x00ae, 0x00aa, 0x00f0,     -1,     -1,
01590         /* U+00b0 */ 0x00f8,     -1,     -1,     -1, 0x00ef,     -1,     -1,     -1, 0x00f7,     -1,     -1, 0x00af,     -1,     -1,     -1,     -1,
01591         /* U+00c0 */     -1, 0x00b5, 0x00b6,     -1, 0x008e,     -1,     -1, 0x0080,     -1, 0x0090,     -1, 0x00d3,     -1, 0x00d6, 0x00d7,     -1,
01592         /* U+00d0 */     -1,     -1,     -1, 0x00e0, 0x00e2,     -1, 0x0099, 0x009e,     -1,     -1, 0x00e9,     -1, 0x009a, 0x00ed,     -1, 0x00e1,
01593         /* U+00e0 */     -1, 0x00a0, 0x0083,     -1, 0x0084,     -1,     -1, 0x0087,     -1, 0x0082,     -1, 0x0089,     -1, 0x00a1, 0x008c,     -1,
01594         /* U+00f0 */     -1,     -1,     -1, 0x00a2, 0x0093,     -1, 0x0094, 0x00f6,     -1,     -1, 0x00a3,     -1, 0x0081, 0x00ec,     -1,     -1,
01595         /* U+0100 */     -1,     -1, 0x00c6, 0x00c7, 0x00a4, 0x00a5, 0x008f, 0x0086,     -1,     -1,     -1,     -1, 0x00ac, 0x009f, 0x00d2, 0x00d4,
01596         /* U+0110 */ 0x00d1, 0x00d0,     -1,     -1,     -1,     -1,     -1,     -1, 0x00a8, 0x00a9, 0x00b7, 0x00d8,     -1,     -1,     -1,     -1,
01597         /* U+0120 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1 /* blank */,
01598         /* U+0130 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x0091, 0x0092,     -1,     -1, 0x0095, 0x0096,     -1,
01599         /* U+0140 */     -1, 0x009d, 0x0088, 0x00e3, 0x00e4,     -1,     -1, 0x00d5, 0x00e5,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01600         /* U+0150 */ 0x008a, 0x008b,     -1,     -1, 0x00e8, 0x00ea,     -1,     -1, 0x00fc, 0x00fd, 0x0097, 0x0098,     -1,     -1, 0x00b8, 0x00ad,
01601         /* U+0160 */ 0x00e6, 0x00e7, 0x00dd, 0x00ee, 0x009b, 0x009c,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00de, 0x0085,
01602         /* U+0170 */ 0x00eb, 0x00fb,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x008d, 0x00ab, 0x00bd, 0x00be, 0x00a6, 0x00a7,     -1
01603 };
01604 
// Unicode -> CP 852 byte table for the 32 code points U+02C0..U+02DF (two
// rows of 16, see the row labels). Entry i belongs to code point U+02C0 + i
// and holds the corresponding CP 852 byte.
// -1 presumably means "no byte for this code point" -- confirm against the
// lookup code, which is outside this part of the file.
01605 const int TEncoding_CP852::fromUnicodeTable2[2* 16] = {
01606         /* U+02c0 */     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00f3,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01607         /* U+02d0 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00f4, 0x00fa,     -1, 0x00f2,     -1, 0x00f1,     -1,     -1
01608 };
01609 
// Unicode -> CP 852 byte table for the code points U+2500..U+25AF (11 rows
// of 16, see the row labels). Entry i belongs to code point U+2500 + i and
// holds the corresponding CP 852 byte. Rows tagged "blank" contain no
// mapping at all and only keep the index arithmetic contiguous.
// -1 presumably means the code point cannot be represented -- confirm
// against the encoder, which is not visible here.
01610 const int TEncoding_CP852::fromUnicodeTable3[11 * 16] = {
01611         /* U+2500 */ 0x00c4,     -1, 0x00b3,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00da,     -1,     -1,     -1,
01612         /* U+2510 */ 0x00bf,     -1,     -1,     -1, 0x00c0,     -1,     -1,     -1, 0x00d9,     -1,     -1,     -1, 0x00c3,     -1,     -1,     -1,
01613         /* U+2520 */     -1,     -1,     -1,     -1, 0x00b4,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00c2,     -1,     -1,     -1,
01614         /* U+2530 */     -1,     -1,     -1,     -1, 0x00c1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00c5,     -1,     -1,     -1,
01615         /* U+2540 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1 /* blank */,
01616         /* U+2550 */ 0x00cd, 0x00ba,     -1,     -1, 0x00c9,     -1,     -1, 0x00bb,     -1,     -1, 0x00c8,     -1,     -1, 0x00bc,     -1,     -1,
01617         /* U+2560 */ 0x00cc,     -1,     -1, 0x00b9,     -1,     -1, 0x00cb,     -1,     -1, 0x00ca,     -1,     -1, 0x00ce,     -1,     -1,     -1,
01618         /* U+2570 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1 /* blank */,
01619         /* U+2580 */ 0x00df,     -1,     -1,     -1, 0x00dc,     -1,     -1,     -1, 0x00db,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01620         /* U+2590 */     -1, 0x00b0, 0x00b1, 0x00b2,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01621         /* U+25a0 */ 0x00fe,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1
01622 };
01623 
01624 //-----------------------------------------------------------------------------
01625 // Windows-1250
01626 //-----------------------------------------------------------------------------
01627 
// Windows-1250 byte -> Unicode table for the bytes 0x80..0xFF (eight rows of
// 16, labelled by the first byte of each row). Entry i holds the Unicode
// code point for byte 0x80 + i.
// -1 presumably marks byte values that have no character assigned in this
// code page -- confirm against the decoder, which is not visible here.
01628 const int TEncoding_CP1250::toUnicodeTable[8 * 16] = {
01629         /* 0x80 */ 0x20ac,     -1, 0x201a,     -1, 0x201e, 0x2026, 0x2020, 0x2021,     -1, 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179,
01630         /* 0x90 */     -1, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,     -1, 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a,
01631         /* 0xa0 */ 0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b,
01632         /* 0xb0 */ 0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c,
01633         /* 0xc0 */ 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
01634         /* 0xd0 */ 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
01635         /* 0xe0 */ 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
01636         /* 0xf0 */ 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9
01637 };
01638 
// Unicode -> Windows-1250 byte table for the code points U+00A0..U+017F
// (14 rows of 16, see the row labels). Entry i belongs to code point
// U+00A0 + i and holds the corresponding Windows-1250 byte. The row tagged
// "blank" contains no mapping at all and only keeps the index arithmetic
// contiguous.
// -1 presumably means the code point cannot be represented -- confirm
// against the encoder, which is not visible here.
01639 const int TEncoding_CP1250::fromUnicodeTable1[14 * 16] = {
01640         /* U+00a0 */ 0x00a0,     -1,     -1,     -1, 0x00a4,     -1, 0x00a6, 0x00a7, 0x00a8, 0x00a9,     -1, 0x00ab, 0x00ac, 0x00ad, 0x00ae,     -1,
01641         /* U+00b0 */ 0x00b0, 0x00b1,     -1,     -1, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8,     -1,     -1, 0x00bb,     -1,     -1,     -1,     -1,
01642         /* U+00c0 */     -1, 0x00c1, 0x00c2,     -1, 0x00c4,     -1,     -1, 0x00c7,     -1, 0x00c9,     -1, 0x00cb,     -1, 0x00cd, 0x00ce,     -1,
01643         /* U+00d0 */     -1,     -1,     -1, 0x00d3, 0x00d4,     -1, 0x00d6, 0x00d7,     -1,     -1, 0x00da,     -1, 0x00dc, 0x00dd,     -1, 0x00df,
01644         /* U+00e0 */     -1, 0x00e1, 0x00e2,     -1, 0x00e4,     -1,     -1, 0x00e7,     -1, 0x00e9,     -1, 0x00eb,     -1, 0x00ed, 0x00ee,     -1,
01645         /* U+00f0 */     -1,     -1,     -1, 0x00f3, 0x00f4,     -1, 0x00f6, 0x00f7,     -1,     -1, 0x00fa,     -1, 0x00fc, 0x00fd,     -1,     -1,
01646         /* U+0100 */     -1,     -1, 0x00c3, 0x00e3, 0x00a5, 0x00b9, 0x00c6, 0x00e6,     -1,     -1,     -1,     -1, 0x00c8, 0x00e8, 0x00cf, 0x00ef,
01647         /* U+0110 */ 0x00d0, 0x00f0,     -1,     -1,     -1,     -1,     -1,     -1, 0x00ca, 0x00ea, 0x00cc, 0x00ec,     -1,     -1,     -1,     -1,
01648         /* U+0120 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1 /* blank */,
01649         /* U+0130 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00c5, 0x00e5,     -1,     -1, 0x00bc, 0x00be,     -1,
01650         /* U+0140 */     -1, 0x00a3, 0x00b3, 0x00d1, 0x00f1,     -1,     -1, 0x00d2, 0x00f2,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01651         /* U+0150 */ 0x00d5, 0x00f5,     -1,     -1, 0x00c0, 0x00e0,     -1,     -1, 0x00d8, 0x00f8, 0x008c, 0x009c,     -1,     -1, 0x00aa, 0x00ba,
01652         /* U+0160 */ 0x008a, 0x009a, 0x00de, 0x00fe, 0x008d, 0x009d,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00d9, 0x00f9,
01653         /* U+0170 */ 0x00db, 0x00fb,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x008f, 0x009f, 0x00af, 0x00bf, 0x008e, 0x009e,     -1,
01654 };
01655 
// Unicode -> Windows-1250 byte table for the 32 code points U+02C0..U+02DF
// (two rows of 16, see the row labels). Entry i belongs to code point
// U+02C0 + i and holds the corresponding Windows-1250 byte.
// -1 presumably means "no byte for this code point" -- confirm against the
// lookup code, which is outside this part of the file.
01656 const int TEncoding_CP1250::fromUnicodeTable2[2 * 16] = {
01657         /* U+02c0 */     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00a1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01658         /* U+02d0 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x00a2, 0x00ff,     -1, 0x00b2,     -1, 0x00bd,     -1,     -1,
01659 };
01660 
// Unicode -> Windows-1250 byte table for the code points U+2010..U+203F
// (three rows of 16, see the row labels). Entry i belongs to code point
// U+2010 + i and holds the corresponding Windows-1250 byte.
// -1 presumably means the code point cannot be represented -- confirm
// against the encoder, which is not visible here.
01661 const int TEncoding_CP1250::fromUnicodeTable3[3 * 16] = {
01662         /* U+2010 */     -1,     -1,     -1, 0x0096, 0x0097,     -1,     -1,     -1, 0x0091, 0x0092, 0x0082,     -1, 0x0093, 0x0094, 0x0084,     -1,
01663         /* U+2020 */ 0x0086, 0x0087, 0x0095,     -1,     -1,     -1, 0x0085,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
01664         /* U+2030 */ 0x0089,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x008b, 0x009b,     -1,     -1,     -1,     -1,     -1,
01665 };
01666 //      /* U+20a0 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1, 0x0080,     -1,     -1,     -1,
01667 //      /* U+2120 */     -1,     -1, 0x0099,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1
01668 
01669 //-----------------------------------------------------------------------------
01670 // YU-ASCII
01671 //-----------------------------------------------------------------------------
01672 
// Two parallel 10-element arrays describing the YU-ASCII substitutions:
// uniChars[i] is a Unicode code point and yuAsciiChars[i] is the 7-bit
// character code listed in the same column. The comment line above the
// arrays names each letter; the one below shows the ASCII character whose
// code is used. How the pairs are consumed is decided by code outside this
// part of the file.
01673 //                                                C acute c acute C caron c caron S caron s caron Z caron z caron D stroke d stroke
01674 const int TEncoding_YuAscii::uniChars[10] =     {  0x106,  0x107,  0x10c,  0x10d,  0x160,  0x161,  0x17d,  0x17e,   0x110,  0x111  };
01675 const int TEncoding_YuAscii::yuAsciiChars[10] = {   0x5d,   0x7d,   0x5e,   0x7e,   0x5b,   0x7b,   0x40,   0x60,    0x5c,   0x7c  };
01676 //                                                   ']'     '}'     '^'    '~'     '['     '{'     '@'     '`'      '\\'    '|'
01677 
01678 
01679 //-----------------------------------------------------------------------------
01680 // TUnicode - codec registry
01681 //-----------------------------------------------------------------------------
01682 
01683 void TUnicode::InitCodecs()
01684 {
01685         ClrCodecs();
01686         RegisterCodec("ISO-8859-1 ISO_8859-1 ISO_8859-1:1987 ISO-IR-100 CP819 IBM819 LATIN1 L1 csISOLatin1 ISO8859-1 ISO8859_1 CP28591", TCodecBase::New<TCodec_ISO8859_1>());
01687         RegisterCodec("ISO-8859-2 ISO_8859-2 ISO_8859-2:1987 ISO-IR-101 LATIN2 L2 csISOLatin2 ISO8859-2 ISO8859_2 CP28592", TCodecBase::New<TCodec_ISO8859_2>());
01688         RegisterCodec("ISO-8859-3 ISO_8859-3 ISO_8859-3:1988 ISO-IR-109 LATIN3 L3 csISOLatin3 ISO8859-3 ISO8859_3 CP28593", TCodecBase::New<TCodec_ISO8859_3>());
01689         RegisterCodec("ISO-8859-4 ISO_8859-4 ISO_8859-4:1988 ISO-IR-110 LATIN4 L4 csISOLatin4 ISO8859-4 ISO8859_4 CP28594", TCodecBase::New<TCodec_ISO8859_4>());
01690         RegisterCodec("YUASCII YU-ASCII YU_ASCII", TCodecBase::New<TCodec_YuAscii>());
01691         RegisterCodec("CP1250 Windows-1250 MS-EE", TCodecBase::New<TCodec_CP1250>());
01692         RegisterCodec("CP852 cp852_DOSLatin2 DOSLatin2", TCodecBase::New<TCodec_CP852>());
01693         RegisterCodec("CP437 cp437_DOSLatinUS DOSLatinUS", TCodecBase::New<TCodec_CP437>());
01694 }
01695 
01696 void TUnicode::EncodeUtf8(const uint& c, TChA& dest) {
01697         if (c > 0x10ffff) {
01698                 throw TExcept::New(TStr::Fmt("Unkown Unicode character %u", c)); }
01699         if (c < 0x80u)
01700                 dest.AddCh(char(c & 0xffu));
01701         else if (c < 0x800u) {
01702                 dest.AddCh(char(TUniCodec::_1100_0000 | ((c >> 6) & TUniCodec::_0001_1111)));
01703                 dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
01704         else if (c < 0x10000u) {
01705                 dest.AddCh(char(TUniCodec::_1110_0000 | ((c >> 12) & TUniCodec::_0000_1111)));
01706                 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111)));
01707                 dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
01708         else if (c < 0x200000u) {
01709                 dest.AddCh(char(TUniCodec::_1111_0000 | ((c >> 18) & TUniCodec::_0000_0111)));
01710                 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 12) & TUniCodec::_0011_1111)));
01711                 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111)));
01712                 dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
01713         else if (c < 0x4000000u) {
01714                 dest.AddCh(char(TUniCodec::_1111_1000 | ((c >> 24) & TUniCodec::_0000_0011)));
01715                 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 18) & TUniCodec::_0011_1111)));
01716                 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 12) & TUniCodec::_0011_1111)));
01717                 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111)));
01718                 dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
01719         else {
01720                 dest.AddCh(char(TUniCodec::_1111_1100 | ((c >> 30) & TUniCodec::_0000_0011)));
01721                 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 24) & TUniCodec::_0011_1111)));
01722                 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 18) & TUniCodec::_0011_1111)));
01723                 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 12) & TUniCodec::_0011_1111)));
01724                 dest.AddCh(char(TUniCodec::_1000_0000 | ((c >> 6) & TUniCodec::_0011_1111)));
01725                 dest.AddCh(char(TUniCodec::_1000_0000 | (c & TUniCodec::_0011_1111))); }
01726 }
01727 
01728 TStr TUnicode::EncodeUtf8(const uint& Ch) {
01729         TChA ChA; EncodeUtf8(Ch, ChA); return ChA;
01730 }