dd/d90/unicode_8h_source.html

00001 #include "bd.h"
00002
00003 //#ifndef unicode_h
00004 //#define unicode_h
00005
00007 // Includes
00008 //#include "base.h"
00009 #include <new>
00010
// Index type the Unicode classes in this file cast size_t positions to
// before subscripting a source vector (see the TVecIdx typedefs below).
typedef int TUniVecIdx;
00012
00013 //-----------------------------------------------------------------------------
00014 // TUniCodec -- an UTF-8 and UTF-16 Encoder/Decoder
00015 //-----------------------------------------------------------------------------
00016
00017 // Error handling modes for the TUniCodec class.
// Selects what a codec does when it runs into an error in its input.
// The numeric values are fixed: 0, 1, 2, 3 in declaration order.
typedef enum TUnicodeErrorHandling_
{
        uehIgnore = 0, // skip the error silently; nothing is added to the output vector
        uehThrow,      // throw a TUnicodeException
        uehReplace,    // add the replacement character to the output vector
        uehAbort       // stop the encoding/decoding process immediately
} TUnicodeErrorHandling;
00027
00028 class TUnicodeException
00029 {
00030 public:
00031         TStr message;  // error message
00032         size_t srcIdx; // the position in the source vector where the error occurred
00033         int srcChar;   // the source character at the position srcIdx
00034         TUnicodeException(size_t srcIdx_, int srcChar_, const TStr& message_) :
00035                 message(message_), srcIdx(srcIdx_), srcChar(srcChar_) { }
00036 };
00037
// Byte order selector used by the UTF-16 routines.
// The numeric values are fixed: 0, 1, 2 in declaration order.
typedef enum TUniByteOrder_
{
        boMachineEndian = 0, // whatever byte order the local machine uses
        boLittleEndian,      // least significant byte first
        boBigEndian          // most significant byte first
} TUniByteOrder;
00045
// How the UTF-16 decoder treats a byte-order mark (BOM) at the start of its input.
// The numeric values are fixed: 0, 1, 2 in declaration order.
typedef enum TUtf16BomHandling_
{
        // A BOM, if present, determines the byte order; without one the
        // default byte order is used.
        bomAllowed = 0,
        // A BOM, if present, determines the byte order; without one an
        // error is reported.
        bomRequired,
        // The default byte order is always used; a BOM, if present, is
        // treated like any other character.
        bomIgnored
} TUtf16BomHandling;
00053
00054 class TUniCodec
00055 {
00056 public:
00057         // 0xfffd is defined as the replacement character by the Unicode standard.
00058         // By default, it is rendered as a question mark inside a diamond: "<?>".
00059         enum { DefaultReplacementChar = 0xfffd };
00060
00061         // The replacement character is inserted into the destination vector
00062         // if an error occurs in the source vector.  By default, this is set
00063         // to DefaultReplacementChar.
00064         int replacementChar;
00065         // The error handling mode.
00066         TUnicodeErrorHandling errorHandling;
00067         // There are a number of situations where there is strictly speaking an error in
00068         // the source data although it can still be decoded in a reasonably meaningful way.
00069         // If strict == true, these situations are treated as errors.  Examples:
00070         // - when decoding UTF-8:
00071         //   - a codepoint represented by more bytes than necessary (e.g. one of the characters 0..127
00072         //     encoded as a two-byte sequence)
00073         //   - a codepoint > 0x10ffff
00074         // - when decoding UTF-16:
00075         //   - a codepoint from the range reserved for the second character of a surrogate pair
00076         //     is not preceded by a codepoint from the range reserved for the first character of a surrogate pair
00077         // - when encoding UTF-8:
00078         //   - a codepoint > 0x10ffff
00079         // - when encoding UTF-16:
00080         //   - a codepoint from the range reserved from the second character of a surrogate pair
00081         //     [note that a codepoint > 0x10ffff, or from the range reserved for the first character of a
00082         //     surrogate pair, is always an error, even with strict == false]
00083         bool strict;
00084         // skipBom == true means: If a byte-order-mark (0xfffe or 0xfeff) occurs at the beginning
00085         // of the source vector, it is skipped (when decoding).
00086         // - Note: a BOM is not really useful in UTF-8 encoded data.  However, the .NET UTF8Encoding
00087         //   emits 0xfeff by default as a kind of preamble.  It gets encoded as 3 bytes, ef bb bf,
00088         //   and can be helpful to make the data easier to recognize as UTF-8 encoded data.
00089         bool skipBom;
00090
00091         TUniCodec() : replacementChar(DefaultReplacementChar), errorHandling(uehIgnore), strict(false), skipBom(true)
00092         {
00093         }
00094
00095         TUniCodec(TUnicodeErrorHandling errorHandling_, bool strict_, int replacementChar_, bool skipBom_) :
00096                 replacementChar(replacementChar_), errorHandling(errorHandling_), strict(strict_), skipBom(skipBom_)
00097         {
00098         }
00099
00100 protected:
00101         enum {
00102 #define DefineByte(b7, b6, b5, b4, b3, b2, b1, b0) _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0
00103                 DefineByte(1, 0, 0, 0, 0, 0, 0, 0),
00104                 DefineByte(1, 1, 0, 0, 0, 0, 0, 0),
00105                 DefineByte(1, 1, 1, 0, 0, 0, 0, 0),
00106                 DefineByte(1, 1, 1, 1, 0, 0, 0, 0),
00107                 DefineByte(1, 1, 1, 1, 1, 0, 0, 0),
00108                 DefineByte(1, 1, 1, 1, 1, 1, 0, 0),
00109                 DefineByte(1, 1, 1, 1, 1, 1, 1, 0),
00110                 DefineByte(0, 0, 1, 1, 1, 1, 1, 1),
00111                 DefineByte(0, 0, 0, 1, 1, 1, 1, 1),
00112                 DefineByte(0, 0, 0, 0, 1, 1, 1, 1),
00113                 DefineByte(0, 0, 0, 0, 0, 1, 1, 1),
00114                 DefineByte(0, 0, 0, 0, 0, 0, 1, 1)
00115 #undef DefineByte
00116         };
00117
00118         typedef TUniVecIdx TVecIdx;
00119         //friend class TUniChDb;
00120         friend class TUniCaseFolding;
00121         friend class TUnicode;
00122
00123 public:
00124
00125         //-----------------------------------------------------------------------
00126         // UTF-8
00127         //-----------------------------------------------------------------------
00128
00129         // Returns the number of characters that have been successfully decoded.
00130         // This does not include any replacement characters that may have been inserted into 'dest'.
00131         template<typename TSrcVec, typename TDestCh>
00132         size_t DecodeUtf8(
00133                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00134                 TVec<TDestCh>& dest, const bool clrDest = true) const;
00135         template<typename TSrcVec, typename TDestCh>
00136         size_t DecodeUtf8(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return DecodeUtf8(src, 0, src.Len(), dest, clrDest); }
00137
00138         // Returns the number of characters that have been successfully encoded.
00139         // This does not include any replacement characters that may have been inserted into 'dest'.
00140         template<typename TSrcVec, typename TDestCh>
00141         size_t EncodeUtf8(
00142                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00143                 TVec<TDestCh>& dest, const bool clrDest = true) const;
00144         template<typename TSrcVec, typename TDestCh>
00145         size_t EncodeUtf8(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return EncodeUtf8(src, 0, src.Len(), dest, clrDest); }
00146
00147         // The following wrappers around the UTF-8 encoder return a TStr containing
00148         // the UTF-8-encoded version of the input string.
00149         template<typename TSrcVec> TStr EncodeUtf8Str(const TSrcVec& src, size_t srcIdx, const size_t srcCount) const { TVec<char> temp; EncodeUtf8(src, srcIdx, srcCount, temp); TStr retVal = &(temp[0]); return retVal; }
00150         template<typename TSrcVec> TStr EncodeUtf8Str(const TSrcVec& src) const { TVec<char> temp; EncodeUtf8(src, temp); temp.Add(0); TStr retVal = &(temp[0]); return retVal; }
00151
00152         //-----------------------------------------------------------------------
00153         // UTF-16 Decoder
00154         //-----------------------------------------------------------------------
00155
00156 protected:
00157         enum {
00158                 Utf16FirstSurrogate = 0xd800,
00159                 Utf16SecondSurrogate = 0xdc00
00160         };
00161
00162         static bool IsMachineLittleEndian();
00163
00164 public:
00165
00166         // Returns the number of characters that have been successfully decoded.
00167         // This does not include any replacement characters that may have been inserted into 'dest'.
00168         // Each element of 'src' is assumed to contain one byte of data.
00169         // srcCount must be even (though srcIdx doesn't need to be).
00170         template<typename TSrcVec, typename TDestCh>
00171         size_t DecodeUtf16FromBytes(
00172                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00173                 TVec<TDestCh>& dest, const bool clrDest,
00174                 const TUtf16BomHandling bomHandling = bomAllowed,
00175                 const TUniByteOrder defaultByteOrder = boMachineEndian) const;
00176
00177         // Here, each element of 'src' is treated as a 16-bit word.  The byte-order settings
00178         // are used to determine if the two bytes of each word should be swapped before further
00179         // processing.  For example, if a BOM is present, it must have the value 0xfeff; if it
00180         // actually has the value 0xfffe, this means that the two bytes of each word must be swapped.
00181         // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the
00182         // beginning of the source data is used to determine the "original" byte order of the data;
00183         // if this doesn't match the byte order of the local machine, the two bytes of each word will
00184         // be swapped during the decoding process.
00185         template<typename TSrcVec, typename TDestCh>
00186         size_t DecodeUtf16FromWords(
00187                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00188                 TVec<TDestCh>& dest, bool clrDest,
00189                 const TUtf16BomHandling bomHandling = bomAllowed,
00190                 const TUniByteOrder defaultByteOrder = boMachineEndian) const;
00191
00192         //-----------------------------------------------------------------------
00193         // UTF-16 Encoder
00194         //-----------------------------------------------------------------------
00195
00196         // Returns the number of characters that have been successfully encoded.
00197         // This does not include any replacement characters that may have been inserted into 'dest'.
00198         //
00199         // Notes:
00200         // - UTF-16 *cannot* encode characters above 0x10ffff, so their presence is always
00201         //   treated as an error, regardless of the value of 'strict'.
00202         // - Characters from the range Utf16FirstSurrogate through Utf16FirstSurrogate + 1023
00203         //   cannot be encoded by UTF-16 either, as they would be misinterpreted during decoding
00204         //   as the first character of a surrogate pair.
00205         // - Characters from the range Utf16SecondSurrogate through Utf16SecondSurrogate + 1023
00206         //   can be encoded in principle; however, if strict == true, they are treated as errors.
00207         template<typename TSrcVec, typename TDestCh>
00208         size_t EncodeUtf16ToWords(
00209                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00210                 TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
00211                 const TUniByteOrder destByteOrder = boMachineEndian) const;
00212
00213         template<typename TSrcVec, typename TDestCh>
00214         size_t EncodeUtf16ToBytes(
00215                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00216                 TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
00217                 const TUniByteOrder destByteOrder = boMachineEndian) const;
00218
00219         //-----------------------------------------------------------------------
00220         // Helper declarations for the test drivers
00221         //-----------------------------------------------------------------------
00222
00223 protected:
00224
00225         static uint GetRndUint(TRnd& rnd);
00226         static uint GetRndUint(TRnd& rnd, uint minVal, uint maxVal);
00227
00228         //-----------------------------------------------------------------------
00229         // UTF-8 Test Driver
00230         //-----------------------------------------------------------------------
00231
00232 protected:
00233         void TestUtf8(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest, FILE *f);
00234         // Generates a random UTF-8-encoded stream according to the specifications in 'testCaseDesc',
00235         // then calls TestUtf8 to make sure that DecodeUtf8 reacts as expected.
00236         void TestDecodeUtf8(TRnd& rnd, const TStr& testCaseDesc);
00237 public:
00238         void TestUtf8();
00239
00240         //-----------------------------------------------------------------------
00241         // UTF-16 Test Driver
00242         //-----------------------------------------------------------------------
00243
00244 protected:
00245         void WordsToBytes(const TIntV& src, TIntV& dest);
00246         void TestUtf16(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest,
00247                 // Note: insertBom is only used with the encoder.  When encoding, 'defaultByteOrder' is used as the destination byte order.
00248                 const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom,
00249                 FILE *f);
00250         static inline int SwapBytes(int x) {
00251                 return ((x >> 8) & 0xff) | ((x & 0xff) << 8); }
00252         // Generates a random UTF-16-encoded stream according to the specifications in 'testCaseDesc',
00253         // then calls TestUtf16 to make sure that DecodeUtf16 reacts as expected.
00254         void TestDecodeUtf16(TRnd& rnd, const TStr& testCaseDesc,
00255                 const TUtf16BomHandling bomHandling,
00256                 const TUniByteOrder defaultByteOrder,
00257                 const bool insertBom);
00258 public:
00259         void TestUtf16();
00260
00261 };
00262
00263 //-----------------------------------------------------------------------------
00264 // Case folding
00265 //-----------------------------------------------------------------------------
00266 // Note: there's no need to access this class directly.
00267 // Use TUniChDb::GetCaseFolded() instead.
00268
00269 typedef THash<TInt, TIntV> TIntIntVH;
00270
00271 class TUniCaseFolding
00272 {
00273 protected:
00274         TIntH cfCommon, cfSimple, cfTurkic;
00275         TIntIntVH cfFull;
00276
00277         template<typename TSrcDat, typename TDestDat>
00278         inline static void AppendVector(const TVec<TSrcDat>& src, TVec<TDestDat>& dest) {
00279                 for (int i = 0; i < src.Len(); i++) dest.Add(src[i]); }
00280         friend class TUniChDb;
00281         typedef TUniVecIdx TVecIdx;
00282
00283 public:
00284         TUniCaseFolding() { }
00285         explicit TUniCaseFolding(TSIn& SIn) : cfCommon(SIn), cfSimple(SIn), cfTurkic(SIn), cfFull(SIn) { SIn.LoadCs(); }
00286         void Load(TSIn& SIn) { cfCommon.Load(SIn); cfSimple.Load(SIn); cfFull.Load(SIn); cfTurkic.Load(SIn); SIn.LoadCs(); }
00287         void Save(TSOut& SOut) const { cfCommon.Save(SOut); cfSimple.Save(SOut); cfFull.Save(SOut); cfTurkic.Save(SOut); SOut.SaveCs(); }
00288         void Clr() { cfCommon.Clr(); cfSimple.Clr(); cfFull.Clr(); cfTurkic.Clr(); }
00289         void LoadTxt(const TStr& fileName);
00290
00291         // Use 'turkic' when processing text in a Turkic language (tr, az).  This only affects the uppercase I and I-with-dot-above.
00292         template<typename TSrcVec, typename TDestCh>
00293         void Fold(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00294                 TVec<TDestCh>& dest, const bool clrDest, const bool full, const bool turkic) const
00295         {
00296                 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
00297                 {
00298                         int c = src[TVecIdx(srcIdx)], i; srcIdx++;
00299                         if (turkic && ((i = cfTurkic.GetKeyId(c)) >= 0)) { dest.Add(cfTurkic[i]); continue; }
00300                         if (full && ((i = cfFull.GetKeyId(c)) >= 0)) { AppendVector(cfFull[i], dest); continue; }
00301                         if ((! full) && ((i = cfSimple.GetKeyId(c)) >= 0)) { dest.Add(cfSimple[i]); continue; }
00302                         i = cfCommon.GetKeyId(c); if (i >= 0) dest.Add(cfCommon[i]); else dest.Add(c);
00303                 }
00304         }
00305
00306         template<typename TSrcVec>
00307         void FoldInPlace(TSrcVec& src, size_t srcIdx, const size_t srcCount, const bool turkic) const
00308         {
00309                 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
00310                 {
00311                         int c = src[TVecIdx(srcIdx)], i;
00312                         if (turkic && ((i = cfTurkic.GetKeyId(c)) >= 0)) { src[TVecIdx(srcIdx)] = cfTurkic[i]; continue; }
00313                         if ((i = cfSimple.GetKeyId(c)) >= 0) { src[TVecIdx(srcIdx)] = cfSimple[i]; continue; }
00314                         i = cfCommon.GetKeyId(c); if (i >= 0) src[TVecIdx(srcIdx)] = cfCommon[i];
00315                 }
00316         }
00317
00318 protected:
00319         void Test(const TIntV& src, const TIntV& expectedDest, const bool full, const bool turkic, FILE *f);
00320 public:
00321         void Test();
00322 };
00323
00324 //-----------------------------------------------------------------------------
00325 // TCodecBase -- an abstract base class for codecs
00326 //-----------------------------------------------------------------------------
00327
00328 class TCodecBase;
00329 typedef TPt<TCodecBase> PCodecBase;
00330 typedef TVec<PCodecBase> TCodecBaseV;
00331
00332 class TCodecBase
00333 {
00334 protected:
00335         TCRef CRef;
00336         friend class TPt<TCodecBase>;
00337 public:
00338         virtual ~TCodecBase() { }
00339
00340         template<class TCodecImpl>
00341         static PCodecBase New(); /* {
00342                 return new TCodecWrapper<TCodecImpl>(); } */
00343
00344         virtual TStr GetName() const = 0;
00345         virtual void Test() const { }
00346
00347         // Returns the number of characters that have been successfully decoded.
00348         // This does not include any replacement characters that may have been inserted into 'dest'.
00349         virtual size_t ToUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
00350         virtual size_t ToUnicode(const TStr& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
00351
00352         size_t ToUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
00353         size_t ToUnicode(const TStr& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
00354
00355         // Returns the number of characters that have been successfully encoded.
00356         // This does not include any replacement characters that may have been inserted into 'dest'.
00357         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
00358         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TChA& dest, const bool clrDest = true) const = 0;
00359         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const = 0;
00360
00361         size_t FromUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
00362         size_t FromUnicode(const TIntV& src, TChA& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
00363         size_t FromUnicode(const TIntV& src, TStr& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
00364 };
00365
00366 //-----------------------------------------------------------------------------
00367 // TCodecWrapper -- a descendant of TCodecBase; relies on a template
00368 // parameter class for the actual implementation of the codec.
00369 //-----------------------------------------------------------------------------
00370 // Thus, if you know in advance that you'll need ISO-8859-2, just use
00371 // T8BitCodec<TEncoding_ISO8859_2>.  If you don't know the encoding
00372 // in advance, use a PCodecBase pointing to a suitable specialization
00373 // of TCodecWrapper<...>.  You can use TUnicode::GetCodec(TStr& name)
00374 // to obtain a suitable pointer.
00375
00376 template<class TCodecImpl_>
00377 class TCodecWrapper : public TCodecBase
00378 {
00379 public:
00380         typedef TCodecImpl_ TCodecImpl;
00381         TCodecImpl impl;
00382 public:
00383
00384         virtual TStr GetName() const { return impl.GetName(); }
00385
00386         virtual void Test() const { impl.Test(); }
00387
00388         virtual size_t ToUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
00389                 return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); }
00390         virtual size_t ToUnicode(const TStr& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
00391                 return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); }
00392
00393         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
00394                 return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); }
00395         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TChA& dest, const bool clrDest = true) const {
00396                 return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); }
00397         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const {
00398                 TChA buf; size_t retVal = impl.FromUnicode(src, srcIdx, srcCount, buf, false);
00399                 if (clrDest) dest += buf.CStr(); else dest = buf.CStr();
00400                 return retVal; }
00401 };
00402
00403 template<class TCodecImpl>
00404 PCodecBase TCodecBase::New() {
00405   return new TCodecWrapper<TCodecImpl>();
00406 }
00407
00408 //-----------------------------------------------------------------------------
00409 // TVecElt -- a template for determining the type of a vector's elements
00410 //-----------------------------------------------------------------------------
00411
// Primary template: deliberately empty.  Only the specializations that follow
// provide the TVector/TElement typedefs and the Add() helper.
template<class TVector_>
class TVecElt
{
};
00416
00417 template<class TDat>
00418 class TVecElt<TVec<TDat> >
00419 {
00420 public:
00421         typedef TVec<TDat> TVector;
00422         typedef TDat TElement;
00423         static inline void Add(TVector& vector, const TElement& element) { vector.Add(element); }
00424 };
00425
00426 template<>
00427 class TVecElt<TChA>
00428 {
00429 public:
00430         typedef TChA TVector;
00431         typedef char TElement;
00432         static inline void Add(TVector& vector, const TElement& element) { vector += element; }
00433 };
00434
00435
00436 //-----------------------------------------------------------------------------
00437 // T8BitCodec -- a class for converting between 8-bit encodings and Unicode
00438 //-----------------------------------------------------------------------------
00439
00440 class TEncoding_ISO8859_1
00441 {
00442 public:
00443         static inline TStr GetName() { return "ISO-8859-1"; }
00444         static int ToUnicode(int c) { Assert(0 <= c && c <= 255); return c; }
00445         static int FromUnicode(int c) { if (0 <= c && c <= 255) return c; else return -1; }
00446 };
00447
00448 class TEncoding_ISO8859_2 // ISO Latin 2
00449 {
00450 public:
00451         static inline TStr GetName() { return "ISO-8859-2"; }
00452         static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16];
00453         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00454                 if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
00455         static int FromUnicode(int c) {
00456                 if (0 <= c && c < 0xa0) return c;
00457                 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
00458                 else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
00459                 else return -1; }
00460 };
00461
00462 class TEncoding_ISO8859_3
00463 {
00464 public:
00465         static inline TStr GetName() { return "ISO-8859-3"; }
00466         static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2];
00467         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00468                 if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
00469         static int FromUnicode(int c) {
00470                 if (0 <= c && c < 0xa0) return c;
00471                 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
00472                 else if (0x2d8 <= c && c < 0x2da) return fromUnicodeTable2[c - 0x2d8];
00473                 else return -1; }
00474 };
00475
00476 class TEncoding_ISO8859_4
00477 {
00478 public:
00479         static inline TStr GetName() { return "ISO-8859-4"; }
00480         static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16];
00481         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00482                 if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
00483         static int FromUnicode(int c) {
00484                 if (0 <= c && c < 0xa0) return c;
00485                 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
00486                 else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
00487                 else return -1; }
00488 };
00489
00490 class TEncoding_YuAscii
00491 {
00492 public:
00493         static const int uniChars[10], yuAsciiChars[10];
00494         static inline TStr GetName() { return "YU-ASCII"; }
00495         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00496                 for (int i = 0; i < int(sizeof(yuAsciiChars) / sizeof(yuAsciiChars[0])); i++)
00497                         if (c == yuAsciiChars[i]) return uniChars[i];
00498                 return c; }
00499         static int FromUnicode(int c) {
00500                 for (int i = 0; i < int(sizeof(uniChars) / sizeof(uniChars[0])); i++)
00501                         if (c == uniChars[i]) return yuAsciiChars[i];
00502                         else if(c == yuAsciiChars[i]) return -1;
00503                 if (0 <= c && c <= 255) return c; else return -1; }
00504 };
00505
00506 class TEncoding_CP437 // DOS US
00507 {
00508 public:
00509         static inline TStr GetName() { return "CP437"; }
00510         static const int toUnicodeTable[8 * 16], fromUnicodeTable1[6 * 16], fromUnicodeTable2[4 * 16], fromUnicodeTable3[6 * 16], fromUnicodeTable4[11 * 16];
00511         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00512                 if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
00513         static int FromUnicode(int c) {
00514                 if (0 <= c && c < 0x80) return c;
00515                 else if (0xa0 <= c && c < 0x100) return fromUnicodeTable1[c - 0xa0];
00516                 else if (0x390 <= c && c < 0x3d0) return fromUnicodeTable2[c - 0x390];
00517                 else if (0x2210 <= c && c < 0x2270) return fromUnicodeTable3[c - 0x2210];
00518                 else if (0x2500 <= c && c < 0x25b0) return fromUnicodeTable4[c - 0x2500];
00519                 else if (c == 0x192) return 0x9f;
00520                 else if (c == 0x207f) return 0xfc;
00521                 else if (c == 0x20a7) return 0x9e;
00522                 else if (c == 0x2310) return 0xa9;
00523                 else if (c == 0x2320) return 0xf4;
00524                 else if (c == 0x2321) return 0xf5;
00525                 else return -1; }
00526 };
00527
00528 class TEncoding_CP852 // DOS Latin 2
00529 {
00530 public:
00531         static inline TStr GetName() { return "CP852"; }
00532         static const int toUnicodeTable[8 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16], fromUnicodeTable3[11 * 16];
00533         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00534                 if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
00535         static int FromUnicode(int c) {
00536                 if (0 <= c && c < 0x80) return c;
00537                 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
00538                 else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
00539                 else if (0x2500 <= c && c < 0x25b0) return fromUnicodeTable3[c - 0x2500];
00540                 else return -1; }
00541 };
00542
00543 class TEncoding_CP1250 // Windows-1250, similar to ISO Latin 2
00544 {
00545 public:
00546         static inline TStr GetName() { return "CP1250"; }
00547         static const int toUnicodeTable[8 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16], fromUnicodeTable3[3 * 16];
00548         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00549                 if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
00550         static int FromUnicode(int c) {
00551                 if (0 <= c && c < 0x80) return c;
00552                 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
00553                 else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
00554                 else if (0x2010 <= c && c < 0x2040) return fromUnicodeTable3[c - 0x2010];
00555                 else if (c == 0x20ac) return 0x80;
00556                 else if (c == 0x2122) return 0x99;
00557                 else return -1; }
00558 };
00559
00560 template<class TEncoding_>
00561 class T8BitCodec
00562 {
00563 protected:
00564         typedef TUniVecIdx TVecIdx;
00565 public:
00566         typedef TEncoding_ TEncoding;
00567         TUnicodeErrorHandling errorHandling;
00568         int replacementChar;
00569
00570         T8BitCodec() : errorHandling(uehIgnore), replacementChar(TUniCodec::DefaultReplacementChar) { }
00571         T8BitCodec(TUnicodeErrorHandling errorHandling_, int replacementChar_ = TUniCodec::DefaultReplacementChar) :
00572                 errorHandling(errorHandling_), replacementChar(replacementChar_) { }
00573         static TStr GetName() { return TEncoding::GetName(); }
00574
00575         void Test() const
00576         {
00577                 int nDecoded = 0;
00578                 for (int c = 0; c <= 255; c++) {
00579                         int cu = TEncoding::ToUnicode(c); if (cu == -1) continue;
00580                         nDecoded++;
00581                         IAssert(0 <= cu && cu < 0x110000);
00582                         int c2 = TEncoding::FromUnicode(cu);
00583                         IAssert(c2 == c); }
00584                 int nEncoded = 0;
00585                 for (int cu = 0; cu < 0x110000; cu++) {
00586                         int c = TEncoding::FromUnicode(cu); if (c == -1) continue;
00587                         nEncoded++;
00588                         IAssert(0 <= c && c <= 255);
00589                         int cu2 = TEncoding::ToUnicode(c);
00590                         IAssert(cu2 == cu); }
00591                 IAssert(nDecoded == nEncoded);
00592         }
00593
00594         // Returns the number of characters that have been successfully decoded.
00595         // This does not include any replacement characters that may have been inserted into 'dest'.
00596         template<typename TSrcVec, typename TDestCh>
00597         size_t ToUnicode(
00598                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00599                 TVec<TDestCh>& dest, const bool clrDest = true) const
00600         {
00601                 if (clrDest) dest.Clr();
00602                 size_t toDo = srcCount;
00603                 while (toDo-- > 0) {
00604                         int chSrc = ((int) src[TVecIdx(srcIdx)]) & 0xff; srcIdx++;
00605                         int chDest = TEncoding::ToUnicode(chSrc);
00606                         dest.Add(chDest); }
00607                 return srcCount;
00608         }
00609         template<typename TSrcVec, typename TDestCh>
00610         size_t ToUnicode(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
00611
00612         size_t ToUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
00613         size_t ToUnicode(const TStr& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
00614
00615         // Returns the number of characters that have been successfully encoded.
00616         // This does not include any replacement characters that may have been inserted into 'dest'.
00617         template<typename TSrcVec, typename TDestVec>
00618         size_t FromUnicode(
00619                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00620                 TDestVec& dest, const bool clrDest = true) const
00621         {
00622                 typedef typename TVecElt<TDestVec>::TElement TDestCh;
00623                 if (clrDest) dest.Clr();
00624                 size_t toDo = srcCount, nEncoded = 0;
00625                 while (toDo-- > 0) {
00626                         int chSrc = (int) src[TVecIdx(srcIdx)]; srcIdx++;
00627                         int chDest = TEncoding::FromUnicode(chSrc);
00628                         if (chDest < 0) {
00629                                 switch (errorHandling) {
00630                                 case uehThrow: throw TUnicodeException(srcIdx - 1, chSrc, "Invalid character for encoding into " + GetName() + ".");
00631                                 case uehAbort: return nEncoded;
00632                                 case uehReplace: TVecElt<TDestVec>::Add(dest, TDestCh(replacementChar)); continue;
00633                                 case uehIgnore: continue;
00634                                 default: Fail; } }
00635                         TVecElt<TDestVec>::Add(dest, TDestCh(chDest)); nEncoded++; }
00636                 return nEncoded;
00637         }
00638
00639         template<typename TSrcVec, typename TDestVec>
00640         size_t FromUnicode(const TSrcVec& src, TDestVec& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
00641
00642         size_t UniToStr(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const {
00643                 TChA buf; size_t retVal = FromUnicode(src, srcIdx, srcCount, buf, false);
00644                 if (clrDest) dest += buf.CStr(); else dest = buf.CStr();
00645                 return retVal; }
00646         size_t UniToStr(const TIntV& src, TStr& dest, const bool clrDest = true) const { return UniToStr(src, 0, src.Len(), dest, clrDest); }
00647 };
00648
// Ready-made T8BitCodec instantiations, one for each single-byte encoding class.
00649 typedef T8BitCodec<TEncoding_ISO8859_1> TCodec_ISO8859_1;
00650 typedef T8BitCodec<TEncoding_ISO8859_2> TCodec_ISO8859_2;
00651 typedef T8BitCodec<TEncoding_ISO8859_3> TCodec_ISO8859_3;
00652 typedef T8BitCodec<TEncoding_ISO8859_4> TCodec_ISO8859_4;
00653 typedef T8BitCodec<TEncoding_CP852> TCodec_CP852;
00654 typedef T8BitCodec<TEncoding_CP437> TCodec_CP437;
00655 typedef T8BitCodec<TEncoding_CP1250> TCodec_CP1250;
00656 typedef T8BitCodec<TEncoding_YuAscii> TCodec_YuAscii;
00657
00658 //-----------------------------------------------------------------------------
00659 // Various declarations used by the Unicode Character Database
00660 //-----------------------------------------------------------------------------
00661
00662 typedef enum TUniChCategory_
00663 {
00664 #define DefineUniCat(cat, c) uc ## cat = (int(uchar(c)) & 0xff)
00665         DefineUniCat(Letter, 'L'),             // ucLetter
00666         DefineUniCat(Mark, 'M'),
00667         DefineUniCat(Number, 'N'),
00668         DefineUniCat(Punctuation, 'P'),
00669         DefineUniCat(Symbol, 'S'),
00670         DefineUniCat(Separator, 'Z'),
00671         DefineUniCat(Other, 'C')
00672 #undef DefineUniCat
00673 }
00674 TUniChCategory;
00675
00676 typedef enum TUniChSubCategory_
00677 {
00678 #define DefineUniSubCat(cat, subCat, c) uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff)
00679         DefineUniSubCat(Letter, Uppercase, 'u'),            // ucLetterUppercase
00680         DefineUniSubCat(Letter, Lowercase, 'l'),
00681         DefineUniSubCat(Letter, Titlecase, 't'),
00682         DefineUniSubCat(Letter, Modifier, 'm'),
00683         DefineUniSubCat(Letter, Other, 'o'),
00684         DefineUniSubCat(Mark, Nonspacing, 'n'),
00685         DefineUniSubCat(Mark, SpacingCombining, 'c'),
00686         DefineUniSubCat(Mark, Enclosing, 'e'),
00687         DefineUniSubCat(Number, DecimalDigit, 'd'),
00688         DefineUniSubCat(Number, Letter, 'l'),
00689         DefineUniSubCat(Number, Other, 'o'),
00690         DefineUniSubCat(Punctuation, Connector, 'c'),
00691         DefineUniSubCat(Punctuation, Dash, 'd'),
00692         DefineUniSubCat(Punctuation, Open, 's'),
00693         DefineUniSubCat(Punctuation, Close, 'e'),
00694         DefineUniSubCat(Punctuation, InitialQuote, 'i'),
00695         DefineUniSubCat(Punctuation, FinalQuote, 'f'),
00696         DefineUniSubCat(Punctuation, Other, 'o'),
00697         DefineUniSubCat(Symbol, Math, 'm'),
00698         DefineUniSubCat(Symbol, Currency, 'c'),
00699         DefineUniSubCat(Symbol, Modifier, 'k'),
00700         DefineUniSubCat(Symbol, Other, 'o'),
00701         DefineUniSubCat(Separator, Space, 's'),
00702         DefineUniSubCat(Separator, Line, 'l'),
00703         DefineUniSubCat(Separator, Paragraph, 'p'),
00704         DefineUniSubCat(Other, Control, 'c'),
00705         DefineUniSubCat(Other, Format, 'f'),
00706         DefineUniSubCat(Other, Surrogate, 's'),
00707         DefineUniSubCat(Other, PrivateUse, 'o'),
00708         DefineUniSubCat(Other, NotAssigned, 'n')
00709 }
00710 TUniChSubCategory;
00711
// Single-bit flags that are combined in TUniChInfo::flags, plus masks that
// select each group of flags.
typedef enum TUniChFlags_
{
        // --- Decomposition ---
        ucfCompatibilityDecomposition = 1 << 0, // set: compatibility decomposition; not set: canonical decomposition
        ucfCompositionExclusion       = 1 << 1, // from CompositionExclusions.txt

        // --- Word boundaries (see UAX #29) ---
        ucfWbFormat       = 1 << 2,
        ucfWbKatakana     = 1 << 3,
        ucfWbALetter      = 1 << 4,
        ucfWbMidLetter    = 1 << 5,
        ucfWbMidNum       = 1 << 6,
        ucfWbNumeric      = 1 << 7,
        ucfWbExtendNumLet = 1 << 8,

        // --- Sentence boundaries (see UAX #29); Sep is used for word boundaries as well ---
        ucfSbSep     = 1 << 9,
        ucfSbFormat  = 1 << 10,
        ucfSbSp      = 1 << 11,
        ucfSbLower   = 1 << 12,
        ucfSbUpper   = 1 << 13,
        ucfSbOLetter = 1 << 14,
        ucfSbNumeric = 1 << 15,
        ucfSbATerm   = 1 << 16,
        ucfSbSTerm   = 1 << 17,
        ucfSbClose   = 1 << 18,

        // All sentence-boundary bits.
        ucfSbMask = ucfSbSep | ucfSbFormat | ucfSbSp | ucfSbLower | ucfSbUpper |
                ucfSbOLetter | ucfSbNumeric | ucfSbATerm | ucfSbSTerm | ucfSbClose,
        // All word-boundary bits; deliberately includes ucfSbSep.
        ucfWbMask = ucfWbFormat | ucfWbKatakana | ucfWbALetter | ucfWbMidLetter |
                ucfWbMidNum | ucfWbNumeric | ucfWbExtendNumLet | ucfSbSep,

        // --- Flags from DerivedCoreProperties.txt ---
        // [The descriptions below are taken from UCD.html.]

        // Alphabetic: characters with the Alphabetic property (see Chapter 4 in [Unicode]).
        //   Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
        ucfDcpAlphabetic = 1 << 19,

        // Default_Ignorable_Code_Point: for programmatic determination of default-ignorable
        //   code points.  New characters that should be ignored in processing (unless
        //   explicitly supported) will be assigned in these ranges, permitting programs to
        //   correctly handle the default behavior of such characters when not otherwise
        //   supported.  For more information, see UAX #29: Text Boundaries [Breaks].
        //   Generated from: Other_Default_Ignorable_Code_Point + Cf + Cc + Cs + Noncharacters
        //                   - White_Space - annotation characters
        //   [Examples: soft hyphen, zero-width space, noncharacters (e.g. U+fffe, U+ffff,
        //   U+1fffe, U+1ffff, etc.), surrogates, language tags, variation selectors]
        ucfDcpDefaultIgnorableCodePoint = 1 << 20,

        // Lowercase: characters with the Lowercase property (see Chapter 4 in [Unicode]).
        //   Generated from: Other_Lowercase + Ll
        ucfDcpLowercase = 1 << 21,

        // Grapheme_Base: for programmatic determination of grapheme cluster boundaries
        //   (see UAX #29: Text Boundaries [Breaks]).
        //   Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend
        ucfDcpGraphemeBase = 1 << 22,

        // Grapheme_Extend: for programmatic determination of grapheme cluster boundaries
        //   (see UAX #29: Text Boundaries [Breaks]).
        //   Generated from: Other_Grapheme_Extend + Me + Mn
        //   Note: depending on an application's interpretation of Co (private use), they may
        //         be either in Grapheme_Base, or in Grapheme_Extend, or in neither.
        ucfDcpGraphemeExtend = 1 << 23,

        // ID_Start / ID_Continue: used to determine programming identifiers, as described
        //   in UAX #31: Identifier and Pattern Syntax.
        ucfDcpIdStart    = 1 << 24,
        ucfDcpIdContinue = 1 << 25,

        // Math: characters with the Math property (see Chapter 4 in [Unicode]).
        //   Generated from: Sm + Other_Math
        ucfDcpMath = 1 << 26,

        // Uppercase: characters with the Uppercase property (see Chapter 4 in [Unicode]).
        //   Generated from: Lu + Other_Uppercase
        ucfDcpUppercase = 1 << 27,

        // XID_Start / XID_Continue: used to determine programming identifiers, as described
        //   in UAX #31: Identifier and Pattern Syntax.
        ucfDcpXidStart    = 1 << 28,
        ucfDcpXidContinue = 1 << 29,

        // All DerivedCoreProperties bits.
        ucfDcpMask = ucfDcpAlphabetic | ucfDcpDefaultIgnorableCodePoint | ucfDcpLowercase |
                ucfDcpGraphemeBase | ucfDcpGraphemeExtend | ucfDcpIdStart | ucfDcpIdContinue |
                ucfDcpMath | ucfDcpUppercase | ucfDcpXidStart | ucfDcpXidContinue
}
TUniChFlags;
00779
// Single-bit flags that are combined in TUniChInfo::properties.
// These are the properties from PropList.txt.
// [The descriptions below are taken from UCD.html.]
typedef enum TUniChProperties_
{
        // ASCII_Hex_Digit: ASCII characters commonly used for the representation of
        //   hexadecimal numbers.  [= 0123456789abcdefABCDEF]
        ucfPrAsciiHexDigit = 1 << 0,

        // Bidi_Control: those format control characters which have specific functions
        //   in the Bidirectional Algorithm.
        ucfPrBidiControl = 1 << 1,

        // Dash: those punctuation characters explicitly called out as dashes in the
        //   Unicode Standard, plus compatibility equivalents to those.  Most of these have
        //   the Pd General Category, but some have the Sm General Category because of
        //   their use in mathematics.
        //     U+0002d  HYPHEN-MINUS
        //     U+0058a  ARMENIAN HYPHEN
        //     U+005be  HEBREW PUNCTUATION MAQAF
        //     U+01806  MONGOLIAN TODO SOFT HYPHEN
        //     U+02010  HYPHEN
        //     U+02011  NON-BREAKING HYPHEN
        //     U+02012  FIGURE DASH
        //     U+02013  EN DASH
        //     U+02014  EM DASH
        //     U+02015  HORIZONTAL BAR
        //     U+02053  SWUNG DASH
        //     U+0207b  SUPERSCRIPT MINUS
        //     U+0208b  SUBSCRIPT MINUS
        //     U+02212  MINUS SIGN
        //     U+02e17  DOUBLE OBLIQUE HYPHEN
        //     U+0301c  WAVE DASH
        //     U+03030  WAVY DASH
        //     U+030a0  KATAKANA-HIRAGANA DOUBLE HYPHEN
        //     U+0fe31  PRESENTATION FORM FOR VERTICAL EM DASH
        //     U+0fe32  PRESENTATION FORM FOR VERTICAL EN DASH
        //     U+0fe58  SMALL EM DASH
        //     U+0fe63  SMALL HYPHEN-MINUS
        //     U+0ff0d  FULLWIDTH HYPHEN-MINUS
        ucfPrDash = 1 << 2,

        // Deprecated: for a machine-readable list of deprecated characters.  No characters
        //   will ever be removed from the standard, but the usage of deprecated characters
        //   is strongly discouraged.
        ucfPrDeprecated = 1 << 3,

        // Diacritic: characters that linguistically modify the meaning of another character
        //   to which they apply.  Some diacritics are not combining characters, and some
        //   combining characters are not diacritics.
        ucfPrDiacritic = 1 << 4,

        // Extender: characters whose principal function is to extend the value or shape of
        //   a preceding alphabetic character.  Typical of these are length and iteration marks.
        ucfPrExtender = 1 << 5,

        // Grapheme_Link: used in determining default grapheme cluster boundaries.
        //   For more information, see UAX #29: Text Boundaries.
        ucfPrGraphemeLink = 1 << 6,

        // Hex_Digit: characters commonly used for the representation of hexadecimal numbers,
        //   plus their compatibility equivalents.
        //   [= AsciiHexDigit + fullwidth digit {0..9} + fullwidth latin {small|capital} letter {a..f}]
        ucfPrHexDigit = 1 << 7,

        // Hyphen: those dashes used to mark connections between pieces of words, plus the
        //   Katakana middle dot.  The Katakana middle dot functions like a hyphen, but is
        //   shaped like a dot rather than a dash.
        //     U+0002d  HYPHEN-MINUS
        //     U+000ad  SOFT HYPHEN
        //     U+0058a  ARMENIAN HYPHEN
        //     U+01806  MONGOLIAN TODO SOFT HYPHEN
        //     U+02010  HYPHEN
        //     U+02011  NON-BREAKING HYPHEN
        //     U+02e17  DOUBLE OBLIQUE HYPHEN
        //     U+030fb  KATAKANA MIDDLE DOT
        //     U+0fe63  SMALL HYPHEN-MINUS
        //     U+0ff0d  FULLWIDTH HYPHEN-MINUS
        //     U+0ff65  HALFWIDTH KATAKANA MIDDLE DOT
        ucfPrHyphen = 1 << 8,

        // Ideographic: characters considered to be CJKV (Chinese, Japanese, Korean, and
        //   Vietnamese) ideographs.
        ucfPrIdeographic = 1 << 9,

        // Join_Control: those format control characters which have specific functions for
        //   control of cursive joining and ligation.
        ucfPrJoinControl = 1 << 10,

        // Logical_Order_Exception: there are a small number of characters that do not use
        //   logical order.  These characters require special handling in most processing.
        ucfPrLogicalOrderException = 1 << 11,

        // Noncharacter_Code_Point: code points that are permanently reserved for internal use.
        ucfPrNoncharacterCodePoint = 1 << 12,

        // Pattern_Syntax / Pattern_White_Space: used for pattern syntax as described in
        //   UAX #31: Identifier and Pattern Syntax.
        ucfPrPatternSyntax = 1 << 13,
        ucfPrPatternWhiteSpace = 1 << 14,

        // Quotation_Mark: those punctuation characters that function as quotation marks.
        //     U+00022  QUOTATION MARK
        //     U+00027  APOSTROPHE
        //     U+000ab  LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
        //     U+000bb  RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
        //     U+02018  LEFT SINGLE QUOTATION MARK
        //     U+02019  RIGHT SINGLE QUOTATION MARK
        //     U+0201a  SINGLE LOW-9 QUOTATION MARK
        //     U+0201b  SINGLE HIGH-REVERSED-9 QUOTATION MARK
        //     U+0201c  LEFT DOUBLE QUOTATION MARK
        //     U+0201d  RIGHT DOUBLE QUOTATION MARK
        //     U+0201e  DOUBLE LOW-9 QUOTATION MARK
        //     U+0201f  DOUBLE HIGH-REVERSED-9 QUOTATION MARK
        //     U+02039  SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        //     U+0203a  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        //     U+0300c  LEFT CORNER BRACKET
        //     U+0300d  RIGHT CORNER BRACKET
        //     U+0300e  LEFT WHITE CORNER BRACKET
        //     U+0300f  RIGHT WHITE CORNER BRACKET
        //     U+0301d  REVERSED DOUBLE PRIME QUOTATION MARK
        //     U+0301e  DOUBLE PRIME QUOTATION MARK
        //     U+0301f  LOW DOUBLE PRIME QUOTATION MARK
        //     U+0fe41  PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
        //     U+0fe42  PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
        //     U+0fe43  PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
        //     U+0fe44  PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
        //     U+0ff02  FULLWIDTH QUOTATION MARK
        //     U+0ff07  FULLWIDTH APOSTROPHE
        //     U+0ff62  HALFWIDTH LEFT CORNER BRACKET
        //     U+0ff63  HALFWIDTH RIGHT CORNER BRACKET
        ucfPrQuotationMark = 1 << 15,

        // Soft_Dotted: characters with a "soft dot", like i or j.  An accent placed on these
        //   characters causes the dot to disappear.  An explicit _dot above_ can be added
        //   where required, such as in Lithuanian.
        ucfPrSoftDotted = 1 << 16,

        // STerm: Sentence Terminal.  Used in UAX #29: Text Boundaries.
        //     U+00021  EXCLAMATION MARK
        //     U+0002e  FULL STOP
        //     U+0003f  QUESTION MARK
        //     U+0203c  DOUBLE EXCLAMATION MARK
        //     U+0203d  INTERROBANG
        //     U+02047  DOUBLE QUESTION MARK
        //     U+02048  QUESTION EXCLAMATION MARK
        //     U+02049  EXCLAMATION QUESTION MARK
        //     U+03002  IDEOGRAPHIC FULL STOP
        //     [plus many characters from other writing systems]
        ucfPrSTerm = 1 << 17,

        // Terminal_Punctuation: those punctuation characters that generally mark the end
        //   of textual units.
        //   [JB note: this set contains more characters than STerm.  For example, it contains
        //   the comma, colon and semicolon, whereas STerm doesn't.]
        //     U+00021  EXCLAMATION MARK
        //     U+0002c  COMMA
        //     U+0002e  FULL STOP
        //     U+0003a  COLON
        //     U+0003b  SEMICOLON
        //     U+0003f  QUESTION MARK
        //     U+0203c  DOUBLE EXCLAMATION MARK
        //     U+0203d  INTERROBANG
        //     U+02047  DOUBLE QUESTION MARK
        //     U+02048  QUESTION EXCLAMATION MARK
        //     U+02049  EXCLAMATION QUESTION MARK
        //     [plus *lots* of characters from other writing systems]
        ucfPrTerminalPunctuation = 1 << 18,

        // Variation_Selector: indicates all those characters that qualify as Variation
        //   Selectors.  For details on the behavior of these characters, see
        //   StandardizedVariants.html and Section 16.4, Variation Selectors in [Unicode].
        ucfPrVariationSelector = 1 << 19,

        // White_Space: those separator characters and control characters which should be
        //   treated by programming languages as "white space" for the purpose of parsing elements.
        //   Note: ZERO WIDTH SPACE and ZERO WIDTH NO-BREAK SPACE are not included,
        //         since their functions are restricted to line-break control.
        //         Their names are unfortunately misleading in this respect.
        //   Note: There are other senses of "whitespace" that encompass a different set of characters.
        //         [JB note: e.g. there's a BIDI class for whitespace ('WS') in UnicodeData.txt.
        //         There's also a "Sp" class in the sentence boundary algorithm, see UAX #29, sec. 5.1.]
        //   This includes the following characters:
        //     U+0009  <control>
        //     U+000a  <control>
        //     U+000b  <control>
        //     U+000c  <control>
        //     U+000d  <control>
        //     U+0020  SPACE
        //     U+0085  <control>
        //     U+00a0  NO-BREAK SPACE
        //     U+1680  OGHAM SPACE MARK
        //     U+180e  MONGOLIAN VOWEL SEPARATOR
        //     U+2000  EN QUAD
        //     U+2001  EM QUAD
        //     U+2002  EN SPACE
        //     U+2003  EM SPACE
        //     U+2004  THREE-PER-EM SPACE
        //     U+2005  FOUR-PER-EM SPACE
        //     U+2006  SIX-PER-EM SPACE
        //     U+2007  FIGURE SPACE
        //     U+2008  PUNCTUATION SPACE
        //     U+2009  THIN SPACE
        //     U+200a  HAIR SPACE
        //     U+2028  LINE SEPARATOR
        //     U+2029  PARAGRAPH SEPARATOR
        //     U+202f  NARROW NO-BREAK SPACE
        //     U+205f  MEDIUM MATHEMATICAL SPACE
        //     U+3000  IDEOGRAPHIC SPACE
        ucfPrWhiteSpace = 1 << 20
}
TUniChProperties;
00960
// Single-bit flags that are combined in TUniChInfo::propertiesX.
// These are further properties from PropList.txt.
typedef enum TUniChPropertiesX_
{
        // The Other_* properties are used to derive the properties in DerivedCoreProperties.txt.
        ucfPxOtherAlphabetic                = 1 << 0,
        ucfPxOtherDefaultIgnorableCodePoint = 1 << 1,
        ucfPxOtherGraphemeExtend            = 1 << 2,
        ucfPxOtherIdContinue                = 1 << 3,
        ucfPxOtherIdStart                   = 1 << 4,
        ucfPxOtherLowercase                 = 1 << 5,
        ucfPxOtherMath                      = 1 << 6,
        ucfPxOtherUppercase                 = 1 << 7,
        // The following are used in ideographic description sequences.
        ucfPxIdsBinaryOperator              = 1 << 8,
        ucfPxIdsTrinaryOperator             = 1 << 9,
        ucfPxRadical                        = 1 << 10,
        ucfPxUnifiedIdeograph               = 1 << 11
}
TUniChPropertiesX;
00980
00981 //-----------------------------------------------------------------------------
00982 // TUniChInfo -- contains information about a single Unicode codepoint
00983 //-----------------------------------------------------------------------------
00984
00985 class TUniChInfo
00986 {
00987 public:
00988         enum { // combining classes (for 'combClass'); from UnicodeData.txt
00989                 ccStarter = 0, // 0: Spacing, split, enclosing, reordrant, and Tibetan subjoined
00990                 ccOverlaysAndInterior = 1,
00991                 ccNuktas = 7,
00992                 ccHiraganaKatakanaVoicingMarks = 8,
00993                 ccViramas = 9,
00994                 ccFixedPositionStart = 10, // Start of fixed position classes
00995                 ccFixedPositionEnd = 199, // End of fixed position classes
00996                 ccBelowLeftAttached = 200,
00997                 ccBelowAttached = 202,
00998                 ccBelowRightAttached = 204,
00999                 ccLeftAttached = 208, // Left attached (reordrant around single base character)
01000                 ccRightAttached = 210,
01001                 ccAboveLeftAttached = 212,
01002                 ccAboveAttached = 214,
01003                 ccAboveRightAttached = 216,
01004                 ccBelowLeft = 218,
01005                 ccBelow = 220,
01006                 ccBelowRight = 222,
01007                 ccLeft = 224, // Left (reordrant around single base character)
01008                 ccRight = 226,
01009                 ccAboveLeft = 228,
01010                 ccAbove = 230,
01011                 ccAboveRight = 232,
01012                 ccDoubleBelow = 233,
01013                 ccDoubleAbove = 234,
01014                 ccBelowIotaSubscript = 240, // Below (iota subscript)
01015                 ccInvalid = 255 // not defined by Unicode
01016         };
01017         char chCat, chSubCat; // chCat + chSubCat together comprise the general category (from UnicodeData.txt)
01018         uchar combClass; // canonical combining class (see the cc* constants above)
01019         TUniChCategory cat; // = TUniChCategory(chCat); set by InitAfterLoad
01020         TUniChSubCategory subCat; // = TUniChSubCategory((chCat << 8) | chSubCat); set by InitAfterLoad
01021         signed char script; // keyId into 'TUniChDb.scriptNames'; -1 if unknown
01022         int simpleUpperCaseMapping, simpleLowerCaseMapping, simpleTitleCaseMapping; // from UnicodeData.txt; the default constructor sets them to -1
01023         int decompOffset; // offset into 'TUniChDb.decompositions'; or -1 if the character doesn't change during decomposition
01024         int nameOffset; // offset into 'TUniChDb.charNames'; the default constructor sets it to -1
01025         int flags; // a combination of TUniChFlags
01026         int properties; // a combination of TUniChProperties
01027         int propertiesX; // a combination of TUniChPropertiesX
01028         ushort lineBreak; // from LineBreak.txt; presumably a code built with GetLineBreakCode
01029
01030         // Converts a 2-letter linebreak code into a 16-bit integer.
01031         static inline ushort GetLineBreakCode(char c1, char c2) { return ((static_cast<ushort>(static_cast<uchar>(c1)) & 0xff) << 8) | ((static_cast<ushort>(static_cast<uchar>(c2)) & 0xff)); }
01032         static const ushort LineBreak_Unknown, LineBreak_ComplexContext, LineBreak_Numeric, LineBreak_InfixNumeric, LineBreak_Quotation;
01033
01034 public:
01035         void InitAfterLoad() {
01036                 cat = (TUniChCategory) chCat;
01037                 subCat = (TUniChSubCategory) (((static_cast<int>(static_cast<uchar>(chCat)) & 0xff) << 8) | (static_cast<int>(static_cast<uchar>(chSubCat)) & 0xff)); }
01038         void SetCatAndSubCat(const TUniChSubCategory catAndSubCat) {
01039                 cat = (TUniChCategory) ((int(catAndSubCat) >> 8) & 0xff);
01040                 subCat = catAndSubCat;
01041                 chCat = (char) cat; chSubCat = (char) (int(subCat) & 0xff); }
01042         friend class TUniChDb;
01043
01044         // Inexplicably missing from TSIn/TSOut...
01045         static inline void LoadUShort(TSIn& SIn, ushort& u) { SIn.LoadBf(&u, sizeof(u)); }
01046         static inline void LoadSChar(TSIn& SIn, signed char& u) { SIn.LoadBf(&u, sizeof(u)); }
01047         static inline void SaveUShort(TSOut& SOut, ushort u) { SOut.SaveBf(&u, sizeof(u)); }
01048         static inline void SaveSChar(TSOut& SOut, signed char u) { SOut.SaveBf(&u, sizeof(u)); }
01049
01050 public:
01051         void Save(TSOut& SOut) const {
01052                 SOut.Save(chCat); SOut.Save(chSubCat); SOut.Save(combClass); SaveSChar(SOut, script);
01053                 SOut.Save(simpleUpperCaseMapping); SOut.Save(simpleLowerCaseMapping); SOut.Save(simpleTitleCaseMapping);
01054                 SOut.Save(decompOffset); SOut.Save(nameOffset);
01055                 SOut.Save(flags); SOut.Save(properties); SOut.Save(propertiesX); SaveUShort(SOut, lineBreak); }
01056         void Load(TSIn& SIn) {
01057                 SIn.Load(chCat); SIn.Load(chSubCat); SIn.Load(combClass); LoadSChar(SIn, script);
01058                 SIn.Load(simpleUpperCaseMapping); SIn.Load(simpleLowerCaseMapping); SIn.Load(simpleTitleCaseMapping);
01059                 SIn.Load(decompOffset); SIn.Load(nameOffset);
01060                 SIn.Load(flags); SIn.Load(properties); SIn.Load(propertiesX); LoadUShort(SIn, lineBreak); InitAfterLoad(); }
01061         explicit TUniChInfo(TSIn& SIn) { Load(SIn); }
01062         TUniChInfo() : chCat(char(ucOther)), chSubCat(char(ucOtherNotAssigned & 0xff)), combClass(ccInvalid),
01063                 script(-1),simpleUpperCaseMapping(-1), simpleLowerCaseMapping(-1), simpleTitleCaseMapping(-1),
01064                 decompOffset(-1), nameOffset(-1), flags(0), properties(0), propertiesX(0), lineBreak(LineBreak_Unknown) {
01065                 InitAfterLoad(); }
01066
01067         // DerivedCoreProperties flags.
01068         bool IsDcpFlag(const TUniChFlags flag) const { Assert((flag & ucfDcpMask) == flag); return (flags & flag) == flag; }
01069         void ClrDcpFlags() { flags = flags & ~ucfDcpMask; }
01070         void SetDcpFlag(const TUniChFlags flag) { Assert((flag & ucfDcpMask) == flag); flags |= flag; }
01071         bool IsAlphabetic() const { return IsDcpFlag(ucfDcpAlphabetic); }
01072         bool IsUppercase() const { return IsDcpFlag(ucfDcpUppercase); }
01073         bool IsLowercase() const { return IsDcpFlag(ucfDcpLowercase); }
01074         bool IsMath() const { return IsDcpFlag(ucfDcpMath); }
01075         bool IsDefaultIgnorable() const { return IsDcpFlag(ucfDcpDefaultIgnorableCodePoint); }
01076         bool IsGraphemeBase() const { return IsDcpFlag(ucfDcpGraphemeBase); }
01077         bool IsGraphemeExtend() const { return IsDcpFlag(ucfDcpGraphemeExtend); }
01078         bool IsIdStart() const { return IsDcpFlag(ucfDcpIdStart); }
01079         bool IsIdContinue() const { return IsDcpFlag(ucfDcpIdContinue); }
01080         bool IsXidStart() const { return IsDcpFlag(ucfDcpXidStart); }
01081         bool IsXidContinue() const { return IsDcpFlag(ucfDcpXidContinue); }
01082
01083         // PropList.txt flags, kept as bits of 'properties'.
              // IsProperty() is true only when every bit of 'flag' is set in 'properties';
              // SetProperty() ORs the bits of 'flag' into 'properties'.
01084         bool IsProperty(const TUniChProperties flag) const { return (properties & flag) == flag; }
01085         void SetProperty(const TUniChProperties flag) { properties |= flag; }
              // Named predicates: each one calls IsProperty() with a single ucfPr* constant.
01086         bool IsAsciiHexDigit() const { return IsProperty(ucfPrAsciiHexDigit); }
01087         bool IsBidiControl() const { return IsProperty(ucfPrBidiControl); }
01088         bool IsDash() const { return IsProperty(ucfPrDash); }
01089         bool IsDeprecated() const { return IsProperty(ucfPrDeprecated); }
01090         bool IsDiacritic() const { return IsProperty(ucfPrDiacritic); }
01091         bool IsExtender() const { return IsProperty(ucfPrExtender); }
01092         bool IsGraphemeLink() const { return IsProperty(ucfPrGraphemeLink); }
01093         bool IsHexDigit() const { return IsProperty(ucfPrHexDigit); }
01094         bool IsHyphen() const { return IsProperty(ucfPrHyphen); }
01095         bool IsIdeographic() const { return IsProperty(ucfPrIdeographic); }
01096         bool IsJoinControl() const { return IsProperty(ucfPrJoinControl); }
01097         bool IsLogicalOrderException() const { return IsProperty(ucfPrLogicalOrderException); }
01098         bool IsNoncharacter() const { return IsProperty(ucfPrNoncharacterCodePoint); }
01099         bool IsQuotationMark() const { return IsProperty(ucfPrQuotationMark); }
01100         bool IsSoftDotted() const { return IsProperty(ucfPrSoftDotted); }
01101         bool IsSTerminal() const { return IsProperty(ucfPrSTerm); }
01102         bool IsTerminalPunctuation() const { return IsProperty(ucfPrTerminalPunctuation); }
01103         bool IsVariationSelector() const { return IsProperty(ucfPrVariationSelector); }
01104         bool IsWhiteSpace() const { return IsProperty(ucfPrWhiteSpace); }
01105
01106         // Additional PropList.txt flags, kept as bits of the separate field 'propertiesX'.
              // Same contract as IsProperty()/SetProperty(): the test requires all bits of 'flag', the setter ORs them in.
01107         bool IsPropertyX(const TUniChPropertiesX flag) const { return (propertiesX & flag) == flag; }
01108         void SetPropertyX(const TUniChPropertiesX flag) { propertiesX |= flag; }
01109
01110         // Miscellaneous flags, stored in the 'flags' field; each predicate tests a single ucf* constant.
01111         bool IsCompositionExclusion() const { return (flags & ucfCompositionExclusion) == ucfCompositionExclusion; }
01112         bool IsCompatibilityDecomposition() const { return (flags & ucfCompatibilityDecomposition) == ucfCompatibilityDecomposition; }
01113
01114         // Word-boundary flags: the bits of 'flags' selected by ucfWbMask.
              // IsWbFlag() and SetWbFlag() Assert that 'flag' lies entirely inside ucfWbMask.
01115         bool IsWbFlag(const TUniChFlags flag) const { Assert((flag & ucfWbMask) == flag); return (flags & flag) == flag; }
              // Clears the word-boundary AND the sentence-boundary bits in one step.
01116         void ClrWbAndSbFlags() { flags = flags & ~(ucfWbMask | ucfSbMask); }
01117         void SetWbFlag(const TUniChFlags flag) { Assert((flag & ucfWbMask) == flag); flags |= flag; }
01118         int GetWbFlags() const { return flags & ucfWbMask; }
01119         bool IsWbFormat() const { return IsWbFlag(ucfWbFormat); }
              // Builds a short textual form of the flags: one letter per set flag, always in the order
              // A (ALetter), F (Format), K (Katakana), M (MidLetter), m (MidNum), N (Numeric), E (ExtendNumLet).
01120         TStr GetWbFlagsStr() const { return GetWbFlagsStr(GetWbFlags()); }
01121         static TStr GetWbFlagsStr(const int flags) { return TStr("") + (flags & ucfWbALetter ? "A" : "") +
01122                 (flags & ucfWbFormat ? "F" : "") + (flags & ucfWbKatakana ? "K" : "") + (flags & ucfWbMidLetter ? "M" : "") +
01123                 (flags & ucfWbMidNum ? "m" : "") + (flags & ucfWbNumeric ? "N" : "") + (flags & ucfWbExtendNumLet ? "E" : ""); }
01124
01125         // Sentence-boundary flags: the bits of 'flags' selected by ucfSbMask.
              // IsSbFlag() and SetSbFlag() Assert that 'flag' lies entirely inside ucfSbMask.
01126         bool IsSbFlag(const TUniChFlags flag) const { Assert((flag & ucfSbMask) == flag); return (flags & flag) == flag; }
01127         void SetSbFlag(const TUniChFlags flag) { Assert((flag & ucfSbMask) == flag); flags |= flag; }
01128         int GetSbFlags() const { return flags & ucfSbMask; }
01129         bool IsSbFormat() const { return IsSbFlag(ucfSbFormat); }
              // Builds a short textual form of the flags: one character per set flag, always in the order
              // S (Sep), F (Format), _ (Sp), L (Lower), U (Upper), O (OLetter), N (Numeric), A (ATerm), T (STerm), C (Close).
01130         TStr GetSbFlagsStr() const { return GetSbFlagsStr(GetSbFlags()); }
01131         static TStr GetSbFlagsStr(const int flags) { return TStr("") + (flags & ucfSbSep ? "S" : "") +
01132                 (flags & ucfSbFormat ? "F" : "") + (flags & ucfSbSp ? "_" : "") + (flags & ucfSbLower ? "L" : "") +
01133                 (flags & ucfSbUpper ? "U" : "") + (flags & ucfSbOLetter ? "O" : "") + (flags & ucfSbNumeric ? "N" : "") +
01134                 (flags & ucfSbATerm ? "A" : "") + (flags & ucfSbSTerm ? "T" : "") + (flags & ucfSbClose ? "C" : ""); }
01135
              // Tests ucfSbSep directly on 'flags' (without the Assert that IsSbFlag() performs).
01136         bool IsSbSep() const { return (flags & ucfSbSep) == ucfSbSep; }
01137
01138         // Grapheme-boundary flags.  IsGbExtend() is just another name for IsGraphemeExtend().
01139         bool IsGbExtend() const { return IsGraphemeExtend(); }
01140
01141         // Sec. 3.13, D47: C is cased iff it is uppercase, lowercase, or general_category == titlecase_letter.
              // The titlecase test is made on 'subCat' (ucLetterTitlecase).
01142         bool IsCased() const { return IsUppercase() || IsLowercase() || (subCat == ucLetterTitlecase); }
01143
01144         // Character categories.  GetCat()/GetSubCat() return the stored 'cat' and 'subCat'
              // members cast to the corresponding enum types.
01145         TUniChCategory GetCat() const { return (TUniChCategory) cat; }
01146         TUniChSubCategory GetSubCat() const { return (TUniChSubCategory) subCat; }
01147         // The following characters belong to the 'symbol/currency' subcategory:
01148         //     U+00024  DOLLAR SIGN
01149         //     U+000a2  CENT SIGN
01150         //     U+000a3  POUND SIGN
01151         //     U+000a4  CURRENCY SIGN
01152         //     U+000a5  YEN SIGN
01153         //     U+020a3  FRENCH FRANC SIGN
01154         //     U+020a4  LIRA SIGN
01155         //     U+020ac  EURO SIGN
01156         //     [and plenty of others]
01157         bool IsCurrency() const { return subCat == ucSymbolCurrency; }
01158         // Note: most private-use and surrogate characters aren't listed explicitly in UnicodeData.txt.
01159         // Thus, it's better to call TUniChDb's versions of these methods, which are aware of
01160         // the full ranges of private-use and surrogate characters.
              // The two methods below look only at this record's 'subCat'.
01161         bool IsPrivateUse() const { return subCat == ucOtherPrivateUse; }
01162         bool IsSurrogate() const { return subCat == ucOtherSurrogate; }
01163
01164         inline static bool IsValidSubCat(const char chCat, const char chSubCat) {
                      // 'validCodes' is a run of two-letter codes (category letter, subcategory letter).
                      // Only even offsets are compared, so a match never straddles two neighbouring codes.
01165                 static const char validCodes[] = "LuLlLtLmLoMnMcMeNdNlNoPcPdPsPePiPfPoSmScSkSoZsZlZpCcCfCsCoCn";
01166                 for (int ofs = 0; validCodes[ofs] != '\0'; ofs += 2)
01167                         if (validCodes[ofs] == chCat && validCodes[ofs + 1] == chSubCat) return true;
01168                 return false; }
01169 };
01170
01171 //-----------------------------------------------------------------------------
01172 // TUniTrie -- a trie for suffixes that should not appear at the end
01173 // of a sentence
01174 //-----------------------------------------------------------------------------
01175
01176 template<typename TItem_>
01177 class TUniTrie
01178 {
      // Storage layout (see Add()): strings of length 1 go into 'singles' and strings of
      // length 2 into 'pairs'.  A longer string is keyed in 'roots' by its LAST three items;
      // the remaining items, walking from the end of the string towards its start, form a
      // path of TNode's in 'nodes'.
01179 public:
01180         typedef TItem_ TItem;
01181 protected:
01182         class TNode {
01183         public:
01184                 TItem item;
01185                 int child, sib; // indices into 'nodes': first child / next sibling; -1 means none
01186                 bool terminal; // set by Add() on the node at which a stored string ends
01187                 TNode() : child(-1), sib(-1), terminal(false) { }
01188                 TNode(const TItem& item_, const int child_, const int sib_, const bool terminal_) : item(item_), child(child_), sib(sib_), terminal(terminal_) { }
01189         };
01190         typedef TVec<TNode> TNodeV;
01191         typedef TPair<TItem, TItem> TItemPr;
01192         typedef TTriple<TItem, TItem, TItem> TItemTr;
01193         typedef TUniVecIdx TVecIdx;
01194         THash<TItem, TVoid> singles; // the stored strings of length 1
01195         THash<TItemPr, TVoid> pairs; // strings of length 2, keyed as (last, butLast)
01196         THash<TItemTr, TInt> roots; // (last, butLast, butButLast) -> index of the root node in 'nodes'
01197         TNodeV nodes;
01198 public:
01199         TUniTrie() { }
01200         void Clr() { singles.Clr(); pairs.Clr(); roots.Clr(); nodes.Clr(); }
01201
              // True when no string is stored.  'nodes' is not consulted: Add() creates a node only
              // together with an entry in 'roots'.
01202         bool Empty() const { return singles.Empty() && pairs.Empty() && roots.Empty(); }
01203
              // Note the argument order of the lookups below: the LAST item of the string comes first.
01204         bool Has1Gram(const TItem& item) const { return singles.IsKey(item); }
01205         bool Has2Gram(const TItem& last, const TItem& butLast) const { return pairs.IsKey(TItemPr(last, butLast)); }
              // Returns the node index stored in 'roots' for the triple, or 0 if the triple is not a key.
              // NOTE(review): GetChild() reports "not found" as -1, whereas 0 looks like it could also be
              // a real node index -- confirm how callers tell a missing root from node 0.
01206         int Get3GramRoot(const TItem& last, const TItem& butLast, const TItem& butButLast) const {
01207                 int keyId = roots.GetKeyId(TItemTr(last, butLast, butButLast));
01208                 if (keyId < 0) return 0; else return roots[keyId]; }
              // Walks the sibling list of nodes[parentIdx] and returns the index of the child whose
              // item equals 'item', or -1 if there is no such child.
01209         int GetChild(const int parentIdx, const TItem& item) const {
01210                 for (int childIdx = nodes[parentIdx].child; childIdx >= 0; ) {
01211                         const TNode &node = nodes[childIdx];
01212                         if (node.item == item) return childIdx;
01213                         childIdx = node.sib; }
01214                 return -1; }
01215         bool IsNodeTerminal(const int nodeIdx) const { return nodes[nodeIdx].terminal; }
01216
01217         // Adds a new string to the trie.  Note that the last characters appear
01218         // closer to the root of the trie.
              // The string is src[srcIdx .. srcIdx + srcCount - 1]; srcCount must be positive (IAssert).
01219         template<typename TSrcVec>
01220         void Add(const TSrcVec& src, const size_t srcIdx, const size_t srcCount)
01221         {
01222                 IAssert(srcCount > 0);
                      // Strings of one or two items never touch 'nodes'.
01223                 if (srcCount == 1) { singles.AddKey(TItem(src[TVecIdx(srcIdx)])); return; }
01224                 if (srcCount == 2) { pairs.AddKey(TItemPr(TItem(src[TVecIdx(srcIdx + 1)]), TItem(src[TVecIdx(srcIdx)]))); return; }
                      // Find the root node for the last three items, creating it if needed.
                      // A newly created root carries the placeholder item TItem(0).
01225                 size_t srcLast = srcIdx + (srcCount - 1);
01226                 TItemTr tr = TItemTr(TItem(src[TVecIdx(srcLast)]), TItem(src[TVecIdx(srcLast - 1)]), TItem(src[TVecIdx(srcLast - 2)]));
01227                 int keyId = roots.GetKeyId(tr), curNodeIdx = -1;
01228                 if (keyId >= 0) curNodeIdx = roots[keyId];
01229                 else { curNodeIdx = nodes.Add(TNode(TItem(0), -1, -1, false)); roots.AddDat(tr, curNodeIdx); }
01230                 //
                      // Descend through the remaining items, from srcLast - 3 down to srcIdx.  The loop exits
                      // via the break after handling srcIdx, so the unsigned srcPos is never taken below it.
01231                 if (srcCount > 3) for (size_t srcPos = srcLast - 3; ; )
01232                 {
01233                         const TItem curItem = src[TVecIdx(srcPos)];
01234                         int childNodeIdx = nodes[curNodeIdx].child;
01235                         while (childNodeIdx >= 0) {
01236                                 TNode &childNode = nodes[childNodeIdx];
01237                                 if (childNode.item == curItem) break;
01238                                 childNodeIdx = childNode.sib; }
                              // No child holds curItem yet: add one at the head of the parent's child list.
01239                         if (childNodeIdx < 0) {
01240                                 childNodeIdx = nodes.Add(TNode(curItem, -1, nodes[curNodeIdx].child, false));
01241                                 nodes[curNodeIdx].child = childNodeIdx; }
01242                         curNodeIdx = childNodeIdx;
01243                         if (srcPos == srcIdx) break; else srcPos--;
01244                 }
                      // For a string of exactly three items this marks the root node itself.
01245                 nodes[curNodeIdx].terminal = true;
01246         }
01247
              // Adds the whole of 'src' (all src.Len() items).
01248         template<typename TSrcVec>
01249         void Add(const TSrcVec& src) { Add(src, 0, (size_t) src.Len()); }
01250 };
01251
01252 //-----------------------------------------------------------------------------
01253 // TUniChDb -- provides access to the Unicode Character Database
01254 //-----------------------------------------------------------------------------
01255
01256 class TUniChDb
01257 {
01258 protected:
01259         void InitAfterLoad();
01260         typedef TUniVecIdx TVecIdx;
01261
01262 public:
01263         THash<TInt, TUniChInfo> h; // key: codepoint
01264         TStrPool charNames;
01265         TStrIntH scripts; // keyID = used in TUniChInfo.script; key = script name; dat = number of characters (informative only)
01266         TIntV decompositions;
01267         THash<TIntPr, TInt> inverseDec;
01268         TUniCaseFolding caseFolding;
01269         // These hash tables contain only the unconditional mappings from SpecialCasing.txt.
01270         // The conditional mappings are hardcoded into GetCaseConverted().
01271         TIntIntVH specialCasingLower, specialCasingUpper, specialCasingTitle;
01272         int scriptUnknown; // = scripts.GetKey("Unknown")
01273
01274         TUniChDb() : scriptUnknown(-1) { }
              // Builds the database from a stream written by Save().  scriptUnknown starts at -1, as in
              // the default constructor, so it never holds an indeterminate value while Load() runs.
01275         explicit TUniChDb(TSIn& SIn) : scriptUnknown(-1) { Load(SIn); }
              // Empties every container handled by Save()/Load() and puts scriptUnknown back to -1,
              // because 'scripts' (whose key ids it refers to) is emptied as well.
              // sbExTrie is not touched here.
01276         void Clr() {
01277                 h.Clr(); charNames.Clr(); decompositions.Clr(); inverseDec.Clr(); caseFolding.Clr();
01278                 specialCasingLower.Clr(); specialCasingUpper.Clr(); specialCasingTitle.Clr();
01279                 scripts.Clr(); scriptUnknown = -1; }
01280         void Save(TSOut& SOut) const {
                      // Writes the members in a fixed order and finishes with SOut.SaveCs().
                      // Load() reads them back in exactly this order, so the two must be kept in step.
                      // sbExTrie and scriptUnknown are not written.
01281                 h.Save(SOut); charNames.Save(SOut); decompositions.Save(SOut);
01282                 inverseDec.Save(SOut); caseFolding.Save(SOut); scripts.Save(SOut);
01283                 specialCasingLower.Save(SOut); specialCasingUpper.Save(SOut); specialCasingTitle.Save(SOut);
01284                 SOut.SaveCs(); }
01285         void Load(TSIn& SIn) {
                      // Reads the members in the order in which Save() wrote them.
                      // charNames is not loaded in place: it is destroyed explicitly and rebuilt with
                      // placement new from the stream-constructor of TStrPool.
01286                 h.Load(SIn); charNames.~TStrPool(); new (&charNames) TStrPool(SIn);
01287                 decompositions.Load(SIn);
01288                 inverseDec.Load(SIn); caseFolding.Load(SIn); scripts.Load(SIn);
01289                 specialCasingLower.Load(SIn); specialCasingUpper.Load(SIn); specialCasingTitle.Load(SIn);
                      // InitAfterLoad() runs last, after SIn.LoadCs().
01290                 SIn.LoadCs(); InitAfterLoad(); }
01291         void LoadBin(const TStr& fnBin) {
                      // Opens 'fnBin' with TFIn::New and passes the resulting stream to Load().
01292                 PSIn SIn = TFIn::New(fnBin); Load(*SIn); }
01293         void Test(const TStr& basePath);
01294
01295         // File names used by LoadTxt() and its subroutines.  Each getter returns a fixed string;
              // GetAuxiliaryDir() names a directory rather than a file.
01296         static TStr GetCaseFoldingFn() { return "CaseFolding.txt"; }
01297         static TStr GetSpecialCasingFn() { return "SpecialCasing.txt"; }
01298         static TStr GetUnicodeDataFn() { return "UnicodeData.txt"; }
01299         static TStr GetCompositionExclusionsFn() { return "CompositionExclusions.txt"; }
01300         static TStr GetScriptsFn() { return "Scripts.txt"; }
01301         static TStr GetDerivedCorePropsFn() { return "DerivedCoreProperties.txt"; }
01302         static TStr GetLineBreakFn() { return "LineBreak.txt"; }
01303         static TStr GetPropListFn() { return "PropList.txt"; }
01304         static TStr GetAuxiliaryDir() { return "auxiliary"; }
01305         static TStr GetWordBreakTestFn() { return "WordBreakTest.txt"; }
01306         static TStr GetWordBreakPropertyFn() { return "WordBreakProperty.txt"; }
01307         static TStr GetSentenceBreakTestFn() { return "SentenceBreakTest.txt"; }
01308         static TStr GetSentenceBreakPropertyFn() { return "SentenceBreakProperty.txt"; }
01309         static TStr GetNormalizationTestFn() { return "NormalizationTest.txt"; }
01310         static TStr GetBinFn() { return "UniChDb.bin"; } // used only by Test()
01311
01312         //-------------------------------------------------------------------------
01313         // Script names
01314         //-------------------------------------------------------------------------
01315
01316         // These constants are used when initializing from the text files.
01317         static TStr GetScriptNameUnknown() { return "Unknown"; }
01318         static TStr GetScriptNameKatakana() { return "Katakana"; }
01319         static TStr GetScriptNameHiragana() { return "Hiragana"; }
01320         //
              // A script id is a key id in the 'scripts' hash table; these two convert between id and name.
01321         const TStr& GetScriptName(const int scriptId) const { return scripts.GetKey(scriptId); }
01322         int GetScriptByName(const TStr& scriptName) const { return scripts.GetKeyId(scriptName); }
              // Both GetScript() overloads fall back to 'scriptUnknown': the first when ci.script is
              // negative, the second also when 'cp' is not a key of 'h'.
01323         int GetScript(const TUniChInfo& ci) const { int s = ci.script; if (s < 0) s = scriptUnknown; return s; }
01324         int GetScript(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return scriptUnknown; else return GetScript(h[i]); }
01325
01326         //-------------------------------------------------------------------------
01327         // Character names
01328         //-------------------------------------------------------------------------
01329
01330         // GetCharName returns 0 if the name is unknown (code point not in 'h', or negative name offset); GetCharNameS then falls back to a string of the form "U+1234".
01331         const char *GetCharName(const int cp) const { const int keyId = h.GetKeyId(cp); if (keyId < 0) return 0; const int nameOfs = h[keyId].nameOffset; if (nameOfs < 0) return 0; return charNames.GetCStr(nameOfs); }
01332         TStr GetCharNameS(const int cp) const {
01333                 // ToDo: Add special processing for precomposed Hangul syllables (UAX #15, sec. 16).
01334                 const char *knownName = GetCharName(cp); if (knownName != 0) return TStr(knownName);
01335                 char hexBuf[20]; sprintf(hexBuf, "U+%04x", cp); return TStr(hexBuf); }
01336         template<class TSrcVec> void PrintCharNames(FILE *f, const TSrcVec& src, size_t srcIdx, const size_t srcCount, const TStr& prefix) const {
                      // Prints one line per code point of src[srcIdx .. srcIdx + srcCount - 1]:
                      // 'prefix', the code point in hex, then the result of GetCharNameS().
                      // A null 'f' means stdout.
01337                 if (! f) f = stdout;
01338                 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
01339                         fprintf(f, "%s", prefix.CStr());
                              // The four-digit format has a trailing space so that both formats print the same width.
01340                         int cp = src[TVecIdx(srcIdx)]; fprintf(f, (cp >= 0x10000 ? "U+%05x" : "U+%04x "), cp);
01341                         fprintf(f, " %s\n", GetCharNameS(cp).CStr()); }}
              // Whole-vector overload: same as above with srcIdx = 0 and srcCount = src.Len().
01342         template<class TSrcVec> void PrintCharNames(FILE *f, const TSrcVec& src, const TStr& prefix) const { PrintCharNames(f, src, 0, src.Len(), prefix); }
01343
01344         //-------------------------------------------------------------------------
01345         // Character information
01346         //-------------------------------------------------------------------------
01347         // These methods provide access to a subset of the functionality
01348         // available in TUniChInfo.
01349
01350         bool IsGetChInfo(const int cp, TUniChInfo& ChInfo) const {
                      // Copies the record of 'cp' into 'ChInfo' and returns true; if 'cp' is not a key of 'h',
                      // returns false and leaves 'ChInfo' untouched.  Declared const like the other lookups
                      // in this class, since it only reads 'h'.
01351                 int i = h.GetKeyId(cp);
01352                 if (i < 0) return false; else { ChInfo=h[i]; return true; }}
01353         TUniChCategory GetCat(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return ucOther; else return h[i].cat; } // code points missing from 'h' report ucOther
01354         TUniChSubCategory GetSubCat(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return ucOtherNotAssigned; else return h[i].subCat; } // ... and ucOtherNotAssigned
01355
              // Word- and sentence-boundary flags of 'cp'; a code point missing from 'h' has no flags set.
01356         bool IsWbFlag(const int cp, const TUniChFlags flag) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].IsWbFlag(flag); }
01357         int GetWbFlags(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetWbFlags(); }
01358         bool IsSbFlag(const int cp, const TUniChFlags flag) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].IsSbFlag(flag); }
01359         int GetSbFlags(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetSbFlags(); }
01360
              // ___UniFwd1(name) defines 'bool name(const int cp) const': it looks 'cp' up in 'h', returns
              // false if it is missing and otherwise forwards to the TUniChInfo method of the same name.
              // ___UniFwd2 .. ___UniFwd5 just chain that many ___UniFwd1 expansions, and
              // DECLARE_FORWARDED_PROPERTY_METHODS is the full list of forwarded predicates.
01361 #define ___UniFwd1(name) bool name(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].name(); }
01362 #define ___UniFwd2(name1, name2) ___UniFwd1(name1) ___UniFwd1(name2)
01363 #define ___UniFwd3(name1, name2, name3) ___UniFwd2(name1, name2) ___UniFwd1(name3)
01364 #define ___UniFwd4(name1, name2, name3, name4) ___UniFwd3(name1, name2, name3) ___UniFwd1(name4)
01365 #define ___UniFwd5(name1, name2, name3, name4, name5) ___UniFwd4(name1, name2, name3, name4) ___UniFwd1(name5)
01366
01367 #define DECLARE_FORWARDED_PROPERTY_METHODS \
01368         ___UniFwd5(IsAsciiHexDigit, IsBidiControl, IsDash, IsDeprecated, IsDiacritic) \
01369         ___UniFwd5(IsExtender, IsGraphemeLink, IsHexDigit, IsHyphen, IsIdeographic)  \
01370         ___UniFwd5(IsJoinControl, IsLogicalOrderException, IsNoncharacter, IsQuotationMark, IsSoftDotted)  \
01371         ___UniFwd4(IsSTerminal, IsTerminalPunctuation, IsVariationSelector, IsWhiteSpace)  \
01372         ___UniFwd5(IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsDefaultIgnorable)  \
01373         ___UniFwd4(IsGraphemeBase, IsGraphemeExtend, IsIdStart, IsIdContinue)  \
01374         ___UniFwd2(IsXidStart, IsXidContinue)  \
01375         ___UniFwd3(IsCompositionExclusion, IsCompatibilityDecomposition, IsSbSep)  \
01376         ___UniFwd1(IsGbExtend)  \
01377         ___UniFwd2(IsCased, IsCurrency)
01378
              // Expands, right here, into the member functions of TUniChDb.
01379         DECLARE_FORWARDED_PROPERTY_METHODS
01380
              // Only ___UniFwd1 is undefined; ___UniFwd2..5 and DECLARE_FORWARDED_PROPERTY_METHODS stay defined.
              // NOTE(review): presumably so that the list can be reused later with a different ___UniFwd1 -- confirm before adding further #undef's.
01381 #undef ___UniFwd1
01382
01383         bool IsPrivateUse(const int cp) const {
                      // A code point that has its own record in 'h' is answered from that record;
                      // otherwise the answer comes from the fixed private-use ranges below.
01384                 const int keyId = h.GetKeyId(cp); if (keyId >= 0) return h[keyId].IsPrivateUse();
01385                 if (cp >= 0xe000 && cp <= 0xf8ff) return true;    // plane 0 private-use area
01386                 if (cp >= 0xf0000 && cp <= 0xffffd) return true;  // plane 15 private-use range
01387                 return cp >= 0x100000 && cp <= 0x10fffd; }        // plane 16 private-use range
01388         // Note: d800..dbff are high surrogates, dc00..dfff are low surrogates.
01389         // For db80..dbff it is clear that the surrogate pair containing this high surrogate
01390         // will refer to a private-use codepoint, but IsPrivateUse nevertheless returns false
01391         // for db80..dbff.  This is consistent with the category codes assigned in UnicodeData.txt.
              // A code point that has its own record in 'h' is answered from that record; otherwise the
              // answer is true for the whole surrogate block d800..dfff (high and low surrogates alike).
01392         bool IsSurrogate(const int cp) const {
01393                 int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsSurrogate();
01394                 return 0xd800 <= cp && cp <= 0xdfff; }
01395
01396         // Note: in particular, all Hangul characters (HangulLBase..HangulLBase + HangulLCount - 1
01397         // and HangulSBase..HangulSBase + HangulSCount - 1) should be treated as starters
01398         // for composition to work correctly.
              // Returns the stored 'combClass' of 'cp'; a code point that is not a key of 'h' is
              // reported as TUniChInfo::ccStarter.
01399         int GetCombiningClass(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return TUniChInfo::ccStarter; else return h[i].combClass; }
01400
01401         //-------------------------------------------------------------------------
01402         // Hangul constants
01403         //-------------------------------------------------------------------------
01404
01405         enum {
              // Integer constants declared through an anonymous enum.
              // NOTE(review): S/L/V/T presumably stand for syllable / leading consonant / vowel /
              // trailing consonant, as in the Unicode Hangul composition algorithm -- confirm.
01406         HangulSBase = 0xAC00, HangulLBase = 0x1100, HangulVBase = 0x1161, HangulTBase = 0x11A7,
01407         HangulLCount = 19, HangulVCount = 21, HangulTCount = 28,
01408         HangulNCount = HangulVCount * HangulTCount,   // 588
01409         HangulSCount = HangulLCount * HangulNCount   // 11172
01410         };
01411
01412         //-------------------------------------------------------------------------
01413         // Word boundaries (UAX #29)
01414         //-------------------------------------------------------------------------
01415
01416 protected:
01417         // UAX #29, rule WB3: ignore Format and Extend characters.
01418         // [Note: rule SB5 for sentence boundaries is identical, and thus these methods will also be used for sentence-boundary detection.]
              // A character is ignored if it is IsGbExtend() or IsWbFormat(); a code point that is not
              // a key of 'h' is never ignored.
01419         static bool IsWbIgnored(const TUniChInfo& ci) { return ci.IsGbExtend() || ci.IsWbFormat(); }
01420         bool IsWbIgnored(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return IsWbIgnored(h[i]); }
01421         // Moves 'position' forward to the first index in 'position..srcEnd-1' whose character is not ignored (or to 'srcEnd' if there is none).
01422         template<typename TSrcVec> void WbFindCurOrNextNonIgnored(const TSrcVec& src, size_t& position, const size_t srcEnd) const {
01423                 for ( ; position < srcEnd; position++) if (! IsWbIgnored(src[TVecIdx(position)])) break; }
01424         // Same search, but starting at 'position + 1'; does nothing when 'position >= srcEnd'.
01425         template<typename TSrcVec> void WbFindNextNonIgnored(const TSrcVec& src, size_t& position, const size_t srcEnd) const {
01426                 if (position < srcEnd) {
01427                         for (position++; position < srcEnd && IsWbIgnored(src[TVecIdx(position)]); ) position++; } }
01428         // Sentence-boundary variant: if the character at 'position' is a Sep (IsSbSep), 'position' advances by exactly one and no ignored characters are skipped; otherwise it behaves like WbFindNextNonIgnored.
01429         template<typename TSrcVec> void WbFindNextNonIgnoredS(const TSrcVec& src, size_t& position, const size_t srcEnd) const {
01430                 if (position >= srcEnd) return;
01431                 const bool atSep = IsSbSep(src[TVecIdx(position)]); position++;
01432                 if (! atSep) while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }
01433         // Moves 'position' back to the largest index in 'srcStart..position-1' whose character is not ignored and returns true.  If there is none it returns false; 'position' then ends at 'srcStart', or is unchanged if it was already <= 'srcStart'.
01434         template<typename TSrcVec> bool WbFindPrevNonIgnored(const TSrcVec& src, const size_t srcStart, size_t& position) const {
01435                 bool found = false;
01436                 while (! found && position > srcStart) {
01437                         --position; found = ! IsWbIgnored(src[TVecIdx(position)]); }
01438                 return found; }
01438                 return false; }
01439         // Test driver for WbFind*NonIgnored.
01440         void TestWbFindNonIgnored(const TIntV& src) const;
01441         void TestWbFindNonIgnored() const;
01442 public:
01443         // Finds the next word boundary strictly after 'position'.
01444         // Note that there is a valid word boundary at 'srcIdx + srcCount'.
01445         // If there is no such word boundary, it returns 'false' and sets 'position' to 'srcIdx + srcCount'.
01446         template<typename TSrcVec>
01447         bool FindNextWordBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const;
01448         // Creates, in 'dest', a vector of 'srcCount + 1' elements, where 'dest[i]' tells if there is a word
01449         // boundary between 'src[srcIdx + i - 1]' and 'src[srcIdx + i]'.  Note that 'dest[0]' and 'dest[srcCount]' are
01450         // always set to 'true'.
01451         template<typename TSrcVec>
01452         void FindWordBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const;
01453 protected:
01454         void TestFindNextWordOrSentenceBoundary(const TStr& basePath, bool sentence);
01455
01456         //-------------------------------------------------------------------------
01457         // Sentence boundaries (UAX #29)
01458         //-------------------------------------------------------------------------
01459
01460 protected:
01461         TUniTrie<TInt> sbExTrie;
01462
01463         // Checks whether a sentence that ended at src[position - 1]
01464         // would end in one of the suffixes from sbExTrie.
01465         template<typename TSrcVec>
01466         bool CanSentenceEndHere(const TSrcVec& src, const size_t srcIdx, const size_t position) const;
01467
01468 public:
01469         // Finds the next sentence boundary strictly after 'position'.
01470         // Note that there is a valid sentence boundary at 'srcIdx + srcCount'.
01471         // If there is no such sentence boundary, it returns 'false' and sets 'position' to 'srcIdx + srcCount'.
01472         template<typename TSrcVec>
01473         bool FindNextSentenceBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const;
01474         // Creates, in 'dest', a vector of 'srcCount + 1' elements, where 'dest[i]' tells if there is a sentence
01475         // boundary between 'src[srcIdx + i - 1]' and 'src[srcIdx + i]'.  Note that 'dest[0]' and 'dest[srcCount]' are
01476         // always set to 'true'.
01477         template<typename TSrcVec>
01478         void FindSentenceBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const;
01479
01480         // These methods allow the user to define a set of sentence boundary exceptions.
01481         // This is a set of strings, stored in 'sbExTrie'.  If the Unicode rules require
01482         // a sentence boundary in a position that would cause the sentence to end with
01483         // 'x (STerm | ATerm) Close* Sp* Sep?', where 'x' is a word from 'sbExTrie',
01484         // we will *not* place a sentence boundary there.
01485         //
01486         // NOTE: sbExTrie is not saved or loaded by the Save() and Load() methods.
01487         // By default, it is empty.  Use SbEx_Clr() to clear it, and SbEx_SetStdEnglish() to obtain
01488         // a standard set of English-language exceptions.
01489         void SbEx_Clr() { sbExTrie.Clr(); }
              // Adds a word given as a vector of code points.
01490         template<class TSrcVec> void SbEx_Add(const TSrcVec& v) { sbExTrie.Add(v); }
01491         // template<> void SbEx_Add(const TStr& s) {
              // Adds a word given as a TStr: every char becomes one code point equal to its value
              // as an unsigned char; no UTF-8 decoding takes place.
01492         void SbEx_Add(const TStr& s) {
01493           TIntV v; int n = s.Len(); v.Gen(n); for (int i = 0; i < n; i++) v[i] = int(uchar(s[i])); SbEx_Add(v); }
              // Adds a word given as UTF-8, decoded with a default-constructed TUniCodec.
01494         void SbEx_AddUtf8(const TStr& s) { TUniCodec codec; TIntV v; codec.DecodeUtf8(s, v); SbEx_Add(v); }
              // Splits 'words' on '|' and adds every piece, as UTF-8 or as raw chars depending on
              // 'wordsAreUtf8'.  Returns the number of pieces.
01495         int SbEx_AddMulti(const TStr& words, const bool wordsAreUtf8 = true) { TStrV vec; words.SplitOnAllCh('|', vec);
01496                 for (int i = 0; i < vec.Len(); i++) if (wordsAreUtf8) SbEx_AddUtf8(vec[i]); else SbEx_Add(vec[i]);
01497                 return vec.Len(); }
              // Replaces the whole exception set with a copy of 'newTrie'.
01498         void SbEx_Set(const TUniTrie<TInt>& newTrie) { sbExTrie = newTrie; }
01499         int SbEx_SetStdEnglish() {
                      // Replaces the current exceptions with a built-in English list and returns the number of
                      // entries.  Entries are '|'-separated and are added as raw chars, not as UTF-8.
                      // Each entry is written WITHOUT its final period, because the terminator is matched
                      // separately (see the 'x (STerm | ATerm) ...' pattern described for sbExTrie); hence
                      // "e. g" rather than "e. g.", in line with "i. e" and "s. v".
01500                 static const TStr data = "Ms|Mrs|Mr|Rev|Dr|Prof|Gov|Sen|Rep|Gen|Brig|Col|Capt|Lieut|Lt|Sgt|Pvt|Cmdr|Adm|Corp|St|Mt|Ft|e.g|e. g|i.e|i. e|ib|ibid|s.v|s. v|s.vv|s. vv";
01501                 SbEx_Clr(); return SbEx_AddMulti(data, false); }
01502
01503         //-------------------------------------------------------------------------
01504         // Normalization, decomposition, etc. (UAX #15)
01505         //-------------------------------------------------------------------------
01506
01507 protected:
01508         // Adds, to 'dest', the decomposition of 'codePoint' (calling itself recursively if necessary).
01509         // If 'compatibility == false', only canonical decompositions are used.
01510         template<typename TDestCh>
01511         void AddDecomposition(const int codePoint, TVec<TDestCh>& dest, const bool compatibility) const;
01512 public:
01513         // This appends, to 'dest', the decomposed form of the source string.
01514         // - for normalization form D (NFD), i.e. canonical decomposition: use compatibility == false;
01515         // - for normalization form KD (NFKD), i.e. compatibility decomposition: use compatibility == true.
01516         template<typename TSrcVec, typename TDestCh>
01517         void Decompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
01518                         TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const;
01519         template<typename TSrcVec, typename TDestCh>
              // Whole-vector overload: forwards with srcIdx = 0 and srcCount = src.Len().
01520         void Decompose(const TSrcVec& src, TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const {
01521                 Decompose(src, 0, src.Len(), dest, compatibility, clrDest); }
01522         // This performs canonical composition on the source string, and appends
01523         // the result to the destination string.  The source string should be the
01524         // result of a (canonical or compatibility) decomposition; if this is the
01525         // case, the composition will lead to a normalization form C (NFC) or
01526         // normalization form KC (NFKC), depending on whether canonical or compatibility
01527         // decomposition was used.
01528         template<typename TSrcVec, typename TDestCh>
01529         void Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
01530                         TVec<TDestCh>& dest, bool clrDest = true) const;
01531         template<typename TSrcVec, typename TDestCh>
              // Whole-vector overload: forwards with srcIdx = 0 and srcCount = src.Len().
01532         void Compose(const TSrcVec& src, TVec<TDestCh>& dest, bool clrDest = true) const {
01533                 Compose(src, 0, src.Len(), dest, clrDest); }
01534         // Calls Decompose, followed by Compose; thus the result is the NFC (if
01535         // compatibility == false) or NFKC (if compatibility == true) of the source string.
01536         // A temporary TIntV is used to contain the intermediate NF(K)D form of the
01537         // source string.
01538         template<typename TSrcVec, typename TDestCh>
01539         void DecomposeAndCompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
01540                         TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const;
01541         template<typename TSrcVec, typename TDestCh>
              // Whole-vector overload: forwards with srcIdx = 0 and srcCount = src.Len().
01542         void DecomposeAndCompose(const TSrcVec& src, TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const {
01543                 DecomposeAndCompose(src, 0, src.Len(), dest, compatibility, clrDest); }
01544         // Copies the starter characters from 'src' to 'dest'; the other
01545         // characters are skipped.  'src' should already have been decomposed.
01546         // Returns the number of characters extracted.
01547         template<typename TSrcVec, typename TDestCh>
01548         size_t ExtractStarters(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
01549                         TVec<TDestCh>& dest, bool clrDest = true) const;
01550         template<typename TSrcVec, typename TDestCh>
              // Whole-vector overload: forwards with srcIdx = 0 and srcCount = src.Len(), and returns
              // whatever the range overload returns.
01551         size_t ExtractStarters(const TSrcVec& src, TVec<TDestCh>& dest, bool clrDest = true) const {
01552                 return ExtractStarters(src, 0, src.Len(), dest, clrDest); }
01553         // In-place variant: the starters are first collected in a temporary TIntV, then 'src' is cleared and refilled from it.  Returns the count reported by the two-vector overload.
01554         template<typename TSrcVec>
01555         size_t ExtractStarters(TSrcVec& src) const {
01556                 TIntV starters; const size_t nStarters = ExtractStarters(src, starters);
01557                 src.Clr(); int pos = 0; while (pos < starters.Len()) { src.Add(starters[pos]); pos++; }
01558                 return nStarters; }
01559
01560 protected:
01561         void TestComposition(const TStr& basePath);
01562
01563         //-------------------------------------------------------------------------
01564         // Initialization from the text files
01565         //-------------------------------------------------------------------------
01566
01567 protected:
01568         void InitWordAndSentenceBoundaryFlags(const TStr& basePath);
01569         void InitScripts(const TStr& basePath);
01570         void InitLineBreaks(const TStr& basePath);
01571         void InitDerivedCoreProperties(const TStr& basePath);
01572         void InitPropList(const TStr& basePath);
01573         void InitSpecialCasing(const TStr& basePath);
01574         void LoadTxt_ProcessDecomposition(TUniChInfo& ci, TStr s);
01575 public:
01576         void LoadTxt(const TStr& basePath);
01577         void SaveBin(const TStr& fnBinUcd);
01578
01579         //-------------------------------------------------------------------------
01580         // Case conversions
01581         //-------------------------------------------------------------------------
01582
01583 public:
01584         typedef enum TCaseConversion_ { ccLower = 0, ccUpper = 1, ccTitle = 2, ccMax = 3 } TCaseConversion;
01585         // Appends the case-converted form of 'src' to 'dest'.
01586         // 'how' defines what kind of case conversion is required.
01587         // 'turkic' should be set to true iff the text is in Turkish ('tr') or Azeri ('az').
01588         // 'lithuanian' should be set to true iff the text is in Lithuanian ('lt').
01589         template<typename TSrcVec, typename TDestCh> void GetCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const;
01590         template<typename TSrcVec, typename TDestCh> void GetLowerCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower, turkic, lithuanian); }
01591         template<typename TSrcVec, typename TDestCh> void GetUpperCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper, turkic, lithuanian); }
01592         template<typename TSrcVec, typename TDestCh> void GetTitleCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle, turkic, lithuanian); }
01593         template<typename TSrcVec, typename TDestCh> void GetLowerCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetLowerCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
01594         template<typename TSrcVec, typename TDestCh> void GetUpperCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetUpperCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
01595         template<typename TSrcVec, typename TDestCh> void GetTitleCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetTitleCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
01596
01597         // GetSimpleCaseConverted uses only the simple case mappings (from UnicodeData.txt).
01598         // This is simpler and faster.  Since each character now maps into exactly one
01599         // character, case conversion can also be done in place (see ToSimpleCaseConverted, etc.).
01600         template<typename TSrcVec, typename TDestCh> void GetSimpleCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how) const;
01601         template<typename TSrcVec, typename TDestCh> void GetSimpleLowerCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower); }
01602         template<typename TSrcVec, typename TDestCh> void GetSimpleUpperCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper); }
01603         template<typename TSrcVec, typename TDestCh> void GetSimpleTitleCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle); }
01604         template<typename TSrcVec, typename TDestCh> void GetSimpleLowerCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleLowerCase(src, 0, src.Len(), dest, clrDest); }
01605         template<typename TSrcVec, typename TDestCh> void GetSimpleUpperCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleUpperCase(src, 0, src.Len(), dest, clrDest); }
01606         template<typename TSrcVec, typename TDestCh> void GetSimpleTitleCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleTitleCase(src, 0, src.Len(), dest, clrDest); }
01607
01608         template<typename TSrcVec> void ToSimpleCaseConverted(TSrcVec& src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const;
01609         template<typename TSrcVec> void ToSimpleUpperCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccUpper); }
01610         template<typename TSrcVec> void ToSimpleLowerCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccLower); }
01611         template<typename TSrcVec> void ToSimpleTitleCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccTitle); }
01612         template<typename TSrcVec> void ToSimpleUpperCase(TSrcVec& src) const { ToSimpleUpperCase(src, 0, src.Len()); }
01613         template<typename TSrcVec> void ToSimpleLowerCase(TSrcVec& src) const { ToSimpleLowerCase(src, 0, src.Len()); }
01614         template<typename TSrcVec> void ToSimpleTitleCase(TSrcVec& src) const { ToSimpleTitleCase(src, 0, src.Len()); }
01615
01616 public:
01617         friend class TUniCaseFolding;
01618
01619         // Case folding is an alternative to the above functions.  It is intended primarily
01620         // to produce strings that are suitable for comparisons.  For example,
01621         // ToLowerCase(sigma) = sigma, ToLowerCase(final-sigma) = final-sigma;
01622         // but ToCaseFolded(sigma) = sigma, ToCaseFolded(final-sigma) = sigma.
01623         // - 'turkic' enables special processing for Turkic languages (I-dot and i-dotless).
01624         // - 'full' enables full case mappings -- i.e. sometimes a character may be mapped
01625         //   into a string of two or more characters.
01626         // - Note: For best results, perform NFD(CaseFold(NFD(x))) or NFKD(CaseFold(NFKD(x))) on
01627         //   each string before comparing them (see sec. 3.13 of the standard).
01628         template<typename TSrcVec, typename TDestCh>
01629         void GetCaseFolded(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
01630                 TVec<TDestCh>& dest, const bool clrDest, const bool full, const bool turkic = false) const { caseFolding.Fold(src, srcIdx, srcCount, dest, clrDest, full, turkic); }
01631         template<typename TSrcVec, typename TDestCh>
01632         void GetCaseFolded(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool full = true, const bool turkic = false) const {
01633                 GetCaseFolded(src, 0, src.Len(), dest, clrDest, full, turkic); }
01634         // ToCaseFolded folds the string in place.  However, this means that only the simple
01635         // case foldings can be used (the full ones could increase the length of the string).
01636         template<typename TSrcVec> void ToCaseFolded(TSrcVec& src, size_t srcIdx, const size_t srcCount, const bool turkic = false) const { caseFolding.FoldInPlace(src, srcIdx, srcCount, turkic); }
01637         template<typename TSrcVec> void ToCaseFolded(TSrcVec& src, const bool turkic = false) const { ToCaseFolded(src, 0, src.Len(), turkic); }
01638
01639 protected:
01640         void TestCaseConversion(const TStr& source, const TStr& trueLc, const TStr& trueTc, const TStr& trueUc, bool turkic, bool lithuanian);
01641         void TestCaseConversions();
01642
01643         //-------------------------------------------------------------------------
01644         // Text file reader for the Unicode character database
01645         //-------------------------------------------------------------------------
01646
01647 protected:
01648
01649         class TUcdFileReader // line-oriented reader for the ';'-separated text files of the character database
01650         {
01651         protected:
01652                 TChA buf; // the part of the current line that precedes any '#'
01653         public:
01654                 TChA comment; // contains '#' and everything after it
01655         protected:
01656                 FILE *f;
01657                 int putBackCh; // one-character push-back slot; EOF means that it is empty
01658                 int GetCh() { // returns the pushed-back character if there is one, otherwise the next character from 'f'
01659                         if (putBackCh >= 0) { int c = putBackCh; putBackCh = EOF; return c; }
01660                         return fgetc(f); }
01661                 void PutBack(int c) { Assert(putBackCh == EOF); putBackCh = c; }
01662                 // Returns 'false' iff the EOF was encountered before anything was read.
01663                 bool ReadNextLine() {
01664                         buf.Clr(); comment.Clr();
01665                         bool inComment = false, first = true;
01666                         while (true) {
01667                                 int c = GetCh();
01668                                 if (c == EOF) return ! first; // a final line without a terminator is still reported
01669                                 else if (c == 13) { // CR, optionally followed by LF
01670                                         c = GetCh(); if (c != 10) PutBack(c);
01671                                         return true; }
01672                                 else if (c == 10) return true;
01673                                 else if (c == '#') inComment = true;
01674                                 if (! inComment) buf += char(c);
01675                                 else comment += char(c);
01676                                 first = false; }} // at least one character of this line has been consumed
01677         private:
01678                 TUcdFileReader& operator = (const TUcdFileReader& r) { Fail; return *this; } // copying is not supported
01679                 TUcdFileReader(const TUcdFileReader& r) : f(0), putBackCh(EOF) { Fail; } // copying is not supported
01680         public:
01681                 TUcdFileReader() : f(0), putBackCh(EOF) { }
01682                 TUcdFileReader(const TStr& fileName) : f(0), putBackCh(EOF) { Open(fileName); }
01683                 void Open(const TStr& fileName) { Close(); f = fopen(fileName.CStr(), "rt"); IAssertR(f, fileName); putBackCh = EOF; }
01684                 void Close() { putBackCh = EOF; if (f) { fclose(f); f = 0; }}
01685                 ~TUcdFileReader() { Close(); }
01686                 bool GetNextLine(TStrV& dest) { // splits the next usable line on ';' into 'dest'; returns false once ReadNextLine reports the end of the file
01687                         dest.Clr();
01688                         while (true) {
01689                                 if (! ReadNextLine()) return false;
01690                                 TStr line = buf; line.ToTrunc();
01691                                 if (line.Len() <= 0) continue; // nothing left besides the comment: try the next line
01692                                 line.SplitOnAllCh(';', dest, false);
01693                                 for (int i = 0; i < dest.Len(); i++) dest[i].ToTrunc();
01694                                 return true; }}
01695                 static int ParseCodePoint(const TStr& s) { // 's' must be a hexadecimal integer from the range 0..0x10ffff
01696                         int c; bool ok = s.IsHexInt(true, 0, 0x10ffff, c); IAssertR(ok, s); return c; }
01697                 static void ParseCodePointList(const TStr& s, TIntV& dest, bool ClrDestP = true) { // space-separated list
01698                         if (ClrDestP) dest.Clr();
01699                         TStrV parts; s.SplitOnWs(parts);
01700                         for (int i = 0; i < parts.Len(); i++) {
01701                                 int c; bool ok = parts[i].IsHexInt(true, 0, 0x10ffff, c); IAssertR(ok, s);
01702                                 dest.Add(c); } }
01703                 static void ParseCodePointRange(const TStr& s, int& from, int &to) { // xxxx or xxxx..yyyy
01704                         int i = s.SearchStr(".."); if (i < 0) { from = ParseCodePoint(s); to = from; return; }
01705                         from = ParseCodePoint(s.GetSubStr(0, i - 1));
01706                         to = ParseCodePoint(s.GetSubStr(i + 2, s.Len() - 1)); }
01707         };
01708
01709         //-------------------------------------------------------------------------
01710         // Helper class for processing the text files
01711         //-------------------------------------------------------------------------
01712         // Files such as DerivedCoreProperties.txt often refer to ranges of codepoints,
01713         // and not all codepoints from the range have also been listed in
01714         // UnicodeData.txt.  Thus, new TUniChInfo instances will be created
01715         // when processing DerivedCoreProperties.txt and similar files.
01716         // To assign the correct (sub)categories to these new codepoints,
01717         // the following class will extract the subcategory info from the
01718         // comments in DerivedCoreProperties.txt and similar files.
01719
01720         class TSubcatHelper
01721         {
01722         public:
01723                 bool hasCat; TUniChSubCategory subCat; // result of the most recent ProcessComment call; 'subCat' is meaningful only if 'hasCat'
01724                 TStrH invalidCatCodes; // unrecognized two-character codes seen in comments; reported by the destructor
01725                 TUniChDb &owner;
01726
01727                 TSubcatHelper(TUniChDb &owner_) : hasCat(false), subCat(ucOtherNotAssigned), owner(owner_) { }
01728
01729                 void ProcessComment(TUniChDb::TUcdFileReader &reader) // expects a comment of the form "# Xy ...", where Xy is a subcategory code
01730                 {
01731                         hasCat = false; subCat = ucOtherNotAssigned;
01732                         if (reader.comment.Len() > 3)
01733                         {
01734                                 IAssert(reader.comment[0] == '#');
01735                                 IAssert(reader.comment[1] == ' ');
01736                                 char chCat = reader.comment[2], chSubCat = reader.comment[3];
01737                                 if (reader.comment.Len() > 4) IAssert(isspace(uchar(reader.comment[4])));
01738                                 if (TUniChInfo::IsValidSubCat(chCat, chSubCat)) {
01739                                         hasCat = true; subCat = (TUniChSubCategory) ((int(uchar(chCat)) << 8) | (int(uchar(chSubCat)))); } // category letter in the high byte, subcategory letter in the low byte
01740                                 else invalidCatCodes.AddKey(TStr(chCat) + TStr(chSubCat));
01741                         }
01742                 }
01743
01744                 void SetCat(const int cp) { // assigns the current subcategory to 'cp', which must be known to 'owner' and still unassigned
01745                         int i = owner.h.GetKeyId(cp); IAssert(i >= 0);
01746                         IAssert(owner.h[i].subCat == ucOtherNotAssigned);
01747                         IAssert(hasCat);
01748                         owner.h[i].SetCatAndSubCat(subCat); }
01749                 void TestCat(const int cp) { // asserts that 'cp' already has the current subcategory; does nothing if none was parsed
01750                         if (! hasCat) return;
01751                         int i = owner.h.GetKeyId(cp); IAssert(i >= 0);
01752                         IAssert(owner.h[i].subCat == subCat); }
01753
01754                 ~TSubcatHelper()
01755                 {
01756                         if (invalidCatCodes.IsKey("L&")) invalidCatCodes.DelKey("L&"); // "L&" is dropped here, so it never appears in the report below
01757                         // Output any unexpected ones (there shouldn't be any).
01758                         if (! invalidCatCodes.Empty()) {
01759                                 printf("Invalid cat code(s) in the comments: ");
01760                                 for (int i = invalidCatCodes.FFirstKeyId(); invalidCatCodes.FNextKeyId(i); )
01761                                         printf(" \"%s\"", invalidCatCodes.GetKey(i).CStr());
01762                                 printf("\n"); }
01763                 }
01764         };
01765 };
01766
01767 //-----------------------------------------------------------------------------
01768 // TUnicode -- a sadly emasculated wrapper around TUniCodec and TUniChDb
01769 //-----------------------------------------------------------------------------
01770
01771 class TUnicode
01772 {
01773 public:
01774         TUniCodec codec;
01775         TUniChDb ucd;
01776
01777         TUnicode() { Init(); }
01778         explicit TUnicode(const TStr& fnBinUcd) { ucd.LoadBin(fnBinUcd); Init(); }
01779         void Init() { InitCodecs(); }
01780
01781         //-----------------------------------------------------------------------
01782         // UTF-8
01783         //-----------------------------------------------------------------------
01784
01785         // Returns the number of characters that have been successfully decoded.
01786         // This does not include any replacement characters that may have been inserted into 'dest'.
01787         int DecodeUtf8(const TIntV& src, TIntV& dest) const { return (int) codec.DecodeUtf8(src, dest); }
01788         int DecodeUtf8(const TStr& src, TIntV& dest) const { return (int) codec.DecodeUtf8(src, dest); }
01789
01790         // Returns the number of characters that have been successfully encoded.
01791         // This does not include any replacement characters that may have been inserted into 'dest'.
01792         int EncodeUtf8(const TIntV& src, TIntV& dest) const { return (int) codec.EncodeUtf8(src, dest); }
01793
01794         // The following wrapper around the UTF-8 encoder returns a TStr containing
01795         // the UTF-8-encoded version of the input string.
01796         TStr EncodeUtf8Str(const TIntV& src) const { return codec.EncodeUtf8Str(src); }
01797
01798         // encoding one character to UTF8
01799         static void EncodeUtf8(const uint& Ch, TChA& Dest);
01800         static TStr EncodeUtf8(const uint& Ch);
01801
01802         //-----------------------------------------------------------------------
01803         // UTF-16 Decoder
01804         //-----------------------------------------------------------------------
01805
01806         // Returns the number of characters that have been successfully decoded.
01807         // This does not include any replacement characters that may have been inserted into 'dest'.
01808         // Each element of 'src' is assumed to contain one byte of data.
01809         // srcCount must be even (though srcIdx doesn't need to be).
01810         int DecodeUtf16FromBytes(const TIntV& src, TIntV& dest,
01811                 const TUtf16BomHandling bomHandling = bomAllowed,
01812                 const TUniByteOrder defaultByteOrder = boMachineEndian) const {
01813                         return (int) codec.DecodeUtf16FromBytes(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }
01814
01815         // Here, each element of 'src' is treated as a 16-bit word.  The byte-order settings
01816         // are used to determine if the two bytes of each word should be swapped before further
01817         // processing.  For example, if a BOM is present, it must have the value 0xfeff; if it
01818         // actually has the value 0xfffe, this means that the two bytes of each word must be swapped.
01819         // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the
01820         // beginning of the source data is used to determine the "original" byte order of the data;
01821         // if this doesn't match the byte order of the local machine, the two bytes of each word will
01822         // be swapped during the decoding process.
01823         int DecodeUtf16FromWords(const TIntV& src, TIntV& dest,
01824                 const TUtf16BomHandling bomHandling = bomAllowed,
01825                 const TUniByteOrder defaultByteOrder = boMachineEndian) const {
01826                         return (int) codec.DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }
01827
01828         //-----------------------------------------------------------------------
01829         // UTF-16 Encoder
01830         //-----------------------------------------------------------------------
01831
01832         // Returns the number of characters that have been successfully encoded.
01833         // This does not include any replacement characters that may have been inserted into 'dest'.
01834         int EncodeUtf16ToWords(const TIntV& src, TIntV& dest, const bool insertBom,
01835                 const TUniByteOrder destByteOrder = boMachineEndian) const {
01836                         return (int) codec.EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }
01837
01838         int EncodeUtf16ToBytes(const TIntV& src, TIntV& dest, const bool insertBom,
01839                 const TUniByteOrder destByteOrder = boMachineEndian) const {
01840                         return (int) codec.EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }
01841
01842         //-----------------------------------------------------------------------
01843         // 8-bit codecs
01844         //-----------------------------------------------------------------------
01845
01846         T8BitCodec<TEncoding_ISO8859_1> iso8859_1;
01847         T8BitCodec<TEncoding_ISO8859_2> iso8859_2;
01848         T8BitCodec<TEncoding_ISO8859_3> iso8859_3;
01849         T8BitCodec<TEncoding_ISO8859_4> iso8859_4;
01850         T8BitCodec<TEncoding_YuAscii> yuAscii;
01851         T8BitCodec<TEncoding_CP1250> cp1250;
01852         T8BitCodec<TEncoding_CP852> cp852;
01853         T8BitCodec<TEncoding_CP437> cp437;
01854
01855         //-----------------------------------------------------------------------
01856         // Codec registry
01857         //-----------------------------------------------------------------------
01858         // If you know you'll need ISO-8859-2, just use
01859         //   TUnicode unicode;
01860         //   unicode.iso8859_2.Encode(...);
01861         // If you don't know what you'll need, use:
01862         //   TUnicode unicode;
01863         //   PCodecBase myCodec = unicode.GetCodec(myCodecName);
01864         //   myCodec->Encode(...);
01865         // Note that the first approach is slightly more efficient because there
01866         // aren't any virtual method calls involved.
01867
01868 protected:
01869         THash<TStr, PCodecBase> codecs;
01870         static inline TStr NormalizeCodecName(const TStr& name) { // lower-cases the name and drops every '_' and '-'
01871                 TStr normalized(name.GetLc()); normalized.ChangeStrAll("-", ""); normalized.ChangeStrAll("_", ""); return normalized; }
01872 public:
01873         void RegisterCodec(const TStr& nameList, const PCodecBase& codec) { // 'nameList' holds whitespace-separated names for 'codec'
01874                 TStrV aliases; nameList.SplitOnWs(aliases);
01875                 for (int aliasIdx = 0; aliasIdx < aliases.Len(); ++aliasIdx) {
01876                         const TStr key = NormalizeCodecName(aliases[aliasIdx]); codecs.AddDat(key, codec); } }
01877         void UnregisterCodec(const TStr& nameList) { // 'nameList' holds whitespace-separated names; names that are not registered are skipped
01878                 TStrV names; nameList.SplitOnWs(names);
01879                 for (int i = 0; i < names.Len(); i++) {
01880                         const TStr key = NormalizeCodecName(names[i]); if (codecs.IsKey(key)) codecs.DelKey(key); } }
01881         void ClrCodecs() { codecs.Clr(); }
01882         void InitCodecs();
01883         PCodecBase GetCodec(const TStr& name) const { // looks up the codec stored under the normalized form of 'name'
01884                 const TStr key = NormalizeCodecName(name);
01885                 PCodecBase found; const bool known = codecs.IsKeyGetDat(key, found);
01886                 if (! known) found.Clr(); return found; } // a cleared pointer is returned for unknown names
01887         void GetAllCodecs(TCodecBaseV& dest) const { // stores each distinct registered codec object in 'dest' exactly once
01888                 dest.Clr();
01889                 for (int keyId = codecs.FFirstKeyId(); codecs.FNextKeyId(keyId); ) {
01890                         PCodecBase candidate = codecs[keyId]; int j = 0;
01891                         while (j < dest.Len() && dest[j]() != candidate()) j++; // linear search for the same underlying object
01892                         if (j == dest.Len()) dest.Add(candidate); }}
01893
01894         //-------------------------------------------------------------------------
01895         // Word boundaries (UAX #29)
01896         //-------------------------------------------------------------------------
01897
01898         // Finds the next word boundary strictly after 'position'.
01899         // Note that there are valid word boundaries at 0 and at 'src.Len()'.
01900         // If there is no such word boundary, it returns 'false' and sets 'position' to 'src.Len()'.
01901         bool FindNextWordBoundary(const TIntV& src, int &position) const {
01902                 if (position < 0) { position = 0; return true; }
01903                 size_t position_ = size_t(position); bool retVal = ucd.FindNextWordBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; } // 'position_' hands the caller's (non-negative) start position to the search
01904         // Creates, in 'dest', a vector of 'src.Len() + 1' elements, where 'dest[i]' tells if there is a word
01905         // boundary between 'src[i - 1]' and 'src[i]'.  Note that 'dest[0]' and 'dest[src.Len()]' are
01906         // always set to 'true'.
01907         void FindWordBoundaries(const TIntV& src, TBoolV& dest) const { ucd.FindWordBoundaries(src, 0, src.Len(), dest); }
01908
01909         //-------------------------------------------------------------------------
01910         // Sentence boundaries (UAX #29)
01911         //-------------------------------------------------------------------------
01912
01913         // Finds the next sentence boundary strictly after 'position'.
01914         // Note that there are valid sentence boundaries at 0 and at 'src.Len()'.
01915         // If there is no such sentence boundary, it returns 'false' and sets 'position' to 'src.Len()'.
01916         bool FindNextSentenceBoundary(const TIntV& src, int &position) const {
01917                 if (position < 0) { position = 0; return true; }
01918                 size_t position_ = size_t(position); bool retVal = ucd.FindNextSentenceBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; } // 'position_' hands the caller's (non-negative) start position to the search
01919         // Creates, in 'dest', a vector of 'src.Len() + 1' elements, where 'dest[i]' tells if there is a sentence
01920         // boundary between 'src[i - 1]' and 'src[i]'.  Note that 'dest[0]' and 'dest[src.Len()]' are
01921         // always set to 'true'.
01922         void FindSentenceBoundaries(const TIntV& src, TBoolV& dest) const { ucd.FindSentenceBoundaries(src, 0, src.Len(), dest); }
01923
01924         void ClrSentenceBoundaryExceptions() { ucd.SbEx_Clr(); }
01925         void UseEnglishSentenceBoundaryExceptions() { ucd.SbEx_SetStdEnglish(); }
01926
01927         //-------------------------------------------------------------------------
01928         // Normalization, decomposition, etc. (UAX #15)
01929         //-------------------------------------------------------------------------
01930
01931         // This sets 'dest' to the decomposed form of the source string.
01932         // - for normalization form D (NFD), i.e. canonical decomposition: use compatibility == false;
01933         // - for normalization form KD (NFKD), i.e. compatibility decomposition: use compatibility == true.
01934         void Decompose(const TIntV& src, TIntV& dest, bool compatibility) const { ucd.Decompose(src, dest, compatibility, true); }
01935         // This performs canonical composition on the source string, and stores
01936         // the result in the destination vector.  The source string should be the
01937         // result of a (canonical or compatibility) decomposition; if this is the
01938         // case, the composition will lead to a normalization form C (NFC) or
01939         // normalization form KC (NFKC), depending on whether canonical or compatibility
01940         // decomposition was used.
01941         void Compose(const TIntV& src, TIntV& dest) const { return ucd.Compose(src, dest, true); }
01942         // Calls Decompose, followed by Compose; thus the result is the NFC (if
01943         // compatibility == false) or NFKC (if compatibility == true) of the source string.
01944         // A temporary TIntV is used to contain the intermediate NF(K)D form of the
01945         // source string.
01946         void DecomposeAndCompose(const TIntV& src, TIntV& dest, bool compatibility) const { return ucd.DecomposeAndCompose(src, dest, compatibility); }
01947         // Copies the starter characters from 'src' to 'dest'; the other
01948         // characters are skipped.  'src' should already have been decomposed.
01949         // Returns the number of characters extracted.  This function can be
01950         // used to remove diacritical marks from a string (after it has been decomposed!).
01951         int ExtractStarters(const TIntV& src, TIntV& dest) const { return (int) ucd.ExtractStarters(src, dest); }
01952         // Extracts the starters into a temporary vector and then copies it into 'src'.
01953         int ExtractStarters(TIntV& src) const { return (int) ucd.ExtractStarters(src); }
01954
01955         //-------------------------------------------------------------------------
01956         // Case conversions
01957         //-------------------------------------------------------------------------
01958         // NOTE: if you will be dealing with Turkish, Azeri or Lithuanian text,
01959         // use the case-conversion methods in TUniChDb, which allow the caller
01960         // to request language-specific case mappings for these languages.
01961
01962 public:
01963         typedef TUniChDb::TCaseConversion TCaseConversion;
01964         // Sets 'dest' to the case-converted form of 'src'.
01965         void GetLowerCase(const TIntV& src, TIntV& dest) const { ucd.GetLowerCase(src, dest, true, false, false); }
01966         void GetUpperCase(const TIntV& src, TIntV& dest) const { ucd.GetUpperCase(src, dest, true, false, false); }
01967         void GetTitleCase(const TIntV& src, TIntV& dest) const { ucd.GetTitleCase(src, dest, true, false, false); }
01968
01969         // GetSimpleCaseConverted uses only the simple case mappings (from UnicodeData.txt).
01970         // This is simpler and faster.  Since each character now maps into exactly one
01971         // character, case conversion can also be done in place (see ToSimpleCaseConverted, etc.).
01972         void GetSimpleLowerCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleLowerCase(src, dest, true); }
01973         void GetSimpleUpperCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleUpperCase(src, dest, true); }
01974         void GetSimpleTitleCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleTitleCase(src, dest, true); }
01975
01976         // These functions perform simple case-conversions in-place.
01977         void ToSimpleUpperCase(TIntV& src) const { ucd.ToSimpleUpperCase(src); }
01978         void ToSimpleLowerCase(TIntV& src) const { ucd.ToSimpleLowerCase(src); }
01979         void ToSimpleTitleCase(TIntV& src) const { ucd.ToSimpleTitleCase(src); }
01980
01981         // Case folding is an alternative to the above functions.  It is intended primarily
01982         // to produce strings that are suitable for comparisons.  For example,
01983         // ToLowerCase(sigma) = sigma, ToLowerCase(final-sigma) = final-sigma;
01984         // but ToCaseFolded(sigma) = sigma, ToCaseFolded(final-sigma) = sigma.
01985         // - 'full' enables full case mappings -- i.e. sometimes a character may be mapped
01986         //   into a string of two or more characters.
01987         // - Note: For best results, perform NFD(CaseFold(NFD(x))) or NFKD(CaseFold(NFKD(x))) on
01988         //   each string before comparing them (see sec. 3.13 of the standard).
01989         void GetCaseFolded(const TIntV& src, TIntV& dest, const bool full = true) const { return ucd.GetCaseFolded(src, dest, true, full, false); }
01990         // ToCaseFolded folds the string in place.  However, this means that only the simple
01991         // case foldings can be used (the full ones could increase the length of the string).
01992         void ToCaseFolded(TIntV& src) const { return ucd.ToCaseFolded(src, false); }
01993
01994         TStr GetUtf8CaseFolded(const TStr& s) const {
01995                 bool isAscii = true;
01996                 for (int i = 0, n = s.Len(); i < n; i++) if (uchar(s[i]) >= 128) { isAscii = false; break; }
01997                 if (isAscii) return s.GetLc();
01998                 TIntV src; DecodeUtf8(s, src);
01999                 TIntV dest; GetCaseFolded(src, dest);
02000                 return EncodeUtf8Str(dest); }
02001
02002         //-------------------------------------------------------------------------
02003         // Character properties
02004         //-------------------------------------------------------------------------
02005         // These methods simply call the corresponding TUniChDb method
02006         // (which typically calls the corresponding method of TUniChInfo).
02007         // See the declaration for DECLARE_FORWARDED_PROPERTY_METHODS for a complete list.
02008         // They are all of the form        bool IsXxxx(const int cp) const
02009         // Some of the more notable ones include:
02010         // - IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsAsciiHexDigit
02011         //   IsDash, IsDeprecated, IsDiacritic, IsHexDigit, IsHyphen, IsIdeographic
02012         //   IsNoncharacter, IsQuotationMark, IsSoftDotted, IsTerminalPunctuation, IsWhiteSpace
02013
02014 #define ___UniFwd1(name) bool name(const int cp) const { return ucd.name(cp); }
02015         DECLARE_FORWARDED_PROPERTY_METHODS
02016 #undef DECLARE_FORWARDED_PROPERTY_METHODS
02017 #undef __UniFwd1
02018         ___UniFwd2(IsPrivateUse, IsSurrogate)
02019
02020         TUniChCategory GetCat(const int cp) const { return ucd.GetCat(cp); }
02021         TUniChSubCategory GetSubCat(const int cp) const { return ucd.GetSubCat(cp); }
02022
02023         // GetCharName returns 0 if the name is unknown; GetCharNameS returns a string of the form "U+1234".
02024         const char *GetCharName(const int cp) const { return ucd.GetCharName(cp); }
02025         TStr GetCharNameS(const int cp) const { return ucd.GetCharNameS(cp); }
02026
02027 };
02028
02029 //-----------------------------------------------------------------------------
02030 // TUniCodec -- UTF-8 Decoder
02031 //-----------------------------------------------------------------------------
02032
02033 // Returns the number of characters that have been successfully decoded.
02034 // This does not include any replacement characters that may have been inserted into 'dest'.
02035 template<typename TSrcVec, typename TDestCh>
02036 size_t TUniCodec::DecodeUtf8(
02037         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02038         TVec<TDestCh>& dest, const bool clrDest) const
02039 {
02040         size_t nDecoded = 0;
02041         if (clrDest) dest.Clr();
02042         const size_t origSrcIdx = srcIdx;
02043         const size_t srcEnd = srcIdx + srcCount;
02044         while (srcIdx < srcEnd)
02045         {
02046                 const size_t charSrcIdx = srcIdx;
02047                 uint c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;
02048                 if ((c & _1000_0000) == 0) {
02049                         // c is one of the characters 0..0x7f, encoded as a single byte.
02050                         dest.Add(TDestCh(c)); nDecoded++; continue; }
02051                 else if ((c & _1100_0000) == _1000_0000) {
02052                         // No character in a valid UTF-8-encoded string should begin with a byte of the form 10xxxxxx.
02053                         // We must have been thrown into the middle of a multi-byte character.
02054                         switch (errorHandling) {
02055                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 10xxxxxx.");
02056                         case uehAbort: return nDecoded;
02057                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02058                         case uehIgnore: continue;
02059                         default: Fail; } }
02060                 else
02061                 {
02062                         // c introduces a sequence of 2..6 bytes, depending on how many
02063                         // of the most significant bits of c are set.
02064                         uint nMoreBytes = 0, nBits = 0, minVal = 0;
02065                         if ((c & _1110_0000) == _1100_0000) nMoreBytes = 1, nBits = 5, minVal = 0x80;
02066                         else if ((c & _1111_0000) == _1110_0000) nMoreBytes = 2, nBits = 4, minVal = 0x800;
02067                         else if ((c & _1111_1000) == _1111_0000) nMoreBytes = 3, nBits = 3, minVal = 0x10000;
02068                         else if ((c & _1111_1100) == _1111_1000) nMoreBytes = 4, nBits = 2, minVal = 0x200000;
02069                         else if ((c & _1111_1110) == _1111_1100) nMoreBytes = 5, nBits = 1, minVal = 0x4000000;
02070                         else {
02071                                 // c is of the form 1111111x, which is invalid even in the early definitions of UTF-8
02072                                 // (which allowed the encoding of codepoints up to 2^31 - 1).  However, in principle this
02073                                 // could be used to encode 32-bit integers with the msb set: 1aaabbbbccccddddeeeeffffgggghhhh
02074                                 // could be encoded as 1111111a 10aabbbb 10ccccdd 10ddeeee 10ffffgg 10gghhhh.
02075                                 if (strict)  {
02076                                         switch (errorHandling) {
02077                                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 1111111x.");
02078                                         case uehAbort: return nDecoded;
02079                                         // In the case of uehReplace and uehIgnore, we'll read the next 5 bytes
02080                                         // and try to decode the character.  Then, since 'strict' is true and
02081                                         // the codepoint is clearly >= 2^31, we'll notice this as an error later
02082                                         // and (in the case of uehReplace) insert a replacement character then.
02083                                         // This is probably better than inserting a replacement character right
02084                                         // away and then trying to read the next byte as if a new character
02085                                         // was beginning there -- if the current byte is really followed by five
02086                                         // 10xxxxxx bytes, we'll just get six replacement characters in a row.
02087                                         case uehReplace: break; //dest.Add(TDestCh(replacementChar)); continue;
02088                                         case uehIgnore: break; // continue;
02089                                         default: Fail; } }
02090                                 nMoreBytes = 5; nBits = 2; minVal = 0x80000000u; }
02091                         // Decode this multi-byte sequence.
02092                         uint cOut = c & ((1 << nBits) - 1); // First extract the nBits least significant bits from c.
02093                         bool cancel = false;
02094                         for (uint i = 0; i < nMoreBytes && ! cancel; i++) {
02095                                 // See if there are enough bytes left in the source vector.
02096                                 if (! (srcIdx < srcEnd)) {
02097                                         switch (errorHandling) {
02098                                         case uehThrow: throw TUnicodeException(charSrcIdx, c, TInt::GetStr(nMoreBytes) + " more bytes expected, only " + TInt::GetStr(int(srcEnd - charSrcIdx - 1)) + " available.");
02099                                         case uehAbort: return nDecoded;
02100                                         case uehReplace: dest.Add(TDestCh(replacementChar)); cancel = true; continue;
02101                                         case uehIgnore: cancel = true; continue;
02102                                         default: Fail; } }
02103                                 // Read the next byte.
02104                                 c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;
02105                                 if ((c & _1100_0000) != _1000_0000) { // Each subsequent byte should be of the form 10xxxxxx.
02106                                         switch (errorHandling) {
02107                                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "Byte " + TInt::GetStr(i) + " of " + TInt::GetStr(nMoreBytes) + " extra bytes should begin with 10xxxxxx.");
02108                                         case uehAbort: return nDecoded;
02109                                         case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx--; cancel = true; continue;
02110                                         case uehIgnore: srcIdx--; cancel = true; continue;
02111                                         default: Fail; } }
02112                                 cOut <<= 6; cOut |= (c & _0011_1111); }
02113                         if (cancel) continue;
02114                         if (strict) {
02115                                 // err1: This codepoint has been represented by more bytes than it should have been.
02116                                 // For example, cOut in the range 0..127 should be represented by a single byte,
02117                                 // not by two or more bytes.
02118                                 // - For example, this may happen in the "modified UTF-8" sometimes used for Java
02119                                 // serialization, where the codepoint 0 is encoded as 11000000 10000000 to avoid
02120                                 // the appearance of null bytes in the encoded stream.
02121                                 bool err1 = (cOut < minVal);
02122                                 // err2: Early definitions of UTF-8 allowed any 31-bit integer to be encoded, using up to 6 bytes.
02123                                 // However, later this was restricted to the codepoints 0..0x10ffff only, because only these
02124                                 // are valid Unicode codepoints.  Thus, no more than 4 bytes are ever necessary.
02125                                 bool err2 = (nMoreBytes > 3 || (nMoreBytes == 3 && cOut > 0x10ffff));
02126                                 if (err1 || err2) switch (errorHandling) {
02127                                         case uehThrow:
02128                                                 if (err1) throw TUnicodeException(charSrcIdx, c, "The codepoint 0x" + TInt::GetStr(cOut, "%08x") + " has been represented by too many bytes (" + TInt::GetStr(nMoreBytes + 1) + ").");
02129                                                 else if (err2) throw TUnicodeException(charSrcIdx, c, "Invalid multibyte sequence: it decodes into 0x" + TInt::GetStr(cOut, "%08x") + ", but only codepoints 0..0x10ffff are valid.");
02130                                                 else { Fail; break; }
02131                                         case uehAbort: return nDecoded;
02132                                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02133                                         case uehIgnore: continue;
02134                                         default: Fail; } }
02135                         // Add the decoded codepoint to the destination vector.
02136                         // If this is the first decoded character, and it's one of the byte-order marks
02137                         // (0xfffe and 0xfeff), we will skip it (unless skipBom is false).
02138                         if (! (skipBom && (cOut == 0xfffe || cOut == 0xfeff) && charSrcIdx == origSrcIdx)) {
02139                                 dest.Add(cOut); nDecoded++; }
02140                 } // else (multi-byte sequence)
02141         } // while
02142         return nDecoded;
02143 }
02144
02145 //-----------------------------------------------------------------------
02146 // TUniCodec -- UTF-8 Encoder
02147 //-----------------------------------------------------------------------
02148
02149 // Returns the number of characters that have been successfully encoded.
02150 // This does not include any replacement characters that may have been inserted into 'dest'.
02151 template<typename TSrcVec, typename TDestCh>
02152 size_t TUniCodec::EncodeUtf8(
02153         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02154         TVec<TDestCh>& dest, const bool clrDest) const
02155 {
02156         size_t nEncoded = 0;
02157         for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
02158         {
02159                 uint c = uint(src[TVecIdx(srcIdx)]);
02160                 bool err = false;
02161                 if (strict && c > 0x10ffff) {
02162                         err = true;
02163                         switch (errorHandling) {
02164                         case uehThrow: throw TUnicodeException(srcIdx, c, "Invalid character (0x" + TInt::GetStr(c, "%x") + "; only characters in the range 0..0x10ffff are allowed).");
02165                         case uehAbort: return nEncoded;
02166                         case uehReplace: c = replacementChar; break;
02167                         case uehIgnore: continue;
02168                         default: Fail; } }
02169                 if (c < 0x80u)
02170                         dest.Add(TDestCh(c & 0xffu));
02171                 else if (c < 0x800u) {
02172                         dest.Add(TDestCh(_1100_0000 | ((c >> 6) & _0001_1111)));
02173                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02174                 else if (c < 0x10000u) {
02175                         dest.Add(TDestCh(_1110_0000 | ((c >> 12) & _0000_1111)));
02176                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02177                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02178                 else if (c < 0x200000u) {
02179                         dest.Add(TDestCh(_1111_0000 | ((c >> 18) & _0000_0111)));
02180                         dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
02181                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02182                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02183                 else if (c < 0x4000000u) {
02184                         dest.Add(TDestCh(_1111_1000 | ((c >> 24) & _0000_0011)));
02185                         dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
02186                         dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
02187                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02188                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02189                 else {
02190                         dest.Add(TDestCh(_1111_1100 | ((c >> 30) & _0000_0011)));
02191                         dest.Add(TDestCh(_1000_0000 | ((c >> 24) & _0011_1111)));
02192                         dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
02193                         dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
02194                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02195                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02196                 if (! err) nEncoded++;
02197         }
02198         return nEncoded;
02199 }
02200
02201 //-----------------------------------------------------------------------
02202 // TUniCodec -- UTF-16 Decoder
02203 //-----------------------------------------------------------------------
02204
02205 // Returns the number of characters that have been successfully decoded.
02206 // This does not include any replacement characters that may have been inserted into 'dest'.
02207 // Each element of 'src' is assumed to contain one byte of data.
02208 // srcCount must be even (though srcIdx doesn't need to be).
02209 template<typename TSrcVec, typename TDestCh>
02210 size_t TUniCodec::DecodeUtf16FromBytes(
02211         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02212         TVec<TDestCh>& dest, const bool clrDest,
02213         const TUtf16BomHandling bomHandling,
02214         const TUniByteOrder defaultByteOrder) const
02215 {
02216         IAssert(srcCount % 2 == 0);
02217         IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);
02218         IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);
02219         if (clrDest) dest.Clr();
02220         size_t nDecoded = 0;
02221         if (srcCount <= 0) return nDecoded;
02222         const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
02223         bool littleEndian = false;
02224         bool leDefault = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && IsMachineLittleEndian()));
02225         if (bomHandling == bomIgnored) littleEndian = leDefault;
02226         else if (bomHandling == bomAllowed || bomHandling == bomRequired)
02227         {
02228                 int byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff;
02229                 if (byte1 == 0xfe && byte2 == 0xff) { littleEndian = false; if (skipBom) srcIdx += 2; }
02230                 else if (byte1 == 0xff && byte2 == 0xfe) { littleEndian = true; if (skipBom) srcIdx += 2; }
02231                 else if (bomHandling == bomAllowed) littleEndian = leDefault;
02232                 else { // Report an error.
02233                         switch (errorHandling) {
02234                         case uehThrow: throw TUnicodeException(srcIdx, byte1, "BOM expected at the beginning of the input vector (" + TInt::GetStr(byte1, "%02x") + " " + TInt::GetStr(byte2, "%02x") + " found instead).");
02235                         case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);
02236                         default: Fail; } }
02237         }
02238         else Fail;
02239         while (srcIdx < srcEnd)
02240         {
02241                 const size_t charSrcIdx = srcIdx;
02242                 uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;
02243                 uint c = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
02244                 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)
02245                 {
02246                         // c is the first character in a surrogate pair.  Read the next character.
02247                         if (! (srcIdx + 2 <= srcEnd)) {
02248                                 switch (errorHandling) {
02249                                 case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");
02250                                 case uehAbort: return nDecoded;
02251                                 case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02252                                 case uehIgnore: continue;
02253                                 default: Fail; } }
02254                         uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;
02255                         uint c2 = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
02256                         // c2 should be the second character of the surrogate pair.
02257                         if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {
02258                                 switch (errorHandling) {
02259                                 case uehThrow: throw TUnicodeException(charSrcIdx + 2, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + ".");
02260                                 case uehAbort: return nDecoded;
02261                                 // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character
02262                                 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 2; continue;
02263                                 case uehIgnore: srcIdx -= 2; continue;
02264                                 default: Fail; } }
02265                         // c and c2 each contain 10 bits of information.
02266                         uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);
02267                         cc += 0x10000;
02268                         dest.Add(TDestCh(cc)); nDecoded++; continue;
02269                 }
02270                 else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
02271                         switch (errorHandling) {
02272                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");
02273                         case uehAbort: return nDecoded;
02274                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02275                         case uehIgnore: continue;
02276                         default: Fail; } }
02277                 // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.
02278                 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;
02279                 // Otherwise, store 'c' to the destination vector.
02280                 dest.Add(TDestCh(c)); nDecoded++;
02281         }
02282         return nDecoded;
02283 }
02284
02285 // Here, each element of 'src' is treated as a 16-bit word.  The byte-order settings
02286 // are used to determine if the two bytes of each word should be swapped before further
02287 // processing.  For example, if a BOM is present, it must have the value 0xfeff; if it
02288 // actually has the value 0xfffe, this means that the two bytes of each word must be swapped.
02289 // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the
02290 // beginning of the source data is used to determine the "original" byte order of the data;
02291 // if this doesn't match the byte order of the local machine, the two bytes of each word will
02292 // be swapped during the decoding process.
02293 template<typename TSrcVec, typename TDestCh>
02294 size_t TUniCodec::DecodeUtf16FromWords(
02295         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02296         TVec<TDestCh>& dest, bool clrDest,
02297         const TUtf16BomHandling bomHandling,
02298         const TUniByteOrder defaultByteOrder) const
02299 {
02300         IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);
02301         IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);
02302         if (clrDest) dest.Clr();
02303         size_t nDecoded = 0;
02304         if (srcCount <= 0) return nDecoded;
02305         const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
02306         bool swap = false;
02307         bool isMachineLe = IsMachineLittleEndian();
02308         bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe));
02309         if (bomHandling == bomIgnored) swap = (isDefaultLe != isMachineLe);
02310         else if (bomHandling == bomAllowed || bomHandling == bomRequired)
02311         {
02312                 int c = uint(src[TVecIdx(srcIdx)]) & 0xffff;
02313                 if (c == 0xfeff) { swap = false; if (skipBom) srcIdx += 1; }
02314                 else if (c == 0xfffe) { swap = true; if (skipBom) srcIdx += 1; }
02315                 else if (bomHandling == bomAllowed) swap = (isMachineLe != isDefaultLe);
02316                 else { // Report an error.
02317                         switch (errorHandling) {
02318                         case uehThrow: throw TUnicodeException(srcIdx, c, "BOM expected at the beginning of the input vector (" + TInt::GetStr(c, "%04x") + " found instead).");
02319                         case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);
02320                         default: Fail; } }
02321         }
02322         else Fail;
02323         while (srcIdx < srcEnd)
02324         {
02325                 const size_t charSrcIdx = srcIdx;
02326                 uint c = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;
02327                 if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
02328                 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)
02329                 {
02330                         // c is the first character in a surrogate pair.  Read the next character.
02331                         if (! (srcIdx < srcEnd)) {
02332                                 switch (errorHandling) {
02333                                 case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");
02334                                 case uehAbort: return nDecoded;
02335                                 case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02336                                 case uehIgnore: continue;
02337                                 default: Fail; } }
02338                         uint c2 = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;
02339                         if (swap) c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8);
02340                         // c2 should be the second character of the surrogate pair.
02341                         if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {
02342                                 switch (errorHandling) {
02343                                 case uehThrow: throw TUnicodeException(charSrcIdx + 1, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + ".");
02344                                 case uehAbort: return nDecoded;
02345                                 // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character
02346                                 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 1; continue;
02347                                 case uehIgnore: srcIdx -= 1; continue;
02348                                 default: Fail; } }
02349                         // c and c2 each contain 10 bits of information.
02350                         uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);
02351                         cc += 0x10000;
02352                         dest.Add(TDestCh(cc)); nDecoded++; continue;
02353                 }
02354                 else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
02355                         switch (errorHandling) {
02356                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");
02357                         case uehAbort: return nDecoded;
02358                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02359                         case uehIgnore: continue;
02360                         default: Fail; } }
02361                 // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.
02362                 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;
02363                 // Otherwise, store 'c' to the destination vector.
02364                 dest.Add(TDestCh(c)); nDecoded++;
02365         }
02366         return nDecoded;
02367 }
02368
02369 //-----------------------------------------------------------------------
02370 // TUniCodec -- UTF-16 Encoder
02371 //-----------------------------------------------------------------------
02372
02373 // Returns the number of characters that have been successfully encoded.
02374 // This does not include any replacement characters that may have been inserted into 'dest'.
02375 template<typename TSrcVec, typename TDestCh>
02376 size_t TUniCodec::EncodeUtf16ToWords(
02377         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02378         TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
02379         const TUniByteOrder destByteOrder) const
02380 {
02381         bool isMachineLe = IsMachineLittleEndian();
02382         bool swap = (destByteOrder == boLittleEndian && ! isMachineLe) || (destByteOrder == boBigEndian && isMachineLe);
02383         size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
02384         if (insertBom) { dest.Add(TDestCh(swap ? 0xfffeu : 0xfeffu)); nEncoded++; }
02385         while (srcIdx < srcEnd)
02386         {
02387                 uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;
02388                 if (! (c <= 0x10ffffu)) {
02389                         switch (errorHandling) {
02390                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");
02391                         case uehAbort: return nEncoded;
02392                         case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
02393                         case uehIgnore: continue;
02394                         default: Fail; } }
02395                 if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) {
02396                         switch (errorHandling) {
02397                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");
02398                         case uehAbort: return nEncoded;
02399                         case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
02400                         case uehIgnore: continue;
02401                         default: Fail; } }
02402                 if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) {
02403                         switch (errorHandling) {
02404                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");
02405                         case uehAbort: return nEncoded;
02406                         case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
02407                         case uehIgnore: continue;
02408                         default: Fail; } }
02409                 // If c is <= 0xffff, it can be stored directly.
02410                 if (c <= 0xffffu) {
02411                         if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
02412                         dest.Add(TDestCh(c)); nEncoded++; continue; }
02413                 // Otherwise, represent c by a pair of surrogate characters.
02414                 c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);
02415                 uint c1 = (c >> 10) & 1023, c2 = c & 1023;
02416                 c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate;
02417                 if (swap) {
02418                         c1 = ((c1 >> 8) & 0xff) | ((c1 & 0xff) << 8);
02419                         c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); }
02420                 dest.Add(TDestCh(c1));
02421                 dest.Add(TDestCh(c2));
02422                 nEncoded++; continue;
02423         }
02424         return nEncoded;
02425 }
02426
02427 template<typename TSrcVec, typename TDestCh>
02428 size_t TUniCodec::EncodeUtf16ToBytes(
02429         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02430         TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
02431         const TUniByteOrder destByteOrder) const
02432 {
02433         bool isDestLe = (destByteOrder == boLittleEndian || (destByteOrder == boMachineEndian && IsMachineLittleEndian()));
02434         size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
02435         if (insertBom) { dest.Add(isDestLe ? 0xff : 0xfe); dest.Add(isDestLe ? 0xfe : 0xff); nEncoded++; }
02436         while (srcIdx < srcEnd)
02437         {
02438                 uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;
02439                 if (! (c <= 0x10ffffu)) {
02440                         switch (errorHandling) {
02441                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");
02442                         case uehAbort: return nEncoded;
02443 #define ___OutRepl if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); }
02444                         case uehReplace: ___OutRepl; continue;
02445                         case uehIgnore: continue;
02446                         default: Fail; } }
02447                 if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) {
02448                         switch (errorHandling) {
02449                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");
02450                         case uehAbort: return nEncoded;
02451                         case uehReplace: ___OutRepl; continue;
02452                         case uehIgnore: continue;
02453                         default: Fail; } }
02454                 if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) {
02455                         switch (errorHandling) {
02456                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");
02457                         case uehAbort: return nEncoded;
02458                         case uehReplace: ___OutRepl; continue;
02459                         case uehIgnore: continue;
02460                         default: Fail; } }
02461 #undef ___OutRepl
02462                 // If c is <= 0xffff, it can be stored directly.
02463                 if (c <= 0xffffu) {
02464                         if (isDestLe) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); }
02465                         else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); }
02466                         nEncoded++; continue; }
02467                 // Otherwise, represent c by a pair of surrogate characters.
02468                 c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);
02469                 uint c1 = (c >> 10) & 1023, c2 = c & 1023;
02470                 c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate;
02471                 if (isDestLe) { dest.Add(c1 & 0xff); dest.Add((c1 >> 8) & 0xff); dest.Add(c2 & 0xff); dest.Add((c2 >> 8) & 0xff); }
02472                 else { dest.Add((c1 >> 8) & 0xff); dest.Add(c1 & 0xff); dest.Add((c2 >> 8) & 0xff); dest.Add(c2 & 0xff); }
02473                 nEncoded++; continue;
02474         }
02475         return nEncoded;
02476 }
02477
02478 //-----------------------------------------------------------------------------
02479 // TUniChDb -- word boundaries
02480 //-----------------------------------------------------------------------------
02481
02482 template<typename TSrcVec>
02483 bool TUniChDb::FindNextWordBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const
02484 {
02485         // WB1.  Break at the start of text.
02486         if (position < srcIdx) { position = srcIdx; return true; }
02487         // If we are beyond the end of the text, there aren't any word breaks left.
02488         const size_t srcEnd = srcIdx + srcCount;
02489         if (position >= srcEnd) return false;
02490         // If 'position' is currently at an ignored character, move it back to the last nonignored character.
02491         size_t origPos = position;
02492         if (IsWbIgnored(src[TVecIdx(position)])) {
02493                 if (! WbFindPrevNonIgnored(src, srcIdx, position))
02494                         position = origPos;
02495         }
02496         // Determine the previous nonignored character (before 'position').
02497         size_t posPrev = position;
02498         if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position;
02499         // Sec 6.2.  Allow a break between Sep and an ignored character.
02500         if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; }
02501         // Determine the next nonignored character (after 'position').
02502         size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd);
02503         size_t posNext2;
02504         int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1);
02505         int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1);
02506         int wbfPrev = GetWbFlags(cPrev), wbfCur = GetWbFlags(cCur), wbfNext = GetWbFlags(cNext);
02507         int cNext2, wbfNext2;
02508         //
02509         for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2,
02510                                                            cPrev = cCur, cCur = cNext, cNext = cNext2,
02511                                                            wbfPrev = wbfCur, wbfCur = wbfNext, wbfNext = wbfNext2)
02512         {
02513                 // Should there be a word boundary between 'position' and 'posNext' (or, more accurately,
02514                 // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position'
02515                 // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)?
02516                 posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd);
02517                 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1);
02518                 wbfNext2 = GetWbFlags(cNext2);
02519 #define TestCurNext(curFlag, nextFlag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
02520 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue
02521 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
02522                 // WB3.  Do not break within CRLF.
02523                 if (cCur == 13 && cNext == 10) continue;
02524                 // WB5.  Do not break between most letters.
02525                 TestCurNext(ucfWbALetter, ucfWbALetter);
02526                 // WB6.  Do not break letters across certain punctuation.
02527                 TestCurNext2(ucfWbALetter, ucfWbMidLetter, ucfWbALetter);
02528                 // WB7.  Do not break letters across certain punctuation.
02529                 TestPrevCurNext(ucfWbALetter, ucfWbMidLetter, ucfWbALetter);
02530                 // WB8.  Do not break within sequences of digits, or digits adjacent to letters.
02531                 TestCurNext(ucfWbNumeric, ucfWbNumeric);
02532                 // WB9.  Do not break within sequences of digits, or digits adjacent to letters.
02533                 TestCurNext(ucfWbALetter, ucfWbNumeric);
02534                 // WB10.  Do not break within sequences of digits, or digits adjacent to letters.
02535                 TestCurNext(ucfWbNumeric, ucfWbALetter);
02536                 // WB11.  Do not break within sequences, such as "3.2" or "3.456,789".
02537                 TestPrevCurNext(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric);
02538                 // WB12.  Do not break within sequences, such as "3.2" or "3.456,789".
02539                 TestCurNext2(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric);
02540                 // WB13.  Do not break between Katakana.
02541                 TestCurNext(ucfWbKatakana, ucfWbKatakana);
02542                 // WB13a.  Do not break from extenders.
02543                 if ((wbfCur & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana | ucfWbExtendNumLet)) != 0 &&
02544                         (wbfNext & ucfWbExtendNumLet) == ucfWbExtendNumLet) continue;
02545                 // WB13b.  Do not break from extenders.
02546                 if ((wbfCur & ucfWbExtendNumLet) == ucfWbExtendNumLet &&
02547                         (wbfNext & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana)) != 0) continue;
02548                 // WB14.  Otherwise, break everywhere.
02549                 position = posNext; return true;
02550 #undef TestCurNext
02551 #undef TestCurNext2
02552 #undef TestPrevCurNext
02553         }
02554         // WB2.  Break at the end of text.
02555         IAssert(position == srcEnd);
02556         return true;
02557 }
02558
02559 // ToDo: provide a more efficient implementation of this.
02560 template<typename TSrcVec>
02561 void TUniChDb::FindWordBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const
02562 {
02563         if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1));
02564         dest.PutAll(false);
02565         size_t position = srcIdx;
02566         dest[TVecIdx(position - srcIdx)] = true;
02567         while (position < srcIdx + srcCount)
02568         {
02569                 size_t oldPos = position;
02570                 FindNextWordBoundary(src, srcIdx, srcCount, position);
02571     if (oldPos < position) {
02572                   Assert(oldPos < position);
02573     }
02574     Assert(position <= srcIdx + srcCount);
02575                 dest[TVecIdx(position - srcIdx)] = true;
02576         }
02577         Assert(dest[TVecIdx(srcCount)]);
02578 }
02579
02580 //-----------------------------------------------------------------------------
02581 // TUniChDb -- sentence boundaries
02582 //-----------------------------------------------------------------------------
02583
02584 template<typename TSrcVec>
02585 bool TUniChDb::CanSentenceEndHere(const TSrcVec& src, const size_t srcIdx, const size_t position) const
02586 {
02587         if (sbExTrie.Empty()) return true;
02588         // We'll move back from the position where a sentence-boundary is being considered.
02589         size_t pos = position;
02590         if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02591         int c = (int) src[TVecIdx(pos)]; int sfb = GetSbFlags(c);
02592         // - Skip the Sep, if there is one.
02593         if ((c & ucfSbSep) == ucfSbSep) {
02594                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02595                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
02596         // - Skip any Sp characters.
02597         while ((sfb & ucfSbSp) == ucfSbSp) {
02598                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02599                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
02600         // - Skip any Close characters.
02601         while ((sfb & ucfSbSp) == ucfSbSp) {
02602                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02603                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
02604         // - Skip any ATerm | STerm characters.
02605         while ((sfb & (ucfSbATerm | ucfSbSTerm)) != 0) {
02606                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02607                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
02608         // Now start moving through the trie.
02609         int cLast = c, cButLast = -1, cButButLast = -1, len = 1, node = -1;
02610         while (true)
02611         {
02612                 bool atEnd = (! WbFindPrevNonIgnored(src, srcIdx, pos));
02613                 c = (atEnd ? -1 : (int) src[TVecIdx(pos)]);
02614                 TUniChCategory cat = GetCat(c);
02615                 if (atEnd || ! (cat == ucLetter || cat == ucNumber || cat == ucSymbol)) {
02616                         // Check if the suffix we've read so far is one of those that appear in the trie.
02617                         if (len == 1) return ! sbExTrie.Has1Gram(cLast);
02618                         if (len == 2) return ! sbExTrie.Has2Gram(cLast, cButLast);
02619                         IAssert(len >= 3); IAssert(node >= 0);
02620                         if (sbExTrie.IsNodeTerminal(node)) return false;
02621                         if (atEnd) return true; }
02622                 if (len == 1) { cButLast = c; len++; }
02623                 else if (len == 2) { cButButLast = c; len++;
02624                         // Now we have read the last three characters; start descending the suitable subtrie.
02625                         node = sbExTrie.Get3GramRoot(cLast, cButLast, cButButLast);
02626                         if (node < 0) return true; }
02627                 else {
02628                         // Descend down the trie.
02629                         node = sbExTrie.GetChild(node, c);
02630                         if (node < 0) return true; }
02631         }
02632         //return true;
02633 }
02634
02635 template<typename TSrcVec>
02636 bool TUniChDb::FindNextSentenceBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const
02637 {
02638         // SB1.  Break at the start of text.
02639         if (position < srcIdx) { position = srcIdx; return true; }
02640         // If we are beyond the end of the text, there aren't any word breaks left.
02641         const size_t srcEnd = srcIdx + srcCount;
02642         if (position >= srcEnd) return false;
02643         // If 'position' is currently at an ignored character, move it back to the last nonignored character.
02644         size_t origPos = position;
02645         if (IsWbIgnored(src[TVecIdx(position)])) {
02646                 if (! WbFindPrevNonIgnored(src, srcIdx, position))
02647                         position = origPos;
02648         }
02649         // Determine the previous nonignored character (before 'position').
02650         size_t posPrev = position;
02651         if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position;
02652         // Sec 6.2.  Allow a break between Sep and an ignored character.
02653         if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; }
02654         // Determine the next nonignored character (after 'position').
02655         size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd);
02656         size_t posNext2;
02657         int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1);
02658         int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1);
02659         int sbfPrev = GetSbFlags(cPrev), sbfCur = GetSbFlags(cCur), sbfNext = GetSbFlags(cNext);
02660         int cNext2, sbfNext2;
02661         // Initialize the state of the peek-back automaton.
02662         typedef enum { stInit, stATerm, stATermSp, stATermSep, stSTerm, stSTermSp, stSTermSep } TPeekBackState;
02663         TPeekBackState backState;
02664         {
02665                 size_t pos = position;
02666                 bool wasSep = false, wasSp = false, wasATerm = false, wasSTerm = false;
02667                 while (true)
02668                 {
02669                         if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break;
02670                         // Skip at most one Sep.
02671                         int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp);
02672                         if ((sbf & ucfSbSep) == ucfSbSep) {
02673                                 wasSep = true;
02674                                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break;
02675                                 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
02676                         // Skip zero or more Sp's.
02677                         bool stop = false;
02678                         while ((sbf & ucfSbSp) == ucfSbSp) {
02679                                 wasSp = true;
02680                                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; }
02681                                 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
02682                         if (stop) break;
02683                         // Skip zero or more Close's.
02684                         while ((sbf & ucfSbClose) == ucfSbClose) {
02685                                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; }
02686                                 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
02687                         if (stop) break;
02688                         // Process an ATerm or STerm.
02689                         wasATerm = ((sbf & ucfSbATerm) == ucfSbATerm);
02690                         wasSTerm = ((sbf & ucfSbSTerm) == ucfSbSTerm);
02691                         break;
02692                 }
02693                 if (wasATerm) backState = (wasSep ? stATermSep : wasSp ? stATermSp : stATerm);
02694                 else if (wasSTerm) backState = (wasSep ? stSTermSep : wasSp ? stSTermSp : stSTerm);
02695                 else backState = stInit;
02696         }
02697         // Initialize the state of the peek-ahead automaton.  This state tells us what follows
02698         // after we skip all contiguous characters from the complement of the set {OLetter, Upper, Lower, Sep, STerm, ATerm}.
02699         // Thus, the next character is either OLetter, Upper, Lower, Sep, STerm, ATerm, or the end of the input string.
02700         // Our peek-ahead automaton must tell us whether it is Lower or something else.
02701         typedef enum { stUnknown, stLower, stNotLower } TPeekAheadState;
02702         TPeekAheadState aheadState = stUnknown;
02703         //
02704         for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2,
02705                                                            cPrev = cCur, cCur = cNext, cNext = cNext2,
02706                                                            sbfPrev = sbfCur, sbfCur = sbfNext, sbfNext = sbfNext2)
02707         {
02708                 // Should there be a word boundary between 'position' and 'posNext' (or, more accurately,
02709                 // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position'
02710                 // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)?
02711                 posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd);
02712                 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1);
02713                 sbfNext2 = GetSbFlags(cNext2);
02714                 // Update the peek-back automaton.
02715 #define TestCur(curFlag) ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag)
02716 #define Trans(curFlag, newState) if (TestCur(curFlag)) { backState = st##newState; break; }
02717                 switch (backState) {
02718                         case stInit: Trans(ATerm, ATerm); Trans(STerm, STerm); break;
02719                         case stATerm: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, ATerm); backState = stInit; break;
02720                         case stSTerm: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, STerm); backState = stInit; break;
02721                         case stATermSp: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
02722                         case stSTermSp: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
02723                         case stATermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
02724                         case stSTermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
02725                         default: IAssert(false); }
02726 #undef Trans
02727 #undef TestCur
02728                 // Update the peek-ahead automaton.
02729 #define IsPeekAheadSkippable(sbf) ((sbf & (ucfSbOLetter | ucfSbUpper | ucfSbLower | ucfSbSep | ucfSbSTerm | ucfSbATerm)) == 0)
02730                 if (! IsPeekAheadSkippable(sbfCur)) {
02731                         bool isLower = ((sbfCur & ucfSbLower) == ucfSbLower);
02732                         if (aheadState == stLower) IAssert(isLower);
02733                         else if (aheadState == stNotLower) IAssert(! isLower);
02734                         // We haven't peaked ahead farther than this so far -- invalidate the state.
02735                         aheadState = stUnknown; }
02736                 if (aheadState == stUnknown)
02737                 {
02738                         // Peak ahead to the next non-peekahead-skippable character.
02739                         size_t pos = posNext;
02740                         while (pos < srcEnd) {
02741                                 int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp);
02742                                 if (! IsPeekAheadSkippable(sbf)) {
02743                                         if ((sbf & ucfSbLower) == ucfSbLower) aheadState = stLower;
02744                                         else aheadState = stNotLower;
02745                                         break; }
02746                                 WbFindNextNonIgnored(src, pos, srcEnd); }
02747                         if (! (pos < srcEnd)) aheadState = stNotLower;
02748                 }
02749 #undef IsPeekAheadSkippable
02750                 //
02751 #define TestCurNext(curFlag, nextFlag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
02752 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue
02753 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
02754                 // SB3.  Do not break within CRLF.
02755                 if (cCur == 13 && cNext == 10) continue;
02756                 // SB4.  Break ater paragraph separators.
02757                 if ((sbfCur & ucfSbSep) == ucfSbSep) {
02758                         if (! CanSentenceEndHere(src, srcIdx, position)) continue;
02759                         position = posNext; return true; }
02760                 // Do not break after ambiguous terminators like period, if they are immediately followed by a number
02761                 // or lowercase letter, if they are between uppercase letters, or if the first following letter
02762                 // (optionally after certain punctuation) is lowercase.  For example, a period may be an abbreviation
02763                 // or numeric period, and thus may not mark the end of a sentence.
02764                 TestCurNext(ucfSbATerm, ucfSbNumeric); // SB6
02765                 TestPrevCurNext(ucfSbUpper, ucfSbATerm, ucfSbUpper); // SB7
02766                 // SB8a.  (STerm | ATerm) Close* Sp* [do not break] (STerm | ATerm)
02767                 if ((backState == stATerm || backState == stATermSp || backState == stSTerm || backState == stSTermSp) &&
02768                         (sbfNext & (ucfSbSTerm | ucfSbATerm)) != 0) continue;
02769                 // SB8*.  ATerm Close* Sp* [do not break] ( ! (OLetter | Upper | Lower | Sep | STerm | ATerm) )* Lower
02770                 if ((backState == stATerm || backState == stATermSp) && aheadState == stLower) continue;
02771                 // Break after sentence terminators, but include closing punctuation, trailing spaces, and a paragraph separator (if present).
02772                 // SB9. ( STerm | ATerm ) Close* [do not break] ( Close | Sp | Sep )
02773                 if ((backState == stATerm || backState == stSTerm) && (sbfNext & (ucfSbClose | ucfSbSp | ucfSbSep)) != 0) continue;
02774                 // SB10. ( STerm | ATerm ) Close* Sp* [do not break] ( Sp | Sep )
02775                 // SB11*. ( STerm | ATerm ) Close* Sp* Sep? [do break]
02776                 if (backState == stATerm || backState == stATermSp || backState == stATermSep || backState == stSTerm || backState == stSTermSp || backState == stSTermSep) {
02777                         if ((sbfNext & (ucfSbSp | ucfSbSep)) != 0) continue; // SB10
02778                         if (! CanSentenceEndHere(src, srcIdx, position)) continue;
02779                         position = posNext; return true; } // SB11
02780                 // WB12.  Otherwise, do not break.
02781                 continue;
02782 #undef TestCurNext
02783 #undef TestCurNext2
02784 #undef TestPrevCurNext
02785         }
02786         // WB2.  Break at the end of text.
02787         IAssert(position == srcEnd);
02788         return true;
02789 }
02790
02791 // ToDo: provide a more efficient implementation of this.
02792 template<typename TSrcVec>
02793 void TUniChDb::FindSentenceBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const
02794 {
02795         if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1));
02796         dest.PutAll(false);
02797         size_t position = srcIdx;
02798         dest[TVecIdx(position - srcIdx)] = true;
02799         while (position < srcIdx + srcCount)
02800         {
02801                 size_t oldPos = position;
02802                 FindNextSentenceBoundary(src, srcIdx, srcCount, position);
02803     if (oldPos < position) {
02804                   Assert(oldPos < position);
02805     }
02806     Assert(position <= srcIdx + srcCount);
02807                 dest[TVecIdx(position - srcIdx)] = true;
02808         }
02809         Assert(dest[TVecIdx(srcCount)]);
02810 }
02811
02812 //-----------------------------------------------------------------------------
02813 // TUniChDb -- case conversions
02814 //-----------------------------------------------------------------------------
02815
02816 template<typename TSrcVec, typename TDestCh>
02817 void TUniChDb::GetCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02818                                                                 TVec<TDestCh>& dest, const bool clrDest,
02819                                                                 const TUniChDb::TCaseConversion how,
02820                                                                 const bool turkic, const bool lithuanian) const
02821 {
02822         const TIntIntVH &specials = (how == ccUpper ? specialCasingUpper : how == ccLower ? specialCasingLower : how == ccTitle ? specialCasingTitle : *((TIntIntVH *) 0));
02823         if (clrDest) dest.Clr();
02824         enum {
02825                 GreekCapitalLetterSigma = 0x3a3,
02826                 GreekSmallLetterSigma = 0x3c3,
02827                 GreekSmallLetterFinalSigma = 0x3c2,
02828                 LatinCapitalLetterI = 0x49,
02829                 LatinCapitalLetterJ = 0x4a,
02830                 LatinCapitalLetterIWithOgonek = 0x12e,
02831                 LatinCapitalLetterIWithGrave = 0xcc,
02832                 LatinCapitalLetterIWithAcute = 0xcd,
02833                 LatinCapitalLetterIWithTilde = 0x128,
02834                 LatinCapitalLetterIWithDotAbove = 0x130,
02835                 LatinSmallLetterI = 0x69,
02836                 CombiningDotAbove = 0x307
02837         };
02838         //
02839         bool seenCased = false, seenTwoCased = false; int cpFirstCased = -1;
02840         size_t nextWordBoundary = srcIdx;
02841         TBoolV wordBoundaries; bool wbsKnown = false;
02842         for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
02843         {
02844                 int cp = src[TVecIdx(srcIdx)]; srcIdx++;
02845                 //if (turkic && cp == 0x130 && how == ccLower) printf("!");
02846                 // For conversion to titlecase, the first cased character of each word
02847                 // must be converted to titlecase; everything else must be converted
02848                 // to lowercase.
02849                 TUniChDb::TCaseConversion howHere;
02850                 if (how != ccTitle) howHere = how;
02851                 else {
02852                         if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here.
02853                                 seenCased = false; seenTwoCased = false; cpFirstCased = -1;
02854                                 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
02855                                 IAssert(next > nextWordBoundary); nextWordBoundary = next; }
02856                         bool isCased = IsCased(cp);
02857                         if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; cpFirstCased = cp; }
02858                         else { howHere = ccLower;
02859                                 if (isCased && seenCased) seenTwoCased = true; }
02860                 }
02861                 // First, process the conditional mappings from SpecialCasing.txt.
02862                 // These will be processed in code -- they were ignored while
02863                 // we were reading SpecialCasing.txt itself.
02864                 if (cp == GreekCapitalLetterSigma && howHere == ccLower)
02865                 {
02866                         // SpecialCasing.txt mentions the 'FinalSigma' condition, but sec. 3.13 of
02867                         // the standard doesn't define it.  We'll use FinalCased instead.
02868                         // FinalCased: within the closest word boundaries containing C,
02869                         // there is a cased letter before C, and there is no cased letter after C.
02870                         //size_t nextBoundary = srcIdx - 1; FindNextWordBoundary(src, srcIdx, srcCount, nextBoundary);
02871                         if (! wbsKnown) { FindWordBoundaries(src, origSrcIdx, srcCount, wordBoundaries); wbsKnown = true; }
02872                         size_t srcIdx2 = srcIdx; bool casedAfter = false;
02873                         if (how == ccTitle)
02874                                 printf("!");
02875                         //while (srcIdx2 < nextBoundary)
02876                         while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)])
02877                         {
02878                                 int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
02879                                 if (IsCased(cp2)) { casedAfter = true; break; }
02880                         }
02881                         if (! casedAfter)
02882                         {
02883                                 //size_t prevBoundary = srcIdx - 1;
02884                                 //FindPreviousWordBoundary(src, srcIdx, srcCount, prevBoundary);
02885                                 srcIdx2 = srcIdx - 1; bool casedBefore = false;
02886                                 //while (prevBoundary < srcIdx2)
02887                                 while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)])
02888                                 {
02889                                         --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
02890                                         if (IsCased(cp2)) { casedBefore = true; break; }
02891                                 }
02892                                 if (casedBefore) {
02893                                         // Now we have a FinalCased character.
02894                                         dest.Add(GreekSmallLetterFinalSigma); Assert(howHere == ccLower); continue; }
02895                         }
02896                         // If we got here, add a non-final sigma.
02897                         dest.Add(GreekSmallLetterSigma); continue;
02898                 }
02899                 else if (lithuanian)
02900                 {
02901                         if (howHere == ccLower)
02902                         {
02903                                 if (cp == LatinCapitalLetterI || cp == LatinCapitalLetterJ || cp == LatinCapitalLetterIWithOgonek)
02904                                 {
02905                                         bool moreAbove = false;
02906                                         for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; )
02907                                         {
02908                                                 const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
02909                                                 const int cc2 = GetCombiningClass(cp2);
02910                                                 if (cc2 == TUniChInfo::ccStarter) break;
02911                                                 if (cc2 == TUniChInfo::ccAbove) { moreAbove = true; break; }
02912                                         }
02913                                         if (moreAbove)
02914                                         {
02915                                                 if (cp == LatinCapitalLetterI) { dest.Add(0x69); dest.Add(0x307); continue; }
02916                                                 if (cp == LatinCapitalLetterJ) { dest.Add(0x6a); dest.Add(0x307); continue; }
02917                                                 if (cp == LatinCapitalLetterIWithOgonek) { dest.Add(0x12f); dest.Add(0x307); continue; }
02918                                         }
02919                                 }
02920                                 else if (cp == LatinCapitalLetterIWithGrave) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x300); continue; }
02921                                 else if (cp == LatinCapitalLetterIWithAcute) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x301); continue; }
02922                                 else if (cp == LatinCapitalLetterIWithTilde) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x303); continue; }
02923                         }
02924                         if (cp == CombiningDotAbove)
02925                         {
02926                                 // Lithuanian, howHere != ccLower.
02927                                 // AfterSoftDotted := the last preceding character with a combining class
02928                                 // of zero before C was Soft_Dotted, and there is no intervening combining
02929                                 // character class 230 (ABOVE).
02930                                 bool afterSoftDotted = false;
02931                                 size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp'
02932                                 while (origSrcIdx < srcIdx2)
02933                                 {
02934                                         --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
02935                                         int cc2 = GetCombiningClass(cp2);
02936                                         if (cc2 == TUniChInfo::ccAbove) break;
02937                                         if (cc2 == TUniChInfo::ccStarter) {
02938                                                 afterSoftDotted = IsSoftDotted(cp2); break; }
02939                                 }
02940                                 if (afterSoftDotted)
02941                                 {
02942                                         Assert(lithuanian);
02943                                         // Remove DOT ABOVE after "i" with upper or titlecase.
02944                                         // - Note: but this must only be done if that "i" was actually placed into uppercase (if how == ccTitle,
02945                                         //   the "i" may have been kept lowercase and thus we shouldn't remove the dot).
02946                                         if (how == ccLower) { dest.Add(0x307); continue; }
02947                                         if (how == ccUpper) continue;
02948                                         Assert(how == ccTitle);
02949                                         Assert(howHere == ccLower); // because CombiningDotAbove is not a cased character
02950                                         if (seenCased && ! seenTwoCased) continue; // The "i" has been placed into uppercase; thus, remove the dot.
02951                                         dest.Add(0x307); continue;
02952                                 }
02953                         }
02954                 }
02955                 else if (turkic) // language code 'tr' (Turkish) and 'az' (Azeri)
02956                 {
02957                         // I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
02958                         // The following rules handle those cases.
02959                         if (cp == LatinCapitalLetterIWithDotAbove) {
02960                                 dest.Add(howHere == ccLower ? 0x69 : 0x130); continue; }
02961                         // When lowercasing, remove dot_above in the sequence I + dot_above,
02962                         // which will turn into i.  This matches the behavior of the
02963                         // canonically equivalent I-dot_above.
02964                         else if (cp == CombiningDotAbove)
02965                         {
02966                                 // AfterI: the last preceding base character was an uppercase I,
02967                                 // and there is no intervening combining character class 230 (ABOVE).
02968                                 bool afterI = false;
02969                                 size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp'
02970                                 while (origSrcIdx < srcIdx2)
02971                                 {
02972                                         --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
02973                                         if (cp2 == LatinCapitalLetterI) { afterI = true; break; }
02974                                         int cc2 = GetCombiningClass(cp2);
02975                                         if (cc2 == TUniChInfo::ccAbove || cc2 == TUniChInfo::ccStarter) break;
02976                                 }
02977                                 if (afterI) {
02978                                         if (how == ccTitle && seenCased && ! seenTwoCased) {
02979                                                 // Sec. 3.13 defines title-casing in an unusual way: find the first cased character in each word;
02980                                                 // if found, map it to titlecase; otherwise, map all characters in that word to lowercase.
02981                                                 // This suggests that if a cased character is found, others in that word should be left alone.
02982                                                 // This seems unusual; we map all other characters to lowercase instead.
02983                                                 // But this leads to problems with e.g. I followed by dot-above (U+0307): since the dot-above
02984                                                 // is not the first cased character (it isn't even cased), we attempt to set it to lowercase;
02985                                                 // but since afterI is also true here, this would mean deleting it.  Thus our titlecased
02986                                                 // form of "I followed by dot-above" would be just "I", which is clearly wrong.
02987                                                 // So we treat this as a special case here.
02988                                                 IAssert(cpFirstCased == LatinCapitalLetterI);
02989                                                 dest.Add(0x307); continue; }
02990                                         if (howHere != ccLower) dest.Add(0x307);
02991                                         continue; }
02992                         }
02993                         // When lowercasing, unless an I is before a dot_above,
02994                         // it turns into a dotless i.
02995                         else if (cp == LatinCapitalLetterI)
02996                         {
02997                                 // BeforeDot: C is followed by U+0307 (combining dot above).
02998                                 // Any sequence of characters with a combining class that is
02999                                 // neither 0 nor 230 may intervene between the current character
03000                                 // and the combining dot above.
03001                                 bool beforeDot = false;
03002                                 for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; )
03003                                 {
03004                                         const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
03005                                         if (cp2 == 0x307) { beforeDot = true; break; }
03006                                         const int cc2 = GetCombiningClass(cp2);
03007                                         if (cc2 == TUniChInfo::ccStarter || cc2 == TUniChInfo::ccAbove) break;
03008                                 }
03009                                 if (! beforeDot) {
03010                                         dest.Add(howHere == ccLower ? 0x131 : 0x49); continue; }
03011                         }
03012                         // When uppercasing, i turns into a dotted capital I.
03013                         else if (cp == LatinSmallLetterI)
03014                         {
03015                                 dest.Add(howHere == ccLower ? 0x69 : 0x130); continue;
03016                         }
03017                 }
03018                 // Try to use the unconditional mappings.
03019                 const TIntIntVH &specHere = (
03020                         howHere == how ? specials :
03021                         howHere == ccLower ? specialCasingLower :
03022                         howHere == ccTitle ? specialCasingTitle :
03023                         howHere == ccUpper ? specialCasingUpper : *((TIntIntVH *) 0));
03024                 int i = specHere.GetKeyId(cp);
03025                 if (i >= 0) { TUniCaseFolding::AppendVector(specHere[i], dest); continue; }
03026                 // Try to use the simple (one-character) mappings.
03027                 i = h.GetKeyId(cp);
03028                 if (i >= 0) {
03029                         const TUniChInfo &ci = h[i];
03030                         int cpNew = (
03031                                 howHere == ccLower ? ci.simpleLowerCaseMapping :
03032                                 howHere == ccUpper ? ci.simpleUpperCaseMapping :
03033                                                                          ci.simpleTitleCaseMapping);
03034                         if (cpNew < 0) cpNew = cp;
03035                         dest.Add(cpNew); continue; }
03036                 // As a final resort, leave 'cp' unchanged.
03037                 dest.Add(cp);
03038         }
03039 }
03040
03041 template<typename TSrcVec, typename TDestCh>
03042 void TUniChDb::GetSimpleCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
03043         TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how) const
03044 {
03045         if (clrDest) dest.Clr();
03046         bool seenCased = false; size_t nextWordBoundary = srcIdx;
03047         for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
03048         {
03049                 const int cp = src[TVecIdx(srcIdx)]; srcIdx++;
03050                 int i = h.GetKeyId(cp); if (i < 0) { dest.Add(cp); continue; }
03051                 const TUniChInfo &ci = h[i];
03052                 // With titlecasing, the first cased character of each word must be put into titlecase,
03053                 // all others into lowercase.  This is what the howHere variable is for.
03054                 TUniChDb::TCaseConversion howHere;
03055                 if (how != ccTitle) howHere = how;
03056                 else {
03057                         if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here.
03058                                 seenCased = false;
03059                                 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
03060                                 IAssert(next > nextWordBoundary); nextWordBoundary = next; }
03061                         bool isCased = IsCased(cp);
03062                         if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; }
03063                         else howHere = ccLower;
03064                 }
03065                 int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping);
03066                 if (cpNew < 0) cpNew = cp;
03067                 dest.Add(cpNew);
03068         }
03069 }
03070
03071 template<typename TSrcVec>
03072 void TUniChDb::ToSimpleCaseConverted(TSrcVec& src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const
03073 {
03074         bool seenCased = false; size_t nextWordBoundary = srcIdx;
03075         for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
03076         {
03077                 const int cp = src[TVecIdx(srcIdx)];
03078                 int i = h.GetKeyId(cp); if (i < 0) continue;
03079                 const TUniChInfo &ci = h[i];
03080                 // With titlecasing, the first cased character of each word must be put into titlecase,
03081                 // all others into lowercase.  This is what the howHere variable is for.
03082                 TUniChDb::TCaseConversion howHere;
03083                 if (how != ccTitle) howHere = how;
03084                 else {
03085                         if (srcIdx == nextWordBoundary) { // A word starts/ends here.
03086                                 seenCased = false;
03087                                 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
03088                                 IAssert(next > nextWordBoundary); nextWordBoundary = next; }
03089                         bool isCased = IsCased(cp);
03090                         if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; }
03091                         else howHere = ccLower;
03092                 }
03093                 int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping);
03094                 if (cpNew >= 0) src[TVecIdx(srcIdx)] = cpNew;
03095         }
03096 }
03097
03098 //-----------------------------------------------------------------------------
03099 // TUniChDb -- composition, decomposition, normal forms
03100 //-----------------------------------------------------------------------------
03101
03102 template<typename TDestCh>
03103 void TUniChDb::AddDecomposition(const int codePoint, TVec<TDestCh>& dest, const bool compatibility) const
03104 {
03105         if (HangulSBase <= codePoint && codePoint < HangulSBase + HangulSCount)
03106         {
03107                 // UAX #15, sec. 16: Hangul decomposition
03108                 const int SIndex = codePoint - HangulSBase;
03109                 const int L = HangulLBase + SIndex / HangulNCount;
03110                 const int V = HangulVBase + (SIndex % HangulNCount) / HangulTCount;
03111                 const int T = HangulTBase + (SIndex % HangulTCount);
03112                 dest.Add(L); dest.Add(V);
03113                 if (T != HangulTBase) dest.Add(T);
03114                 return;
03115         }
03116         int i = h.GetKeyId(codePoint); if (i < 0) { dest.Add(codePoint); return; }
03117         const TUniChInfo &ci = h[i];
03118         int ofs = ci.decompOffset; if (ofs < 0) { dest.Add(codePoint); return; }
03119         if ((! compatibility) && ci.IsCompatibilityDecomposition()) { dest.Add(codePoint); return; }
03120         while (true) {
03121                 int cp = decompositions[ofs++]; if (cp < 0) return;
03122                 AddDecomposition(cp, dest, compatibility); }
03123 }
03124
03125 template<typename TSrcVec, typename TDestCh>
03126 void TUniChDb::Decompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
03127                 TVec<TDestCh>& dest, const bool compatibility, bool clrDest) const
03128 {
03129         if (clrDest) dest.Clr();
03130         const size_t destStart = dest.Len()/*, srcEnd = srcIdx + srcCount*/;
03131         // Decompose the string.
03132         while (srcIdx < srcCount) {
03133                 AddDecomposition(src[TVecIdx(srcIdx)], dest, compatibility); srcIdx++; }
03134         // Rearrange the decomposed string into canonical order.
03135         for (size_t destIdx = destStart, destEnd = dest.Len(); destIdx < destEnd; )
03136         {
03137                 size_t j = destIdx;
03138                 int cp = dest[TVecIdx(destIdx)]; destIdx++;
03139                 int cpCls = GetCombiningClass(cp);
03140                 if (cpCls == TUniChInfo::ccStarter) continue;
03141                 while (destStart < j && GetCombiningClass(dest[TVecIdx(j - 1)]) > cpCls) {
03142                         dest[TVecIdx(j)] = dest[TVecIdx(j - 1)]; j--; }
03143                 dest[TVecIdx(j)] = cp;
03144         }
03145 }
03146
03147 template<typename TSrcVec, typename TDestCh>
03148 void TUniChDb::DecomposeAndCompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
03149                 TVec<TDestCh>& dest, bool compatibility, bool clrDest) const
03150 {
03151         if (clrDest) dest.Clr();
03152         TIntV temp;
03153         Decompose(src, srcIdx, srcCount, temp, compatibility);
03154         Compose(temp, 0, temp.Len(), dest, clrDest);
03155 }
03156
03157 template<typename TSrcVec, typename TDestCh>
03158 void TUniChDb::Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
03159                 TVec<TDestCh>& dest, bool clrDest) const
03160 {
03161         if (clrDest) dest.Clr();
03162         bool lastStarterKnown = false; // has a starter been encountered yet?
03163         size_t lastStarterPos = size_t(-1);  // the index (in 'dest') of the last starter
03164         int cpLastStarter = -1; // the codepoint of the last starter (i.e. cpLastStarter == dest[lastStarterPos])
03165         const size_t srcEnd = srcIdx + srcCount;
03166         int ccMax = -1; // The highest combining class among the characters since the last starter.
03167         while (srcIdx < srcEnd)
03168         {
03169                 const int cp = src[TVecIdx(srcIdx)]; srcIdx++;
03170                 const int cpClass = GetCombiningClass(cp);
03171                 //int cpCombined = -1;
03172                 // If there is a starter with which 'cp' can be combined, and from which it is not blocked
03173                 // by some intermediate character, we can try to combine them.
03174                 if (lastStarterKnown && ccMax < cpClass)
03175                 {
03176                         int j = inverseDec.GetKeyId(TIntPr(cpLastStarter, cp));
03177                         int cpCombined = -1;
03178                         do {
03179                                 // Try to look up a composition in the inverseDec table.
03180                                 if (j >= 0) { cpCombined = inverseDec[j]; break; }
03181                                 // UAX #15, sec. 16: Hangul composition
03182                                 // - Try to combine L and V.
03183                                 const int LIndex = cpLastStarter - HangulLBase;
03184                                 if (0 <= LIndex && LIndex < HangulLCount) {
03185                                         const int VIndex = cp - HangulVBase;
03186                                         if (0 <= VIndex && VIndex < HangulVCount) {
03187                                                 cpCombined = HangulSBase + (LIndex * HangulVCount + VIndex) * HangulTCount;
03188                                                 break; } }
03189                                 // - Try to combine LV and T.
03190                                 const int SIndex = cpLastStarter - HangulSBase;
03191                                 if (0 <= SIndex && SIndex < HangulSCount && (SIndex % HangulTCount) == 0)
03192                                 {
03193                                         const int TIndex = cp - HangulTBase;
03194                                         if (0 <= TIndex && TIndex < HangulTCount) {
03195                                                 cpCombined = cpLastStarter + TIndex;
03196                                                 break; }
03197                                 }
03198                         } while (false);
03199                         // If a combining character has been found, use it to replace the old cpStarter.
03200                         if (cpCombined >= 0) {
03201                                 dest[TVecIdx(lastStarterPos)] = cpCombined;
03202                                 Assert(GetCombiningClass(cpCombined) == TUniChInfo::ccStarter);
03203                                 // if (cpCombined is not a starter) { starterKnown = false; lastStarterPos = size_t(01); cpLastStarter = -1; } else
03204                                 cpLastStarter = cpCombined; continue; }
03205                 }
03206                 if (cpClass == TUniChInfo::ccStarter) { // 'cp' is a starter, remember it for later.  Set ccMax to -1 so that this starter can be combined with another starter.
03207                         lastStarterKnown = true; lastStarterPos = dest.Len(); cpLastStarter = cp; ccMax = cpClass - 1; }
03208                 else if (cpClass > ccMax) // Remember cp's class as the new maximum class since the last starter (for blocking).
03209                         ccMax = cpClass;
03210                 dest.Add(cp);
03211         }
03212 }
03213
03214 template<typename TSrcVec, typename TDestCh>
03215 size_t TUniChDb::ExtractStarters(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
03216                 TVec<TDestCh>& dest, bool clrDest) const
03217 {
03218         if (clrDest) dest.Clr();
03219         size_t retVal = 0;
03220         for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
03221                 const int cp = src[TVecIdx(srcIdx)];
03222                 if (GetCombiningClass(cp) == TUniChInfo::ccStarter)
03223                         { dest.Add(cp); retVal++; } }
03224         return retVal;
03225 }
03226
// Returns false.  The value is computed (0 + 1 + 2 + 3 + 4 = 10, which is not > 100)
// instead of being written as a literal -- presumably so that it is not treated
// as a compile-time constant; TODO confirm against the callers.
inline bool AlwaysFalse()
{
    int total = 0;
    int k = 0;
    while (k < 5) { total += k; ++k; }
    return total > 100;
}
03233
// Returns true.  The value is computed (0 + 1 + 2 + 3 + 4 = 10, which is < 100)
// instead of being written as a literal -- presumably so that it is not treated
// as a compile-time constant; TODO confirm against the callers.
inline bool AlwaysTrue()
{
    int total = 0;
    int k = 0;
    while (k < 5) { total += k; ++k; }
    return total < 100;
}
03240
03241 /*
03242
03243 Notes on decomposition:
03244
03245 - In UnicodeData.txt, there is a field with the decomposition mapping.
03246   This field may also include a tag, <...>.
03247   If there is a tag, this is a compatibility mapping.
03248   Otherwise it is a canonical mapping.
03249 - Canonical decomposition uses only canonical mappings,
03250   compatibility decomposition uses both canonical and compatibility mappings.
03251 - Decomposition:
03252   1. Apply the decomposition mappings (canonical or canonical+compatibility), recursively.
03253   2. Put the string into canonical order, which means:
03254      while there exists a pair of characters, A immediately followed by B,
03255          such that combiningclass(A) > combiningclass(B) > 0  [an "exchangeable pair"]:
03256            swap A and B;
03257   This results in NFD (normalized form D, after canonical decomposition)
03258   or NFKD (normalized form KD, after compatibility decomposition).
- Canonical composition:
  1. Before composition, the string should have been decomposed
     (using either canonical or compatibility decomposition).
  2. For each character C (from left to right):
     2.1.  Find the last starter S before C (if not found, continue).
     2.2.  If there is, between S and C, some character with a combining class
           greater than or equal to that of C, then continue.
     2.3.  If there exists a character L for which the canonical decomposition is S+C
           and L is not in the composition exclusion table [i.e. L is a "primary composite"],
           then replace S by L, and remove C.
  This results in NFC (normalized form C, with canonical decomposition followed by canonical composition)
  or NFKC (normalized form KC, with compatibility decomposition followed by canonical composition).
03270 - Composition exclusion table:
03271   - Anything in CompositionExclusions.txt.
03272   - Singletons: characters whose canonical decomposition is a single character.
03273   - Non-starter decompositions: characters whose canonical decomposition begins with a non-starter.
03274
Example:
  E-grave  (00c8; combining class 0; canonical decomposition: 0045 0300)
  E-macron (0112; combining class 0; canonical decomposition: 0045 0304)
  grave    (0300; combining class 230)
  macron   (0304; combining class 230)
  source string: 00c8 0304
  after canonical decomposition (or compatibility decomposition, they would be the same here): 0045 0300 0304
  after canonical composition: 00c8 0304
03283
03284   cc(horn) = 216
03285   cc(dot below) = 220
03286   cc(dot above) = 230
03287
03288 ToDos:
- case folding - intended mainly for comparing the strings obtained this way.
  The function f(s) = NFC(toCaseFold(s)) is idempotent.
  The function g(s) = NFKC(toCaseFold(s)), however, is not -- if we want that, the folding
  must also take some additional mappings into account (see 5.18, last paragraph; DerivedNormalizationProps.txt).
- It seems that CaseFolding.txt is essentially just ordinary folding to lowercase.
  Since we also want the other foldings, let us rather look at SpecialCasing.txt
  (+ the simple case mappings in UnicodeData.txt).
  I suggest that, when reading SpecialCasing.txt, we simply ignore the conditional mappings
  and then handle them separately, directly in the source code of our programs [for a
  detailed definition of the conditions, see table 3.13].
  - Postscript: it nevertheless seems to me that CaseFolding.txt is something slightly different from ordinary lowercase.
    For example, for small final sigma 03c2 it says there that it should be changed into the ordinary small sigma 03c3.
    This follows neither from UnicodeData.txt nor from SpecialCasing.txt, even though UCD.html says
    that CaseFolding.txt is derived from the two of them.  The main purpose of CaseFolding.txt is supposedly
    to serve the needs of "locale-independent case folding" (table 4.1 and sec. 5.18).
  - Before you start dealing with case conversions, have a look at section 3.13,
    and especially p. 90.
  - See p. 91 about the combination N[K]FD + caseFold + N[K]FD
  - the definition of cased etc. is on p. 89
- isIdentifierStart(c), isIdentifierEnd(c) -- sec. 5.15
  See DerivedCoreProperties.txt, where a number of similar things are defined in a similar way,
  among them isLowerCase and isUpperCase.  isLetter, isAlphabetic etc. are there as well (sec. 4.9).
  It would be best to add these to the flags of each individual character.
- general category: sec. 4.5
- motivation for titlecase: 5.18
- compare our current computation of compositionExclusion with what is computed in DerivedNormalizationProps.txt
  under Full_Composition_Exclusion
- script names: Scripts.txt and UAX #24.
- block names: Blocks.txt
- space characters: table 6.2 and reportedly also UCD.html
- dash characters: table 6.3
03320 */
03321
03322 //#endif
03323