SNAP Library, Developer Reference  2013-01-07 14:03:36
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
unicode.h
Go to the documentation of this file.
00001 #include "bd.h"
00002 
00003 //#ifndef unicode_h
00004 //#define unicode_h
00005 
00007 // Includes
00008 //#include "base.h"
00009 #include <new>
00010 
// Index type the codecs cast to before subscripting a vector (see the TVecIdx aliases below).
typedef int TUniVecIdx;
00012 
00013 //-----------------------------------------------------------------------------
00014 // TUniCodec -- a UTF-8 and UTF-16 Encoder/Decoder
00015 //-----------------------------------------------------------------------------
00016 
00017 // Error handling modes for the TUniCodec class.
// Error handling modes for the codec classes: what happens when an error
// is encountered in the source data.
typedef enum TUnicodeErrorHandling_ {
  uehIgnore,   // (0) the error is silently ignored; nothing is added to the output vector
  uehThrow,    // (1) an exception (TUnicodeException) is thrown
  uehReplace,  // (2) the replacement character is added to the output vector
  uehAbort     // (3) the encoding/decoding process stops immediately
} TUnicodeErrorHandling;
00027 
00028 class TUnicodeException
00029 {
00030 public:
00031         TStr message;  // error message
00032         size_t srcIdx; // the position in the source vector where the error occurred
00033         int srcChar;   // the source character at the position srcIdx
00034         TUnicodeException(size_t srcIdx_, int srcChar_, const TStr& message_) :
00035                 message(message_), srcIdx(srcIdx_), srcChar(srcChar_) { }
00036 };
00037 
// Byte order selector used by the UTF-16 routines.
typedef enum TUniByteOrder_ {
  boMachineEndian,  // (0)
  boLittleEndian,   // (1)
  boBigEndian       // (2)
} TUniByteOrder;
00045 
// How the UTF-16 decoder treats a byte-order mark (BOM) at the start of the data.
typedef enum TUtf16BomHandling_ {
  // (0) If a BOM is present, it determines the byte order;
  //     otherwise the default byte order is used.
  bomAllowed,
  // (1) If a BOM is present, it determines the byte order;
  //     otherwise an error is reported.
  bomRequired,
  // (2) The default byte order is used; a BOM, if present,
  //     is treated like any other character.
  bomIgnored
} TUtf16BomHandling;
00053 
00054 class TUniCodec
00055 {
00056 public:
00057         // 0xfffd is defined as the replacement character by the Unicode standard.
00058         // By default, it is rendered as a question mark inside a diamond: "<?>".
00059         enum { DefaultReplacementChar = 0xfffd };
00060 
00061         // The replacement character is inserted into the destination vector
00062         // if an error occurs in the source vector.  By default, this is set
00063         // to DefaultReplacementChar.
00064         int replacementChar;
00065         // The error handling mode.
00066         TUnicodeErrorHandling errorHandling;
00067         // There are a number of situations where there is strictly speaking an error in
00068         // the source data although it can still be decoded in a reasonably meaningful way.
00069         // If strict == true, these situations are treated as errors.  Examples:
00070         // - when decoding UTF-8:
00071         //   - a codepoint represented by more bytes than necessary (e.g. one of the characters 0..127
00072         //     encoded as a two-byte sequence)
00073         //   - a codepoint > 0x10ffff
00074         // - when decoding UTF-16:
00075         //   - a codepoint from the range reserved for the second character of a surrogate pair
00076         //     is not preceded by a codepoint from the range reserved for the first character of a surrogate pair
00077         // - when encoding UTF-8:
00078         //   - a codepoint > 0x10ffff
00079         // - when encoding UTF-16:
00080         //   - a codepoint from the range reserved from the second character of a surrogate pair
00081         //     [note that a codepoint > 0x10ffff, or from the range reserved for the first character of a
00082         //     surrogate pair, is always an error, even with strict == false]
00083         bool strict;
00084         // skipBom == true means: If a byte-order-mark (0xfffe or 0xfeff) occurs at the beginning
00085         // of the source vector, it is skipped (when decoding).
00086         // - Note: a BOM is not really useful in UTF-8 encoded data.  However, the .NET UTF8Encoding
00087         //   emits 0xfeff by default as a kind of preamble.  It gets encoded as 3 bytes, ef bb bf,
00088         //   and can be helpful to make the data easier to recognize as UTF-8 encoded data.
00089         bool skipBom;
00090 
00091         TUniCodec() : replacementChar(DefaultReplacementChar), errorHandling(uehIgnore), strict(false), skipBom(true)
00092         {
00093         }
00094 
00095         TUniCodec(TUnicodeErrorHandling errorHandling_, bool strict_, int replacementChar_, bool skipBom_) :
00096                 replacementChar(replacementChar_), errorHandling(errorHandling_), strict(strict_), skipBom(skipBom_)
00097         {
00098         }
00099 
00100 protected:
00101         enum {
00102 #define DefineByte(b7, b6, b5, b4, b3, b2, b1, b0) _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0
00103                 DefineByte(1, 0, 0, 0, 0, 0, 0, 0),
00104                 DefineByte(1, 1, 0, 0, 0, 0, 0, 0),
00105                 DefineByte(1, 1, 1, 0, 0, 0, 0, 0),
00106                 DefineByte(1, 1, 1, 1, 0, 0, 0, 0),
00107                 DefineByte(1, 1, 1, 1, 1, 0, 0, 0),
00108                 DefineByte(1, 1, 1, 1, 1, 1, 0, 0),
00109                 DefineByte(1, 1, 1, 1, 1, 1, 1, 0),
00110                 DefineByte(0, 0, 1, 1, 1, 1, 1, 1),
00111                 DefineByte(0, 0, 0, 1, 1, 1, 1, 1),
00112                 DefineByte(0, 0, 0, 0, 1, 1, 1, 1),
00113                 DefineByte(0, 0, 0, 0, 0, 1, 1, 1),
00114                 DefineByte(0, 0, 0, 0, 0, 0, 1, 1)
00115 #undef DefineByte
00116         };
00117 
00118         typedef TUniVecIdx TVecIdx;
00119         //friend class TUniChDb;
00120         friend class TUniCaseFolding;
00121 
00122 public:
00123 
00124         //-----------------------------------------------------------------------
00125         // UTF-8
00126         //-----------------------------------------------------------------------
00127 
00128         // Returns the number of characters that have been successfully decoded.
00129         // This does not include any replacement characters that may have been inserted into 'dest'.
00130         template<typename TSrcVec, typename TDestCh>
00131         size_t DecodeUtf8(
00132                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00133                 TVec<TDestCh>& dest, const bool clrDest = true) const;
00134         template<typename TSrcVec, typename TDestCh>
00135         size_t DecodeUtf8(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return DecodeUtf8(src, 0, src.Len(), dest, clrDest); }
00136 
00137         // Returns the number of characters that have been successfully encoded.
00138         // This does not include any replacement characters that may have been inserted into 'dest'.
00139         template<typename TSrcVec, typename TDestCh>
00140         size_t EncodeUtf8(
00141                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00142                 TVec<TDestCh>& dest, const bool clrDest = true) const;
00143         template<typename TSrcVec, typename TDestCh>
00144         size_t EncodeUtf8(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return EncodeUtf8(src, 0, src.Len(), dest, clrDest); }
00145 
00146         // The following wrappers around the UTF-8 encoder return a TStr containing
00147         // the UTF-8-encoded version of the input string.
00148         template<typename TSrcVec> TStr EncodeUtf8Str(const TSrcVec& src, size_t srcIdx, const size_t srcCount) const { TVec<char> temp; EncodeUtf8(src, srcIdx, srcCount, temp); TStr retVal = &(temp[0]); return retVal; }
00149         template<typename TSrcVec> TStr EncodeUtf8Str(const TSrcVec& src) const { TVec<char> temp; EncodeUtf8(src, temp); temp.Add(0); TStr retVal = &(temp[0]); return retVal; }
00150 
00151         //-----------------------------------------------------------------------
00152         // UTF-16 Decoder
00153         //-----------------------------------------------------------------------
00154 
00155 protected:
00156         enum {
00157                 Utf16FirstSurrogate = 0xd800,
00158                 Utf16SecondSurrogate = 0xdc00
00159         };
00160 
00161         static bool IsMachineLittleEndian();
00162 
00163 public:
00164 
00165         // Returns the number of characters that have been successfully decoded.
00166         // This does not include any replacement characters that may have been inserted into 'dest'.
00167         // Each element of 'src' is assumed to contain one byte of data.
00168         // srcCount must be even (though srcIdx doesn't need to be).
00169         template<typename TSrcVec, typename TDestCh>
00170         size_t DecodeUtf16FromBytes(
00171                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00172                 TVec<TDestCh>& dest, const bool clrDest,
00173                 const TUtf16BomHandling bomHandling = bomAllowed,
00174                 const TUniByteOrder defaultByteOrder = boMachineEndian) const;
00175 
00176         // Here, each element of 'src' is treated as a 16-bit word.  The byte-order settings
00177         // are used to determine if the two bytes of each word should be swapped before further
00178         // processing.  For example, if a BOM is present, it must have the value 0xfeff; if it
00179         // actually has the value 0xfffe, this means that the two bytes of each word must be swapped.
00180         // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the
00181         // beginning of the source data is used to determine the "original" byte order of the data;
00182         // if this doesn't match the byte order of the local machine, the two bytes of each word will
00183         // be swapped during the decoding process.
00184         template<typename TSrcVec, typename TDestCh>
00185         size_t DecodeUtf16FromWords(
00186                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00187                 TVec<TDestCh>& dest, bool clrDest,
00188                 const TUtf16BomHandling bomHandling = bomAllowed,
00189                 const TUniByteOrder defaultByteOrder = boMachineEndian) const;
00190 
00191         //-----------------------------------------------------------------------
00192         // UTF-16 Encoder
00193         //-----------------------------------------------------------------------
00194 
00195         // Returns the number of characters that have been successfully encoded.
00196         // This does not include any replacement characters that may have been inserted into 'dest'.
00197         //
00198         // Notes:
00199         // - UTF-16 *cannot* encode characters above 0x10ffff, so their presence is always
00200         //   treated as an error, regardless of the value of 'strict'.
00201         // - Characters from the range Utf16FirstSurrogate through Utf16FirstSurrogate + 1023
00202         //   cannot be encoded by UTF-16 either, as they would be misinterpreted during decoding
00203         //   as the first character of a surrogate pair.
00204         // - Characters from the range Utf16SecondSurrogate through Utf16SecondSurrogate + 1023
00205         //   can be encoded in principle; however, if strict == true, they are treated as errors.
00206         template<typename TSrcVec, typename TDestCh>
00207         size_t EncodeUtf16ToWords(
00208                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00209                 TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
00210                 const TUniByteOrder destByteOrder = boMachineEndian) const;
00211 
00212         template<typename TSrcVec, typename TDestCh>
00213         size_t EncodeUtf16ToBytes(
00214                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00215                 TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
00216                 const TUniByteOrder destByteOrder = boMachineEndian) const;
00217 
00218         //-----------------------------------------------------------------------
00219         // Helper declarations for the test drivers
00220         //-----------------------------------------------------------------------
00221 
00222 protected:
00223 
00224         static uint GetRndUint(TRnd& rnd);
00225         static uint GetRndUint(TRnd& rnd, uint minVal, uint maxVal);
00226 
00227         //-----------------------------------------------------------------------
00228         // UTF-8 Test Driver
00229         //-----------------------------------------------------------------------
00230 
00231 protected:
00232         void TestUtf8(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest, FILE *f);
00233         // Generates a random UTF-8-encoded stream according to the specifications in 'testCaseDesc',
00234         // then calls TestUtf8 to make sure that DecodeUtf8 reacts as expected.
00235         void TestDecodeUtf8(TRnd& rnd, const TStr& testCaseDesc);
00236 public:
00237         void TestUtf8();
00238 
00239         //-----------------------------------------------------------------------
00240         // UTF-16 Test Driver
00241         //-----------------------------------------------------------------------
00242 
00243 protected:
00244         void WordsToBytes(const TIntV& src, TIntV& dest);
00245         void TestUtf16(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest,
00246                 // Note: insertBom is only used with the encoder.  When encoding, 'defaultByteOrder' is used as the destination byte order.
00247                 const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom,
00248                 FILE *f);
00249         static inline int SwapBytes(int x) {
00250                 return ((x >> 8) & 0xff) | ((x & 0xff) << 8); }
00251         // Generates a random UTF-16-encoded stream according to the specifications in 'testCaseDesc',
00252         // then calls TestUtf16 to make sure that DecodeUtf16 reacts as expected.
00253         void TestDecodeUtf16(TRnd& rnd, const TStr& testCaseDesc,
00254                 const TUtf16BomHandling bomHandling,
00255                 const TUniByteOrder defaultByteOrder,
00256                 const bool insertBom);
00257 public:
00258         void TestUtf16();
00259 
00260 };
00261 
00262 //-----------------------------------------------------------------------------
00263 // Case folding
00264 //-----------------------------------------------------------------------------
00265 // Note: there's no need to access this class directly.
00266 // Use TUniChDb::GetCaseFolded() instead.
00267 
00268 typedef THash<TInt, TIntV> TIntIntVH;
00269 
00270 class TUniCaseFolding
00271 {
00272 protected:
00273         TIntH cfCommon, cfSimple, cfTurkic;
00274         TIntIntVH cfFull;
00275 
00276         template<typename TSrcDat, typename TDestDat>
00277         inline static void AppendVector(const TVec<TSrcDat>& src, TVec<TDestDat>& dest) {
00278                 for (int i = 0; i < src.Len(); i++) dest.Add(src[i]); }
00279         friend class TUniChDb;
00280         typedef TUniVecIdx TVecIdx;
00281 
00282 public:
00283         TUniCaseFolding() { }
00284         explicit TUniCaseFolding(TSIn& SIn) : cfCommon(SIn), cfSimple(SIn), cfTurkic(SIn), cfFull(SIn) { SIn.LoadCs(); }
00285         void Load(TSIn& SIn) { cfCommon.Load(SIn); cfSimple.Load(SIn); cfFull.Load(SIn); cfTurkic.Load(SIn); SIn.LoadCs(); }
00286         void Save(TSOut& SOut) const { cfCommon.Save(SOut); cfSimple.Save(SOut); cfFull.Save(SOut); cfTurkic.Save(SOut); SOut.SaveCs(); }
00287         void Clr() { cfCommon.Clr(); cfSimple.Clr(); cfFull.Clr(); cfTurkic.Clr(); }
00288         void LoadTxt(const TStr& fileName);
00289 
00290         // Use 'turkic' when processing text in a Turkic language (tr, az).  This only affects the uppercase I and I-with-dot-above.
00291         template<typename TSrcVec, typename TDestCh>
00292         void Fold(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00293                 TVec<TDestCh>& dest, const bool clrDest, const bool full, const bool turkic) const
00294         {
00295                 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
00296                 {
00297                         int c = src[TVecIdx(srcIdx)], i; srcIdx++;
00298                         if (turkic && ((i = cfTurkic.GetKeyId(c)) >= 0)) { dest.Add(cfTurkic[i]); continue; }
00299                         if (full && ((i = cfFull.GetKeyId(c)) >= 0)) { AppendVector(cfFull[i], dest); continue; }
00300                         if ((! full) && ((i = cfSimple.GetKeyId(c)) >= 0)) { dest.Add(cfSimple[i]); continue; }
00301                         i = cfCommon.GetKeyId(c); if (i >= 0) dest.Add(cfCommon[i]); else dest.Add(c);
00302                 }
00303         }
00304 
00305         template<typename TSrcVec>
00306         void FoldInPlace(TSrcVec& src, size_t srcIdx, const size_t srcCount, const bool turkic) const
00307         {
00308                 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
00309                 {
00310                         int c = src[TVecIdx(srcIdx)], i;
00311                         if (turkic && ((i = cfTurkic.GetKeyId(c)) >= 0)) { src[TVecIdx(srcIdx)] = cfTurkic[i]; continue; }
00312                         if ((i = cfSimple.GetKeyId(c)) >= 0) { src[TVecIdx(srcIdx)] = cfSimple[i]; continue; }
00313                         i = cfCommon.GetKeyId(c); if (i >= 0) src[TVecIdx(srcIdx)] = cfCommon[i];
00314                 }
00315         }
00316 
00317 protected:
00318         void Test(const TIntV& src, const TIntV& expectedDest, const bool full, const bool turkic, FILE *f);
00319 public:
00320         void Test();
00321 };
00322 
00323 //-----------------------------------------------------------------------------
00324 // TCodecBase -- an abstract base class for codecs
00325 //-----------------------------------------------------------------------------
00326 
00327 class TCodecBase;
00328 typedef TPt<TCodecBase> PCodecBase;
00329 typedef TVec<PCodecBase> TCodecBaseV;
00330 
00331 class TCodecBase
00332 {
00333 protected:
00334         TCRef CRef;
00335         friend class TPt<TCodecBase>;
00336 public:
00337         virtual ~TCodecBase() { }
00338 
00339         template<class TCodecImpl>
00340         static PCodecBase New(); /* {
00341                 return new TCodecWrapper<TCodecImpl>(); } */
00342 
00343         virtual TStr GetName() const = 0;
00344         virtual void Test() const { }
00345 
00346         // Returns the number of characters that have been successfully decoded.
00347         // This does not include any replacement characters that may have been inserted into 'dest'.
00348         virtual size_t ToUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
00349         virtual size_t ToUnicode(const TStr& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
00350 
00351         size_t ToUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
00352         size_t ToUnicode(const TStr& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
00353 
00354         // Returns the number of characters that have been successfully encoded.
00355         // This does not include any replacement characters that may have been inserted into 'dest'.
00356         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
00357         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TChA& dest, const bool clrDest = true) const = 0;
00358         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const = 0;
00359 
00360         size_t FromUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
00361         size_t FromUnicode(const TIntV& src, TChA& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
00362         size_t FromUnicode(const TIntV& src, TStr& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
00363 };
00364 
00365 //-----------------------------------------------------------------------------
00366 // TCodecWrapper -- a descendant of TCodecBase; relies on a template
00367 // parameter class for the actual implementation of the codec.
00368 //-----------------------------------------------------------------------------
00369 // Thus, if you know in advance that you'll need ISO-8859-2, just use
00370 // T8BitCodec<TEncoding_ISO8859_2>.  If you don't know the encoding
00371 // in advance, use a PCodecBase pointing to a suitable specialization
00372 // of TCodecWrapper<...>.  You can call TUnicode::GetCodec(TStr& name)
00373 // to obtain a suitable pointer.
00374 
00375 template<class TCodecImpl_>
00376 class TCodecWrapper : public TCodecBase
00377 {
00378 public:
00379         typedef TCodecImpl_ TCodecImpl;
00380         TCodecImpl impl;
00381 public:
00382 
00383         virtual TStr GetName() const { return impl.GetName(); }
00384 
00385         virtual void Test() const { impl.Test(); }
00386 
00387         virtual size_t ToUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
00388                 return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); }
00389         virtual size_t ToUnicode(const TStr& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
00390                 return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); }
00391 
00392         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
00393                 return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); }
00394         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TChA& dest, const bool clrDest = true) const {
00395                 return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); }
00396         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const {
00397                 TChA buf; size_t retVal = impl.FromUnicode(src, srcIdx, srcCount, buf, false);
00398                 if (clrDest) dest += buf.CStr(); else dest = buf.CStr();
00399                 return retVal; }
00400 };
00401 
00402 template<class TCodecImpl>
00403 PCodecBase TCodecBase::New() {
00404   return new TCodecWrapper<TCodecImpl>();
00405 }
00406 
00407 //-----------------------------------------------------------------------------
00408 // TVecElt -- a template for determining the type of a vector's elements
00409 //-----------------------------------------------------------------------------
00410 
// Primary template: intentionally empty.  The element type and the Add()
// helper are supplied only by the specializations for supported containers.
template<class TVector_>
class TVecElt {
};
00415 
00416 template<class TDat>
00417 class TVecElt<TVec<TDat> >
00418 {
00419 public:
00420         typedef TVec<TDat> TVector;
00421         typedef TDat TElement;
00422         static inline void Add(TVector& vector, const TElement& element) { vector.Add(element); }
00423 };
00424 
00425 template<>
00426 class TVecElt<TChA>
00427 {
00428 public:
00429         typedef TChA TVector;
00430         typedef char TElement;
00431         static inline void Add(TVector& vector, const TElement& element) { vector += element; }
00432 };
00433 
00434 
00435 //-----------------------------------------------------------------------------
00436 // T8BitCodec -- a class for converting between 8-bit encodings and Unicode
00437 //-----------------------------------------------------------------------------
00438 
00439 class TEncoding_ISO8859_1
00440 {
00441 public:
00442         static inline TStr GetName() { return "ISO-8859-1"; }
00443         static int ToUnicode(int c) { Assert(0 <= c && c <= 255); return c; }
00444         static int FromUnicode(int c) { if (0 <= c && c <= 255) return c; else return -1; }
00445 };
00446 
00447 class TEncoding_ISO8859_2 // ISO Latin 2
00448 {
00449 public:
00450         static inline TStr GetName() { return "ISO-8859-2"; }
00451         static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16];
00452         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00453                 if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
00454         static int FromUnicode(int c) {
00455                 if (0 <= c && c < 0xa0) return c;
00456                 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
00457                 else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
00458                 else return -1; }
00459 };
00460 
00461 class TEncoding_ISO8859_3
00462 {
00463 public:
00464         static inline TStr GetName() { return "ISO-8859-3"; }
00465         static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2];
00466         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00467                 if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
00468         static int FromUnicode(int c) {
00469                 if (0 <= c && c < 0xa0) return c;
00470                 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
00471                 else if (0x2d8 <= c && c < 0x2da) return fromUnicodeTable2[c - 0x2d8];
00472                 else return -1; }
00473 };
00474 
00475 class TEncoding_ISO8859_4
00476 {
00477 public:
00478         static inline TStr GetName() { return "ISO-8859-4"; }
00479         static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16];
00480         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00481                 if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
00482         static int FromUnicode(int c) {
00483                 if (0 <= c && c < 0xa0) return c;
00484                 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
00485                 else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
00486                 else return -1; }
00487 };
00488 
00489 class TEncoding_YuAscii
00490 {
00491 public:
00492         static const int uniChars[10], yuAsciiChars[10];
00493         static inline TStr GetName() { return "YU-ASCII"; }
00494         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00495                 for (int i = 0; i < int(sizeof(yuAsciiChars) / sizeof(yuAsciiChars[0])); i++)
00496                         if (c == yuAsciiChars[i]) return uniChars[i];
00497                 return c; }
00498         static int FromUnicode(int c) {
00499                 for (int i = 0; i < int(sizeof(uniChars) / sizeof(uniChars[0])); i++)
00500                         if (c == uniChars[i]) return yuAsciiChars[i];
00501                         else if(c == yuAsciiChars[i]) return -1;
00502                 if (0 <= c && c <= 255) return c; else return -1; }
00503 };
00504 
00505 class TEncoding_CP437 // DOS US
00506 {
00507 public:
00508         static inline TStr GetName() { return "CP437"; }
00509         static const int toUnicodeTable[8 * 16], fromUnicodeTable1[6 * 16], fromUnicodeTable2[4 * 16], fromUnicodeTable3[6 * 16], fromUnicodeTable4[11 * 16];
00510         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00511                 if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
00512         static int FromUnicode(int c) {
00513                 if (0 <= c && c < 0x80) return c;
00514                 else if (0xa0 <= c && c < 0x100) return fromUnicodeTable1[c - 0xa0];
00515                 else if (0x390 <= c && c < 0x3d0) return fromUnicodeTable2[c - 0x390];
00516                 else if (0x2210 <= c && c < 0x2270) return fromUnicodeTable3[c - 0x2210];
00517                 else if (0x2500 <= c && c < 0x25b0) return fromUnicodeTable4[c - 0x2500];
00518                 else if (c == 0x192) return 0x9f;
00519                 else if (c == 0x207f) return 0xfc;
00520                 else if (c == 0x20a7) return 0x9e;
00521                 else if (c == 0x2310) return 0xa9;
00522                 else if (c == 0x2320) return 0xf4;
00523                 else if (c == 0x2321) return 0xf5;
00524                 else return -1; }
00525 };
00526 
00527 class TEncoding_CP852 // DOS Latin 2
00528 {
00529 public:
00530         static inline TStr GetName() { return "CP852"; }
00531         static const int toUnicodeTable[8 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16], fromUnicodeTable3[11 * 16];
00532         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00533                 if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
00534         static int FromUnicode(int c) {
00535                 if (0 <= c && c < 0x80) return c;
00536                 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
00537                 else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
00538                 else if (0x2500 <= c && c < 0x25b0) return fromUnicodeTable3[c - 0x2500];
00539                 else return -1; }
00540 };
00541 
00542 class TEncoding_CP1250 // Windows-1250, similar to ISO Latin 2
00543 {
00544 public:
00545         static inline TStr GetName() { return "CP1250"; }
00546         static const int toUnicodeTable[8 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16], fromUnicodeTable3[3 * 16];
00547         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
00548                 if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
00549         static int FromUnicode(int c) {
00550                 if (0 <= c && c < 0x80) return c;
00551                 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
00552                 else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
00553                 else if (0x2010 <= c && c < 0x2040) return fromUnicodeTable3[c - 0x2010];
00554                 else if (c == 0x20ac) return 0x80;
00555                 else if (c == 0x2122) return 0x99;
00556                 else return -1; }
00557 };
00558 
00559 template<class TEncoding_>
00560 class T8BitCodec
00561 {
00562 protected:
00563         typedef TUniVecIdx TVecIdx;
00564 public:
00565         typedef TEncoding_ TEncoding;
00566         TUnicodeErrorHandling errorHandling;
00567         int replacementChar;
00568 
00569         T8BitCodec() : errorHandling(uehIgnore), replacementChar(TUniCodec::DefaultReplacementChar) { }
00570         T8BitCodec(TUnicodeErrorHandling errorHandling_, int replacementChar_ = TUniCodec::DefaultReplacementChar) :
00571                 errorHandling(errorHandling_), replacementChar(replacementChar_) { }
00572         static TStr GetName() { return TEncoding::GetName(); }
00573 
00574         void Test() const
00575         {
00576                 int nDecoded = 0;
00577                 for (int c = 0; c <= 255; c++) {
00578                         int cu = TEncoding::ToUnicode(c); if (cu == -1) continue;
00579                         nDecoded++;
00580                         IAssert(0 <= cu && cu < 0x110000);
00581                         int c2 = TEncoding::FromUnicode(cu);
00582                         IAssert(c2 == c); }
00583                 int nEncoded = 0;
00584                 for (int cu = 0; cu < 0x110000; cu++) {
00585                         int c = TEncoding::FromUnicode(cu); if (c == -1) continue;
00586                         nEncoded++;
00587                         IAssert(0 <= c && c <= 255);
00588                         int cu2 = TEncoding::ToUnicode(c);
00589                         IAssert(cu2 == cu); }
00590                 IAssert(nDecoded == nEncoded);
00591         }
00592 
00593         // Returns the number of characters that have been successfully decoded.
00594         // This does not include any replacement characters that may have been inserted into 'dest'.
00595         template<typename TSrcVec, typename TDestCh>
00596         size_t ToUnicode(
00597                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00598                 TVec<TDestCh>& dest, const bool clrDest = true) const
00599         {
00600                 if (clrDest) dest.Clr();
00601                 size_t toDo = srcCount;
00602                 while (toDo-- > 0) {
00603                         int chSrc = ((int) src[TVecIdx(srcIdx)]) & 0xff; srcIdx++;
00604                         int chDest = TEncoding::ToUnicode(chSrc);
00605                         dest.Add(chDest); }
00606                 return srcCount;
00607         }
00608         template<typename TSrcVec, typename TDestCh>
00609         size_t ToUnicode(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
00610 
00611         size_t ToUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
00612         size_t ToUnicode(const TStr& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
00613 
00614         // Returns the number of characters that have been successfully encoded.
00615         // This does not include any replacement characters that may have been inserted into 'dest'.
00616         template<typename TSrcVec, typename TDestVec>
00617         size_t FromUnicode(
00618                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,
00619                 TDestVec& dest, const bool clrDest = true) const
00620         {
00621                 typedef typename TVecElt<TDestVec>::TElement TDestCh;
00622                 if (clrDest) dest.Clr();
00623                 size_t toDo = srcCount, nEncoded = 0;
00624                 while (toDo-- > 0) {
00625                         int chSrc = (int) src[TVecIdx(srcIdx)]; srcIdx++;
00626                         int chDest = TEncoding::FromUnicode(chSrc);
00627                         if (chDest < 0) {
00628                                 switch (errorHandling) {
00629                                 case uehThrow: throw TUnicodeException(srcIdx - 1, chSrc, "Invalid character for encoding into " + GetName() + ".");
00630                                 case uehAbort: return nEncoded;
00631                                 case uehReplace: TVecElt<TDestVec>::Add(dest, TDestCh(replacementChar)); continue;
00632                                 case uehIgnore: continue;
00633                                 default: Fail; } }
00634                         TVecElt<TDestVec>::Add(dest, TDestCh(chDest)); nEncoded++; }
00635                 return nEncoded;
00636         }
00637 
00638         template<typename TSrcVec, typename TDestVec>
00639         size_t FromUnicode(const TSrcVec& src, TDestVec& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
00640 
00641         size_t UniToStr(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const {
00642                 TChA buf; size_t retVal = FromUnicode(src, srcIdx, srcCount, buf, false);
00643                 if (clrDest) dest += buf.CStr(); else dest = buf.CStr();
00644                 return retVal; }
00645         size_t UniToStr(const TIntV& src, TStr& dest, const bool clrDest = true) const { return UniToStr(src, 0, src.Len(), dest, clrDest); }
00646 };
00647 
// Ready-to-use codec types: one T8BitCodec instantiation per TEncoding_* class.
00648 typedef T8BitCodec<TEncoding_ISO8859_1> TCodec_ISO8859_1;
00649 typedef T8BitCodec<TEncoding_ISO8859_2> TCodec_ISO8859_2;
00650 typedef T8BitCodec<TEncoding_ISO8859_3> TCodec_ISO8859_3;
00651 typedef T8BitCodec<TEncoding_ISO8859_4> TCodec_ISO8859_4;
00652 typedef T8BitCodec<TEncoding_CP852> TCodec_CP852;
00653 typedef T8BitCodec<TEncoding_CP437> TCodec_CP437;
00654 typedef T8BitCodec<TEncoding_CP1250> TCodec_CP1250;
00655 typedef T8BitCodec<TEncoding_YuAscii> TCodec_YuAscii;
00656 
00657 //-----------------------------------------------------------------------------
00658 // Various declarations used by the Unicode Character Database
00659 //-----------------------------------------------------------------------------
00660 
00661 typedef enum TUniChCategory_
00662 {
00663 #define DefineUniCat(cat, c) uc ## cat = (int(uchar(c)) & 0xff)
00664         DefineUniCat(Letter, 'L'),             // ucLetter
00665         DefineUniCat(Mark, 'M'),
00666         DefineUniCat(Number, 'N'),
00667         DefineUniCat(Punctuation, 'P'),
00668         DefineUniCat(Symbol, 'S'),
00669         DefineUniCat(Separator, 'Z'),
00670         DefineUniCat(Other, 'C')
00671 #undef DefineUniCat
00672 }
00673 TUniChCategory;
00674 
00675 typedef enum TUniChSubCategory_
00676 {
00677 #define DefineUniSubCat(cat, subCat, c) uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff)
00678         DefineUniSubCat(Letter, Uppercase, 'u'),            // ucLetterUppercase
00679         DefineUniSubCat(Letter, Lowercase, 'l'),
00680         DefineUniSubCat(Letter, Titlecase, 't'),
00681         DefineUniSubCat(Letter, Modifier, 'm'),
00682         DefineUniSubCat(Letter, Other, 'o'),
00683         DefineUniSubCat(Mark, Nonspacing, 'n'),
00684         DefineUniSubCat(Mark, SpacingCombining, 'c'),
00685         DefineUniSubCat(Mark, Enclosing, 'e'),
00686         DefineUniSubCat(Number, DecimalDigit, 'd'),
00687         DefineUniSubCat(Number, Letter, 'l'),
00688         DefineUniSubCat(Number, Other, 'o'),
00689         DefineUniSubCat(Punctuation, Connector, 'c'),
00690         DefineUniSubCat(Punctuation, Dash, 'd'),
00691         DefineUniSubCat(Punctuation, Open, 's'),
00692         DefineUniSubCat(Punctuation, Close, 'e'),
00693         DefineUniSubCat(Punctuation, InitialQuote, 'i'),
00694         DefineUniSubCat(Punctuation, FinalQuote, 'f'),
00695         DefineUniSubCat(Punctuation, Other, 'o'),
00696         DefineUniSubCat(Symbol, Math, 'm'),
00697         DefineUniSubCat(Symbol, Currency, 'c'),
00698         DefineUniSubCat(Symbol, Modifier, 'k'),
00699         DefineUniSubCat(Symbol, Other, 'o'),
00700         DefineUniSubCat(Separator, Space, 's'),
00701         DefineUniSubCat(Separator, Line, 'l'),
00702         DefineUniSubCat(Separator, Paragraph, 'p'),
00703         DefineUniSubCat(Other, Control, 'c'),
00704         DefineUniSubCat(Other, Format, 'f'),
00705         DefineUniSubCat(Other, Surrogate, 's'),
00706         DefineUniSubCat(Other, PrivateUse, 'o'),
00707         DefineUniSubCat(Other, NotAssigned, 'n')
00708 }
00709 TUniChSubCategory;
00710 
// Bit flags stored in TUniChInfo::flags.  Every flag occupies its own bit;
// ucfWbMask, ucfSbMask and ucfDcpMask select the word-boundary,
// sentence-boundary and DerivedCoreProperties groups respectively.
typedef enum TUniChFlags_
{
        ucfCompatibilityDecomposition = 1, // if this flag is not set, the decomposition is canonical
        ucfCompositionExclusion = 1 << 1,       // from CompositionExclusions.txt
        // Flags used when searching for word boundaries.  See UAX #29.
        ucfWbFormat = 1 << 2,
        ucfWbKatakana = 1 << 3,
        ucfWbALetter = 1 << 4,
        ucfWbMidLetter = 1 << 5,
        ucfWbMidNum = 1 << 6,
        ucfWbNumeric = 1 << 7,
        ucfWbExtendNumLet = 1 << 8,
        // Flags used with sentence boundaries (Sep is also used with word boundaries).  See UAX #29.
        ucfSbSep = 1 << 9,
        ucfSbFormat = 1 << 10,
        ucfSbSp = 1 << 11,
        ucfSbLower = 1 << 12,
        ucfSbUpper = 1 << 13,
        ucfSbOLetter = 1 << 14,
        ucfSbNumeric = 1 << 15,
        ucfSbATerm = 1 << 16,
        ucfSbSTerm = 1 << 17,
        ucfSbClose = 1 << 18,
        // All sentence-boundary flags (bits 9..18).
        ucfSbMask = ucfSbSep | ucfSbFormat | ucfSbSp | ucfSbLower | ucfSbUpper | ucfSbOLetter | ucfSbNumeric | ucfSbATerm | ucfSbSTerm | ucfSbClose,
        // All word-boundary flags (bits 2..8), plus ucfSbSep, which is shared with the sentence-boundary group.
        ucfWbMask = ucfWbFormat | ucfWbKatakana | ucfWbALetter | ucfWbMidLetter | ucfWbMidNum | ucfWbNumeric | ucfWbExtendNumLet | ucfSbSep,
        // Flags from DerivedCoreProperties.txt.
        // [The comments are from UCD.html.]
        // - Characters with the Alphabetic property. For more information, see Chapter 4 in [Unicode].
        //   Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
        ucfDcpAlphabetic = 1 << 19,
        // - For programmatic determination of default-ignorable code points.
        //   New characters that should be ignored in processing (unless explicitly supported)
        //   will be assigned in these ranges, permitting programs to correctly handle the default
        //   behavior of such characters when not otherwise supported.  For more information, see
        //   UAX #29: Text Boundaries [Breaks].
        //   Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs + Noncharacters - White_Space - annotation characters
        //   [Examples: soft hyphen, zero-width space, noncharacters (e.g. U+fffe, U+ffff, U+1fffe, U+1ffff, etc.), surrogates, language tags, variation selectors]
        ucfDcpDefaultIgnorableCodePoint = 1 << 20,
        // - Characters with the Lowercase property.  For more information, see Chapter 4 in [Unicode].
        //   Generated from: Other_Lowercase + Ll
        ucfDcpLowercase = 1 << 21,
        // - For programmatic determination of grapheme cluster boundaries.
        //   For more information, see UAX #29: Text Boundaries [Breaks].
        //   Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend
        ucfDcpGraphemeBase = 1 << 22,
        // - For programmatic determination of grapheme cluster boundaries.
        //   For more information, see UAX #29: Text Boundaries [Breaks].
        //   Generated from: Other_Grapheme_Extend + Me + Mn
        //   Note: depending on an application's interpretation of Co (private use), they may be either
        //         in Grapheme_Base, or in Grapheme_Extend, or in neither.
        ucfDcpGraphemeExtend = 1 << 23,
        // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.
        ucfDcpIdStart = 1 << 24,
        ucfDcpIdContinue = 1 << 25,
        // - Characters with the Math property. For more information, see Chapter 4 in [Unicode].
        //   Generated from: Sm + Other_Math
        ucfDcpMath = 1 << 26,
        // - Characters with the Uppercase property. For more information, see Chapter 4 in [Unicode].
        //   Generated from: Lu + Other_Uppercase
        ucfDcpUppercase = 1 << 27,
        // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.
        ucfDcpXidStart = 1 << 28,
        ucfDcpXidContinue = 1 << 29,
        // All DerivedCoreProperties flags (bits 19..29).
        // No comma after this last enumerator: a trailing comma is not accepted before C++11.
        ucfDcpMask = ucfDcpAlphabetic | ucfDcpDefaultIgnorableCodePoint | ucfDcpLowercase | ucfDcpGraphemeBase | ucfDcpGraphemeExtend |
                ucfDcpIdStart | ucfDcpIdContinue | ucfDcpMath | ucfDcpUppercase | ucfDcpXidStart | ucfDcpXidContinue
}
TUniChFlags;
00778 
// Bit flags for the properties listed in PropList.txt; one bit per property
// (bits 0..20).  The descriptions below are taken from UCD.html.
typedef enum TUniChProperties_
{
        // ASCII_Hex_Digit: ASCII characters commonly used for the representation
        // of hexadecimal numbers.
        //   [= 0123456789abcdefABCDEF]
        ucfPrAsciiHexDigit = 1 << 0,
        // Bidi_Control: those format control characters which have specific functions
        // in the Bidirectional Algorithm.
        ucfPrBidiControl = 1 << 1,
        // Dash: those punctuation characters explicitly called out as dashes in the
        // Unicode Standard, plus compatibility equivalents to those.  Most of these have
        // the Pd General Category, but some have the Sm General Category because of
        // their use in mathematics.
        //     U+0002d  HYPHEN-MINUS
        //     U+0058a  ARMENIAN HYPHEN
        //     U+005be  HEBREW PUNCTUATION MAQAF
        //     U+01806  MONGOLIAN TODO SOFT HYPHEN
        //     U+02010  HYPHEN
        //     U+02011  NON-BREAKING HYPHEN
        //     U+02012  FIGURE DASH
        //     U+02013  EN DASH
        //     U+02014  EM DASH
        //     U+02015  HORIZONTAL BAR
        //     U+02053  SWUNG DASH
        //     U+0207b  SUPERSCRIPT MINUS
        //     U+0208b  SUBSCRIPT MINUS
        //     U+02212  MINUS SIGN
        //     U+02e17  DOUBLE OBLIQUE HYPHEN
        //     U+0301c  WAVE DASH
        //     U+03030  WAVY DASH
        //     U+030a0  KATAKANA-HIRAGANA DOUBLE HYPHEN
        //     U+0fe31  PRESENTATION FORM FOR VERTICAL EM DASH
        //     U+0fe32  PRESENTATION FORM FOR VERTICAL EN DASH
        //     U+0fe58  SMALL EM DASH
        //     U+0fe63  SMALL HYPHEN-MINUS
        //     U+0ff0d  FULLWIDTH HYPHEN-MINUS
        ucfPrDash = 1 << 2,
        // Deprecated: a machine-readable list of deprecated characters.  No characters
        // will ever be removed from the standard, but the usage of deprecated characters
        // is strongly discouraged.
        ucfPrDeprecated = 1 << 3,
        // Diacritic: characters that linguistically modify the meaning of another
        // character to which they apply.  Some diacritics are not combining characters,
        // and some combining characters are not diacritics.
        ucfPrDiacritic = 1 << 4,
        // Extender: characters whose principal function is to extend the value or shape
        // of a preceding alphabetic character.  Typical of these are length and
        // iteration marks.
        ucfPrExtender = 1 << 5,
        // Grapheme_Link: used in determining default grapheme cluster boundaries.
        // For more information, see UAX #29: Text Boundaries.
        ucfPrGraphemeLink = 1 << 6,
        // Hex_Digit: characters commonly used for the representation of hexadecimal
        // numbers, plus their compatibility equivalents.
        //   [= AsciiHexDigit + fullwidth digit {0..9} + fullwidth latin {small|capital} letter {a..f}]
        ucfPrHexDigit = 1 << 7,
        // Hyphen: those dashes used to mark connections between pieces of words, plus the
        // Katakana middle dot.  The Katakana middle dot functions like a hyphen, but is
        // shaped like a dot rather than a dash.
        //     U+0002d  HYPHEN-MINUS
        //     U+000ad  SOFT HYPHEN
        //     U+0058a  ARMENIAN HYPHEN
        //     U+01806  MONGOLIAN TODO SOFT HYPHEN
        //     U+02010  HYPHEN
        //     U+02011  NON-BREAKING HYPHEN
        //     U+02e17  DOUBLE OBLIQUE HYPHEN
        //     U+030fb  KATAKANA MIDDLE DOT
        //     U+0fe63  SMALL HYPHEN-MINUS
        //     U+0ff0d  FULLWIDTH HYPHEN-MINUS
        //     U+0ff65  HALFWIDTH KATAKANA MIDDLE DOT
        ucfPrHyphen = 1 << 8,
        // Ideographic: characters considered to be CJKV (Chinese, Japanese, Korean,
        // and Vietnamese) ideographs.
        ucfPrIdeographic = 1 << 9,
        // Join_Control: those format control characters which have specific functions
        // for control of cursive joining and ligation.
        ucfPrJoinControl = 1 << 10,
        // Logical_Order_Exception: there are a small number of characters that do not
        // use logical order.  These characters require special handling in most processing.
        ucfPrLogicalOrderException = 1 << 11,
        // Noncharacter_Code_Point: code points that are permanently reserved for internal use.
        ucfPrNoncharacterCodePoint = 1 << 12,
        // Pattern_Syntax / Pattern_White_Space: used for pattern syntax as described in
        // UAX #31: Identifier and Pattern Syntax.
        ucfPrPatternSyntax = 1 << 13,
        ucfPrPatternWhiteSpace = 1 << 14,
        // Quotation_Mark: those punctuation characters that function as quotation marks.
        //     U+00022  QUOTATION MARK
        //     U+00027  APOSTROPHE
        //     U+000ab  LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
        //     U+000bb  RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
        //     U+02018  LEFT SINGLE QUOTATION MARK
        //     U+02019  RIGHT SINGLE QUOTATION MARK
        //     U+0201a  SINGLE LOW-9 QUOTATION MARK
        //     U+0201b  SINGLE HIGH-REVERSED-9 QUOTATION MARK
        //     U+0201c  LEFT DOUBLE QUOTATION MARK
        //     U+0201d  RIGHT DOUBLE QUOTATION MARK
        //     U+0201e  DOUBLE LOW-9 QUOTATION MARK
        //     U+0201f  DOUBLE HIGH-REVERSED-9 QUOTATION MARK
        //     U+02039  SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        //     U+0203a  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        //     U+0300c  LEFT CORNER BRACKET
        //     U+0300d  RIGHT CORNER BRACKET
        //     U+0300e  LEFT WHITE CORNER BRACKET
        //     U+0300f  RIGHT WHITE CORNER BRACKET
        //     U+0301d  REVERSED DOUBLE PRIME QUOTATION MARK
        //     U+0301e  DOUBLE PRIME QUOTATION MARK
        //     U+0301f  LOW DOUBLE PRIME QUOTATION MARK
        //     U+0fe41  PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
        //     U+0fe42  PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
        //     U+0fe43  PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
        //     U+0fe44  PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
        //     U+0ff02  FULLWIDTH QUOTATION MARK
        //     U+0ff07  FULLWIDTH APOSTROPHE
        //     U+0ff62  HALFWIDTH LEFT CORNER BRACKET
        //     U+0ff63  HALFWIDTH RIGHT CORNER BRACKET
        ucfPrQuotationMark = 1 << 15,
        // Soft_Dotted: characters with a "soft dot", like i or j.  An accent placed on
        // these characters causes the dot to disappear.  An explicit _dot above_ can be
        // added where required, such as in Lithuanian.
        ucfPrSoftDotted = 1 << 16,
        // STerm: Sentence Terminal.  Used in UAX #29: Text Boundaries.
        //     U+00021  EXCLAMATION MARK
        //     U+0002e  FULL STOP
        //     U+0003f  QUESTION MARK
        //     U+0203c  DOUBLE EXCLAMATION MARK
        //     U+0203d  INTERROBANG
        //     U+02047  DOUBLE QUESTION MARK
        //     U+02048  QUESTION EXCLAMATION MARK
        //     U+02049  EXCLAMATION QUESTION MARK
        //     U+03002  IDEOGRAPHIC FULL STOP
        //     [plus many characters from other writing systems]
        ucfPrSTerm = 1 << 17,
        // Terminal_Punctuation: those punctuation characters that generally mark the end
        // of textual units.
        //   [JB note: this set contains more characters than STerm.  For example, it contains
        //   the comma, colon and semicolon, whereas STerm doesn't.]
        //     U+00021  EXCLAMATION MARK
        //     U+0002c  COMMA
        //     U+0002e  FULL STOP
        //     U+0003a  COLON
        //     U+0003b  SEMICOLON
        //     U+0003f  QUESTION MARK
        //     U+0203c  DOUBLE EXCLAMATION MARK
        //     U+0203d  INTERROBANG
        //     U+02047  DOUBLE QUESTION MARK
        //     U+02048  QUESTION EXCLAMATION MARK
        //     U+02049  EXCLAMATION QUESTION MARK
        //     [plus *lots* of characters from other writing systems]
        ucfPrTerminalPunctuation = 1 << 18,
        // Variation_Selector: indicates all those characters that qualify as Variation
        // Selectors.  For details on the behavior of these characters, see
        // StandardizedVariants.html and Section 16.4, Variation Selectors in [Unicode].
        ucfPrVariationSelector = 1 << 19,
        // White_Space: those separator characters and control characters which should be
        // treated by programming languages as "white space" for the purpose of parsing elements.
        //   Note: ZERO WIDTH SPACE and ZERO WIDTH NO-BREAK SPACE are not included,
        //         since their functions are restricted to line-break control.
        //         Their names are unfortunately misleading in this respect.
        //   Note: There are other senses of "whitespace" that encompass a different set of characters.
        //         [JB note: e.g. there's a BIDI class for whitespace ('WS') in UnicodeData.txt.
        //         There's also a "Sp" class in the sentence boundary algorithm, see UAX #29, sec. 5.1.]
        //   This includes the following characters:
        //     U+0009  <control>
        //     U+000a  <control>
        //     U+000b  <control>
        //     U+000c  <control>
        //     U+000d  <control>
        //     U+0020  SPACE
        //     U+0085  <control>
        //     U+00a0  NO-BREAK SPACE
        //     U+1680  OGHAM SPACE MARK
        //     U+180e  MONGOLIAN VOWEL SEPARATOR
        //     U+2000  EN QUAD
        //     U+2001  EM QUAD
        //     U+2002  EN SPACE
        //     U+2003  EM SPACE
        //     U+2004  THREE-PER-EM SPACE
        //     U+2005  FOUR-PER-EM SPACE
        //     U+2006  SIX-PER-EM SPACE
        //     U+2007  FIGURE SPACE
        //     U+2008  PUNCTUATION SPACE
        //     U+2009  THIN SPACE
        //     U+200a  HAIR SPACE
        //     U+2028  LINE SEPARATOR
        //     U+2029  PARAGRAPH SEPARATOR
        //     U+202f  NARROW NO-BREAK SPACE
        //     U+205f  MEDIUM MATHEMATICAL SPACE
        //     U+3000  IDEOGRAPHIC SPACE
        ucfPrWhiteSpace = 1 << 20
}
TUniChProperties;
00959 
// Bit flags for further properties from PropList.txt; one bit per property
// (bits 0..11).
typedef enum TUniChPropertiesX_
{
        // The "Other_*" properties are used to derive the properties in
        // DerivedCoreProperties.txt.
        ucfPxOtherAlphabetic = 1 << 0,
        ucfPxOtherDefaultIgnorableCodePoint = 1 << 1,
        ucfPxOtherGraphemeExtend = 1 << 2,
        ucfPxOtherIdContinue = 1 << 3,
        ucfPxOtherIdStart = 1 << 4,
        ucfPxOtherLowercase = 1 << 5,
        ucfPxOtherMath = 1 << 6,
        ucfPxOtherUppercase = 1 << 7,
        // The remaining properties are used in ideographic description sequences.
        ucfPxIdsBinaryOperator = 1 << 8,
        ucfPxIdsTrinaryOperator = 1 << 9,
        ucfPxRadical = 1 << 10,
        ucfPxUnifiedIdeograph = 1 << 11
}
TUniChPropertiesX;
00979 
00980 //-----------------------------------------------------------------------------
00981 // TUniChInfo -- contains information about a single Unicode codepoint
00982 //-----------------------------------------------------------------------------
00983 
00984 class TUniChInfo
00985 {
00986 public:
00987         enum { // combining classes (for 'combClass'); from UnicodeData.txt
00988                 ccStarter = 0, // 0: Spacing, split, enclosing, reordrant, and Tibetan subjoined
00989                 ccOverlaysAndInterior = 1,
00990                 ccNuktas = 7,
00991                 ccHiraganaKatakanaVoicingMarks = 8,
00992                 ccViramas = 9,
00993                 ccFixedPositionStart = 10, // Start of fixed position classes
00994                 ccFixedPositionEnd = 199, // End of fixed position classes
00995                 ccBelowLeftAttached = 200,
00996                 ccBelowAttached = 202,
00997                 ccBelowRightAttached = 204,
00998                 ccLeftAttached = 208, // Left attached (reordrant around single base character)
00999                 ccRightAttached = 210,
01000                 ccAboveLeftAttached = 212,
01001                 ccAboveAttached = 214,
01002                 ccAboveRightAttached = 216,
01003                 ccBelowLeft = 218,
01004                 ccBelow = 220,
01005                 ccBelowRight = 222,
01006                 ccLeft = 224, // Left (reordrant around single base character)
01007                 ccRight = 226,
01008                 ccAboveLeft = 228,
01009                 ccAbove = 230,
01010                 ccAboveRight = 232,
01011                 ccDoubleBelow = 233,
01012                 ccDoubleAbove = 234,
01013                 ccBelowIotaSubscript = 240, // Below (iota subscript)
01014                 ccInvalid = 255 // not defined by Unicode
01015         };
01016         char chCat, chSubCat; // chCat + chSubCat together comprise the general category (from UnicodeData.txt)
01017         uchar combClass; // canonical combining class
01018         TUniChCategory cat; // = TUniChCategory(chCat)
01019         TUniChSubCategory subCat; // = TUniChSubCategory(cat << 8 | subCat)
01020         signed char script; // keyId into 'TUniChDb.scriptNames'; -1 if unknown
01021         int simpleUpperCaseMapping, simpleLowerCaseMapping, simpleTitleCaseMapping; // from UnicodeData.txt
01022         int decompOffset; // offset into 'TUniChDb.decompositions'; or -1 if the character doesn't change during decomposition
01023         int nameOffset; // offset into 'TUniChDb.charNames'
01024         int flags; // a combination of TUniChFlags
01025         int properties; // a combination of TUniChProperties
01026         int propertiesX; // a combination of TUniChPropertiesX
01027         ushort lineBreak; // from LineBreak.txt
01028 
01029         // Converts a 2-letter linebreak code into a 16-bit integer.
01030         static inline ushort GetLineBreakCode(char c1, char c2) { return ((ushort(uchar(c1)) & 0xff) << 8) | ((ushort(uchar(c2)) & 0xff)); }
01031         static const ushort LineBreak_Unknown, LineBreak_ComplexContext, LineBreak_Numeric, LineBreak_InfixNumeric, LineBreak_Quotation;
01032 
01033 public:
01034         void InitAfterLoad() {
01035                 cat = (TUniChCategory) chCat;
01036                 subCat = (TUniChSubCategory) (((int(uchar(chCat)) & 0xff) << 8) | (int(uchar(chSubCat)) & 0xff)); }
01037         void SetCatAndSubCat(const TUniChSubCategory catAndSubCat) {
01038                 cat = (TUniChCategory) ((int(catAndSubCat) >> 8) & 0xff);
01039                 subCat = catAndSubCat;
01040                 chCat = (char) cat; chSubCat = (char) (int(subCat) & 0xff); }
01041         friend class TUniChDb;
01042 
01043         // Inexplicably missing from TSIn/TSOut...
01044         static inline void LoadUShort(TSIn& SIn, ushort& u) { SIn.LoadBf(&u, sizeof(u)); }
01045         static inline void LoadSChar(TSIn& SIn, signed char& u) { SIn.LoadBf(&u, sizeof(u)); }
01046         static inline void SaveUShort(TSOut& SOut, ushort u) { SOut.SaveBf(&u, sizeof(u)); }
01047         static inline void SaveSChar(TSOut& SOut, signed char u) { SOut.SaveBf(&u, sizeof(u)); }
01048 
01049 public:
01050         void Save(TSOut& SOut) const {
01051                 SOut.Save(chCat); SOut.Save(chSubCat); SOut.Save(combClass); SaveSChar(SOut, script);
01052                 SOut.Save(simpleUpperCaseMapping); SOut.Save(simpleLowerCaseMapping); SOut.Save(simpleTitleCaseMapping);
01053                 SOut.Save(decompOffset); SOut.Save(nameOffset);
01054                 SOut.Save(flags); SOut.Save(properties); SOut.Save(propertiesX); SaveUShort(SOut, lineBreak); }
01055         void Load(TSIn& SIn) {
01056                 SIn.Load(chCat); SIn.Load(chSubCat); SIn.Load(combClass); LoadSChar(SIn, script);
01057                 SIn.Load(simpleUpperCaseMapping); SIn.Load(simpleLowerCaseMapping); SIn.Load(simpleTitleCaseMapping);
01058                 SIn.Load(decompOffset); SIn.Load(nameOffset);
01059                 SIn.Load(flags); SIn.Load(properties); SIn.Load(propertiesX); LoadUShort(SIn, lineBreak); InitAfterLoad(); }
01060         explicit TUniChInfo(TSIn& SIn) { Load(SIn); }
01061         TUniChInfo() : chCat(char(ucOther)), chSubCat(char(ucOtherNotAssigned & 0xff)), combClass(ccInvalid),
01062                 script(-1),simpleUpperCaseMapping(-1), simpleLowerCaseMapping(-1), simpleTitleCaseMapping(-1),
01063                 decompOffset(-1), nameOffset(-1), flags(0), properties(0), propertiesX(0), lineBreak(LineBreak_Unknown) {
01064                 InitAfterLoad(); }
01065 
01066         // DerivedCoreProperties flags.
01067         bool IsDcpFlag(const TUniChFlags flag) const { Assert((flag & ucfDcpMask) == flag); return (flags & flag) == flag; }
01068         void ClrDcpFlags() { flags = flags & ~ucfDcpMask; }
01069         void SetDcpFlag(const TUniChFlags flag) { Assert((flag & ucfDcpMask) == flag); flags |= flag; }
01070         bool IsAlphabetic() const { return IsDcpFlag(ucfDcpAlphabetic); }
01071         bool IsUppercase() const { return IsDcpFlag(ucfDcpUppercase); }
01072         bool IsLowercase() const { return IsDcpFlag(ucfDcpLowercase); }
01073         bool IsMath() const { return IsDcpFlag(ucfDcpMath); }
01074         bool IsDefaultIgnorable() const { return IsDcpFlag(ucfDcpDefaultIgnorableCodePoint); }
01075         bool IsGraphemeBase() const { return IsDcpFlag(ucfDcpGraphemeBase); }
01076         bool IsGraphemeExtend() const { return IsDcpFlag(ucfDcpGraphemeExtend); }
01077         bool IsIdStart() const { return IsDcpFlag(ucfDcpIdStart); }
01078         bool IsIdContinue() const { return IsDcpFlag(ucfDcpIdContinue); }
01079         bool IsXidStart() const { return IsDcpFlag(ucfDcpXidStart); }
01080         bool IsXidContinue() const { return IsDcpFlag(ucfDcpXidContinue); }
01081 
01082         // PropList.txt flags.
01083         bool IsProperty(const TUniChProperties flag) const { return (properties & flag) == flag; }
01084         void SetProperty(const TUniChProperties flag) { properties |= flag; }
01085         bool IsAsciiHexDigit() const { return IsProperty(ucfPrAsciiHexDigit); }
01086         bool IsBidiControl() const { return IsProperty(ucfPrBidiControl); }
01087         bool IsDash() const { return IsProperty(ucfPrDash); }
01088         bool IsDeprecated() const { return IsProperty(ucfPrDeprecated); }
01089         bool IsDiacritic() const { return IsProperty(ucfPrDiacritic); }
01090         bool IsExtender() const { return IsProperty(ucfPrExtender); }
01091         bool IsGraphemeLink() const { return IsProperty(ucfPrGraphemeLink); }
01092         bool IsHexDigit() const { return IsProperty(ucfPrHexDigit); }
01093         bool IsHyphen() const { return IsProperty(ucfPrHyphen); }
01094         bool IsIdeographic() const { return IsProperty(ucfPrIdeographic); }
01095         bool IsJoinControl() const { return IsProperty(ucfPrJoinControl); }
01096         bool IsLogicalOrderException() const { return IsProperty(ucfPrLogicalOrderException); }
01097         bool IsNoncharacter() const { return IsProperty(ucfPrNoncharacterCodePoint); }
01098         bool IsQuotationMark() const { return IsProperty(ucfPrQuotationMark); }
01099         bool IsSoftDotted() const { return IsProperty(ucfPrSoftDotted); }
01100         bool IsSTerminal() const { return IsProperty(ucfPrSTerm); }
01101         bool IsTerminalPunctuation() const { return IsProperty(ucfPrTerminalPunctuation); }
01102         bool IsVariationSelector() const { return IsProperty(ucfPrVariationSelector); }
01103         bool IsWhiteSpace() const { return IsProperty(ucfPrWhiteSpace); }
01104 
01105         // Additional PropList.txt flags.
01106         bool IsPropertyX(const TUniChPropertiesX flag) const { return (propertiesX & flag) == flag; }
01107         void SetPropertyX(const TUniChPropertiesX flag) { propertiesX |= flag; }
01108 
01109         // Miscellaneous flags.
01110         bool IsCompositionExclusion() const { return (flags & ucfCompositionExclusion) == ucfCompositionExclusion; }
01111         bool IsCompatibilityDecomposition() const { return (flags & ucfCompatibilityDecomposition) == ucfCompatibilityDecomposition; }
01112 
01113         // Word-boundary flags.
01114         bool IsWbFlag(const TUniChFlags flag) const { Assert((flag & ucfWbMask) == flag); return (flags & flag) == flag; }
01115         void ClrWbAndSbFlags() { flags = flags & ~(ucfWbMask | ucfSbMask); }
01116         void SetWbFlag(const TUniChFlags flag) { Assert((flag & ucfWbMask) == flag); flags |= flag; }
01117         int GetWbFlags() const { return flags & ucfWbMask; }
01118         bool IsWbFormat() const { return IsWbFlag(ucfWbFormat); }
01119         TStr GetWbFlagsStr() const { return GetWbFlagsStr(GetWbFlags()); }
01120         static TStr GetWbFlagsStr(const int flags) { return TStr("") + (flags & ucfWbALetter ? "A" : "") +
01121                 (flags & ucfWbFormat ? "F" : "") + (flags & ucfWbKatakana ? "K" : "") + (flags & ucfWbMidLetter ? "M" : "") +
01122                 (flags & ucfWbMidNum ? "m" : "") + (flags & ucfWbNumeric ? "N" : "") + (flags & ucfWbExtendNumLet ? "E" : ""); }
01123 
01124         // Sentence-boundary flags.
01125         bool IsSbFlag(const TUniChFlags flag) const { Assert((flag & ucfSbMask) == flag); return (flags & flag) == flag; }
01126         void SetSbFlag(const TUniChFlags flag) { Assert((flag & ucfSbMask) == flag); flags |= flag; }
01127         int GetSbFlags() const { return flags & ucfSbMask; }
01128         bool IsSbFormat() const { return IsSbFlag(ucfSbFormat); }
01129         TStr GetSbFlagsStr() const { return GetSbFlagsStr(GetSbFlags()); }
01130         static TStr GetSbFlagsStr(const int flags) { return TStr("") + (flags & ucfSbSep ? "S" : "") +
01131                 (flags & ucfSbFormat ? "F" : "") + (flags & ucfSbSp ? "_" : "") + (flags & ucfSbLower ? "L" : "") +
01132                 (flags & ucfSbUpper ? "U" : "") + (flags & ucfSbOLetter ? "O" : "") + (flags & ucfSbNumeric ? "N" : "") +
01133                 (flags & ucfSbATerm ? "A" : "") + (flags & ucfSbSTerm ? "T" : "") + (flags & ucfSbClose ? "C" : ""); }
01134 
01135         bool IsSbSep() const { return (flags & ucfSbSep) == ucfSbSep; }
01136 
01137         // Grapheme-boundary flags.
01138         bool IsGbExtend() const { return IsGraphemeExtend(); }
01139 
01140         // Sec. 3.13, D47: C is cased iff it is uppercase, lowercase, or general_category == titlecase_letter.
01141         bool IsCased() const { return IsUppercase() || IsLowercase() || (subCat == ucLetterTitlecase); }
01142 
01143         // Character categories.
01144         TUniChCategory GetCat() const { return (TUniChCategory) cat; }
01145         TUniChSubCategory GetSubCat() const { return (TUniChSubCategory) subCat; }
01146         // The following characters belong to the 'symbol/currency' subcategory:
01147         //     U+00024  DOLLAR SIGN
01148         //     U+000a2  CENT SIGN
01149         //     U+000a3  POUND SIGN
01150         //     U+000a4  CURRENCY SIGN
01151         //     U+000a5  YEN SIGN
01152         //     U+020a3  FRENCH FRANC SIGN
01153         //     U+020a4  LIRA SIGN
01154         //     U+020ac  EURO SIGN
01155         //     [and plenty of others]
01156         bool IsCurrency() const { return subCat == ucSymbolCurrency; }
01157         // Note: most private-use and surrogate characters aren't listed explicitly in UnicodeData.txt.
01158         // Thus, it's better to call TUniChDb's versions of these methods, which are aware of
01159         // the full ranges of private-use and surrogate characters.
01160         bool IsPrivateUse() const { return subCat == ucOtherPrivateUse; }
01161         bool IsSurrogate() const { return subCat == ucOtherSurrogate; }
01162 
01163         inline static bool IsValidSubCat(const char chCat, const char chSubCat) {
01164                 static const char s[] = "LuLlLtLmLoMnMcMeNdNlNoPcPdPsPePiPfPoSmScSkSoZsZlZpCcCfCsCoCn";
01165                 for (const char *p = s; *p; p += 2)
01166                         if (chCat == p[0] && chSubCat == p[1]) return true;
01167                 return false; }
01168 };
01169 
01170 //-----------------------------------------------------------------------------
01171 // TUniTrie -- a trie for suffixes that should not appear at the end
01172 // of a sentence
01173 //-----------------------------------------------------------------------------
01174 
01175 template<typename TItem_>
01176 class TUniTrie
01177 {
01178 public:
01179         typedef TItem_ TItem;
01180 protected:
01181         class TNode {
01182         public:
01183                 TItem item;
01184                 int child, sib;
01185                 bool terminal;
01186                 TNode() : child(-1), sib(-1), terminal(false) { }
01187                 TNode(const TItem& item_, const int child_, const int sib_, const bool terminal_) : item(item_), child(child_), sib(sib_), terminal(terminal_) { }
01188         };
01189         typedef TVec<TNode> TNodeV;
01190         typedef TPair<TItem, TItem> TItemPr;
01191         typedef TTriple<TItem, TItem, TItem> TItemTr;
01192         typedef TUniVecIdx TVecIdx;
01193         THash<TItem, TVoid> singles; //
01194         THash<TItemPr, TVoid> pairs;
01195         THash<TItemTr, TInt> roots;
01196         TNodeV nodes;
01197 public:
01198         TUniTrie() { }
01199         void Clr() { singles.Clr(); pairs.Clr(); roots.Clr(); nodes.Clr(); }
01200 
01201         bool Empty() const { return singles.Empty() && pairs.Empty() && roots.Empty(); }
01202 
01203         bool Has1Gram(const TItem& item) const { return singles.IsKey(item); }
01204         bool Has2Gram(const TItem& last, const TItem& butLast) const { return pairs.IsKey(TItemPr(last, butLast)); }
01205         int Get3GramRoot(const TItem& last, const TItem& butLast, const TItem& butButLast) const {
01206                 int keyId = roots.GetKeyId(TItemTr(last, butLast, butButLast));
01207                 if (keyId < 0) return 0; else return roots[keyId]; }
01208         int GetChild(const int parentIdx, const TItem& item) const {
01209                 for (int childIdx = nodes[parentIdx].child; childIdx >= 0; ) {
01210                         const TNode &node = nodes[childIdx];
01211                         if (node.item == item) return childIdx;
01212                         childIdx = node.sib; }
01213                 return -1; }
01214         bool IsNodeTerminal(const int nodeIdx) const { return nodes[nodeIdx].terminal; }
01215 
01216         // Adds a new string to the trie.  Note that the last characters appear
01217         // closer to the root of the trie.
01218         template<typename TSrcVec>
01219         void Add(const TSrcVec& src, const size_t srcIdx, const size_t srcCount)
01220         {
01221                 IAssert(srcCount > 0);
01222                 if (srcCount == 1) { singles.AddKey(TItem(src[TVecIdx(srcIdx)])); return; }
01223                 if (srcCount == 2) { pairs.AddKey(TItemPr(TItem(src[TVecIdx(srcIdx + 1)]), TItem(src[TVecIdx(srcIdx)]))); return; }
01224                 size_t srcLast = srcIdx + (srcCount - 1);
01225                 TItemTr tr = TItemTr(TItem(src[TVecIdx(srcLast)]), TItem(src[TVecIdx(srcLast - 1)]), TItem(src[TVecIdx(srcLast - 2)]));
01226                 int keyId = roots.GetKeyId(tr), curNodeIdx = -1;
01227                 if (keyId >= 0) curNodeIdx = roots[keyId];
01228                 else { curNodeIdx = nodes.Add(TNode(TItem(0), -1, -1, false)); roots.AddDat(tr, curNodeIdx); }
01229                 //
01230                 if (srcCount > 3) for (size_t srcPos = srcLast - 3; ; )
01231                 {
01232                         const TItem curItem = src[TVecIdx(srcPos)];
01233                         int childNodeIdx = nodes[curNodeIdx].child;
01234                         while (childNodeIdx >= 0) {
01235                                 TNode &childNode = nodes[childNodeIdx];
01236                                 if (childNode.item == curItem) break;
01237                                 childNodeIdx = childNode.sib; }
01238                         if (childNodeIdx < 0) {
01239                                 childNodeIdx = nodes.Add(TNode(curItem, -1, nodes[curNodeIdx].child, false));
01240                                 nodes[curNodeIdx].child = childNodeIdx; }
01241                         curNodeIdx = childNodeIdx;
01242                         if (srcPos == srcIdx) break; else srcPos--;
01243                 }
01244                 nodes[curNodeIdx].terminal = true;
01245         }
01246 
01247         template<typename TSrcVec>
01248         void Add(const TSrcVec& src) { Add(src, 0, (size_t) src.Len()); }
01249 };
01250 
01251 //-----------------------------------------------------------------------------
01252 // TUniChDb -- provides access to the Unicode Character Database
01253 //-----------------------------------------------------------------------------
01254 
01255 class TUniChDb
01256 {
01257 protected:
01258         void InitAfterLoad();
01259         typedef TUniVecIdx TVecIdx;
01260 
01261 public:
01262         THash<TInt, TUniChInfo> h; // key: codepoint
01263         TStrPool charNames;
01264         TStrIntH scripts; // keyID = used in TUniChInfo.script; key = script name; dat = number of characters (informative only)
01265         TIntV decompositions;
01266         THash<TIntPr, TInt> inverseDec;
01267         TUniCaseFolding caseFolding;
01268         // These hash tables contain only the unconditional mappings from SpecialCasing.txt.
01269         // The conditional mappings are hardcoded into GetCaseConverted().
01270         TIntIntVH specialCasingLower, specialCasingUpper, specialCasingTitle;
01271         int scriptUnknown; // = scripts.GetKey("Unknown")
01272 
01273         TUniChDb() : scriptUnknown(-1) { }
01274         explicit TUniChDb(TSIn& SIn) { Load(SIn); }
01275         void Clr() {
01276                 h.Clr(); charNames.Clr(); decompositions.Clr(); inverseDec.Clr(); caseFolding.Clr();
01277                 specialCasingLower.Clr(); specialCasingUpper.Clr(); specialCasingTitle.Clr();
01278                 scripts.Clr(); }
01279         void Save(TSOut& SOut) const {
01280                 h.Save(SOut); charNames.Save(SOut); decompositions.Save(SOut);
01281                 inverseDec.Save(SOut); caseFolding.Save(SOut); scripts.Save(SOut);
01282                 specialCasingLower.Save(SOut); specialCasingUpper.Save(SOut); specialCasingTitle.Save(SOut);
01283                 SOut.SaveCs(); }
01284         void Load(TSIn& SIn) {
01285                 h.Load(SIn); charNames.~TStrPool(); new (&charNames) TStrPool(SIn);
01286                 decompositions.Load(SIn);
01287                 inverseDec.Load(SIn); caseFolding.Load(SIn); scripts.Load(SIn);
01288                 specialCasingLower.Load(SIn); specialCasingUpper.Load(SIn); specialCasingTitle.Load(SIn);
01289                 SIn.LoadCs(); InitAfterLoad(); }
01290         void LoadBin(const TStr& fnBin) {
01291                 PSIn SIn = TFIn::New(fnBin); Load(*SIn); }
01292         void Test(const TStr& basePath);
01293 
01294         // File names used by LoadTxt() and its subroutines.
01295         static TStr GetCaseFoldingFn() { return "CaseFolding.txt"; }
01296         static TStr GetSpecialCasingFn() { return "SpecialCasing.txt"; }
01297         static TStr GetUnicodeDataFn() { return "UnicodeData.txt"; }
01298         static TStr GetCompositionExclusionsFn() { return "CompositionExclusions.txt"; }
01299         static TStr GetScriptsFn() { return "Scripts.txt"; }
01300         static TStr GetDerivedCorePropsFn() { return "DerivedCoreProperties.txt"; }
01301         static TStr GetLineBreakFn() { return "LineBreak.txt"; }
01302         static TStr GetPropListFn() { return "PropList.txt"; }
01303         static TStr GetAuxiliaryDir() { return "auxiliary"; }
01304         static TStr GetWordBreakTestFn() { return "WordBreakTest.txt"; }
01305         static TStr GetWordBreakPropertyFn() { return "WordBreakProperty.txt"; }
01306         static TStr GetSentenceBreakTestFn() { return "SentenceBreakTest.txt"; }
01307         static TStr GetSentenceBreakPropertyFn() { return "SentenceBreakProperty.txt"; }
01308         static TStr GetNormalizationTestFn() { return "NormalizationTest.txt"; }
01309         static TStr GetBinFn() { return "UniChDb.bin"; } // used only by Test()
01310 
01311         //-------------------------------------------------------------------------
01312         // Script names
01313         //-------------------------------------------------------------------------
01314 
01315         // These constants are used when initializing from the text files.
01316         static TStr GetScriptNameUnknown() { return "Unknown"; }
01317         static TStr GetScriptNameKatakana() { return "Katakana"; }
01318         static TStr GetScriptNameHiragana() { return "Hiragana"; }
01319         //
01320         const TStr& GetScriptName(const int scriptId) const { return scripts.GetKey(scriptId); }
01321         int GetScriptByName(const TStr& scriptName) const { return scripts.GetKeyId(scriptName); }
01322         int GetScript(const TUniChInfo& ci) const { int s = ci.script; if (s < 0) s = scriptUnknown; return s; }
01323         int GetScript(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return scriptUnknown; else return GetScript(h[i]); }
01324 
01325         //-------------------------------------------------------------------------
01326         // Character names
01327         //-------------------------------------------------------------------------
01328 
01329         // GetCharName returns 0 if the name is unknown; GetCharNameS returns a string of the form "U+1234".
01330         const char *GetCharName(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; int ofs = h[i].nameOffset; return ofs < 0 ? 0 : charNames.GetCStr(ofs); }
01331         TStr GetCharNameS(const int cp) const {
01332                 // ToDo: Add special processing for precomposed Hangul syllables (UAX #15, sec. 16).
01333                 const char *p = GetCharName(cp); if (p) return p;
01334                 char buf[20]; sprintf(buf, "U+%04x", cp); return TStr(buf); }
01335         template<class TSrcVec> void PrintCharNames(FILE *f, const TSrcVec& src, size_t srcIdx, const size_t srcCount, const TStr& prefix) const {
01336                 if (! f) f = stdout;
01337                 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
01338                         fprintf(f, "%s", prefix.CStr());
01339                         int cp = src[TVecIdx(srcIdx)]; fprintf(f, (cp >= 0x10000 ? "U+%05x" : "U+%04x "), cp);
01340                         fprintf(f, " %s\n", GetCharNameS(cp).CStr()); }}
01341         template<class TSrcVec> void PrintCharNames(FILE *f, const TSrcVec& src, const TStr& prefix) const { PrintCharNames(f, src, 0, src.Len(), prefix); }
01342 
01343         //-------------------------------------------------------------------------
01344         // Character information
01345         //-------------------------------------------------------------------------
01346         // These methods provide access to a subset of the functionality
01347         // available in TUniChInfo.
01348 
01349         bool IsGetChInfo(const int cp, TUniChInfo& ChInfo) {
01350                 int i = h.GetKeyId(cp);
01351                 if (i < 0) return false; else { ChInfo=h[i]; return true; }}
01352         TUniChCategory GetCat(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return ucOther; else return h[i].cat; }
01353         TUniChSubCategory GetSubCat(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return ucOtherNotAssigned; else return h[i].subCat; }
01354 
01355         bool IsWbFlag(const int cp, const TUniChFlags flag) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].IsWbFlag(flag); }
01356         int GetWbFlags(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetWbFlags(); }
01357         bool IsSbFlag(const int cp, const TUniChFlags flag) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].IsSbFlag(flag); }
01358         int GetSbFlags(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetSbFlags(); }
01359 
01360 #define ___UniFwd1(name) bool name(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].name(); }
01361 #define ___UniFwd2(name1, name2) ___UniFwd1(name1) ___UniFwd1(name2)
01362 #define ___UniFwd3(name1, name2, name3) ___UniFwd2(name1, name2) ___UniFwd1(name3)
01363 #define ___UniFwd4(name1, name2, name3, name4) ___UniFwd3(name1, name2, name3) ___UniFwd1(name4)
01364 #define ___UniFwd5(name1, name2, name3, name4, name5) ___UniFwd4(name1, name2, name3, name4) ___UniFwd1(name5)
01365 
01366 #define DECLARE_FORWARDED_PROPERTY_METHODS \
01367         ___UniFwd5(IsAsciiHexDigit, IsBidiControl, IsDash, IsDeprecated, IsDiacritic) \
01368         ___UniFwd5(IsExtender, IsGraphemeLink, IsHexDigit, IsHyphen, IsIdeographic)  \
01369         ___UniFwd5(IsJoinControl, IsLogicalOrderException, IsNoncharacter, IsQuotationMark, IsSoftDotted)  \
01370         ___UniFwd4(IsSTerminal, IsTerminalPunctuation, IsVariationSelector, IsWhiteSpace)  \
01371         ___UniFwd5(IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsDefaultIgnorable)  \
01372         ___UniFwd4(IsGraphemeBase, IsGraphemeExtend, IsIdStart, IsIdContinue)  \
01373         ___UniFwd2(IsXidStart, IsXidContinue)  \
01374         ___UniFwd3(IsCompositionExclusion, IsCompatibilityDecomposition, IsSbSep)  \
01375         ___UniFwd1(IsGbExtend)  \
01376         ___UniFwd2(IsCased, IsCurrency)
01377 
01378         DECLARE_FORWARDED_PROPERTY_METHODS
01379 
01380 #undef ___UniFwd1
01381 
01382         bool IsPrivateUse(const int cp) const {
01383                 int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsPrivateUse();
01384                 return (0xe000 <= cp && cp <= 0xf8ff) ||  // plane 0 private-use area
01385                         // Planes 15 and 16 are entirely for private use.
01386                         (0xf0000 <= cp && cp <= 0xffffd) || (0x100000 <= cp && cp <= 0x10fffd); }
01387         // Note: d800..dbff are high surrogates, dc00..dfff are low surrogates.
01388         // For db80..dbff it is clear that the surrogate pair containing this high surrogate
01389         // will refer to a private-use codepoint, but IsPrivateUse nevertheless returns false
01390         // for db80..dbff.  This is consistent with the category codes assigned in UnicodeData.txt.
01391         bool IsSurrogate(const int cp) const {
01392                 int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsSurrogate();
01393                 return 0xd800 <= cp && cp <= 0xdcff; }
01394 
01395         // Note: in particular, all Hangul characters (HangulLBase..HangulLBase + HangulLCount - 1
01396         // and HangulSBase..HangulSBase + HangulSCount - 1) should be treated as starters
01397         // for composition to work correctly.
01398         int GetCombiningClass(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return TUniChInfo::ccStarter; else return h[i].combClass; }
01399 
01400         //-------------------------------------------------------------------------
01401         // Hangul constants
01402         //-------------------------------------------------------------------------
01403 
01404         enum {
01405         HangulSBase = 0xAC00, HangulLBase = 0x1100, HangulVBase = 0x1161, HangulTBase = 0x11A7,
01406         HangulLCount = 19, HangulVCount = 21, HangulTCount = 28,
01407         HangulNCount = HangulVCount * HangulTCount,   // 588
01408         HangulSCount = HangulLCount * HangulNCount   // 11172
01409         };
01410 
01411         //-------------------------------------------------------------------------
01412         // Word boundaries (UAX #29)
01413         //-------------------------------------------------------------------------
01414 
01415 protected:
01416         // UAX #29, rule WB3: ignore Format and Extend characters.
01417         // [Note: rule SB5 for sentence boundaries is identical, and thus these methods will also be used for sentence-boundary detection.]
01418         static bool IsWbIgnored(const TUniChInfo& ci) { return ci.IsGbExtend() || ci.IsWbFormat(); }
01419         bool IsWbIgnored(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return IsWbIgnored(h[i]); }
01420         // Sets 'position' to the smallest index from 'position..srcEnd-1' that contains a non-ignored character.
01421         template<typename TSrcVec> void WbFindCurOrNextNonIgnored(const TSrcVec& src, size_t& position, const size_t srcEnd) const {
01422                 while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }
01423         // Sets 'position' to the smallest index from 'position+1..srcEnd-1' that contains a non-ignored character.
01424         template<typename TSrcVec> void WbFindNextNonIgnored(const TSrcVec& src, size_t& position, const size_t srcEnd) const {
01425                 if (position >= srcEnd) return;
01426                 position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }
01427         // Like WbFindNextNonIgnored, except that if 'src[position]' is a Sep, 'position' is simply advanced by one (ignored characters after a Sep are not skipped).
01428         template<typename TSrcVec> void WbFindNextNonIgnoredS(const TSrcVec& src, size_t& position, const size_t srcEnd) const {
01429                 if (position >= srcEnd) return;
01430                 if (IsSbSep(src[TVecIdx(position)])) { position++; return; }
01431                 position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }
01432         // Sets 'position' to the largest index from 'srcStart..position-1' that contains a non-ignored character.
01433         template<typename TSrcVec> bool WbFindPrevNonIgnored(const TSrcVec& src, const size_t srcStart, size_t& position) const {
01434                 if (position <= srcStart) return false;
01435                 while (position > srcStart) {
01436                         position--; if (! IsWbIgnored(src[TVecIdx(position)])) return true; }
01437                 return false; }
01438         // Test driver for WbFind*NonIgnored.
01439         void TestWbFindNonIgnored(const TIntV& src) const;
01440         void TestWbFindNonIgnored() const;
01441 public:
01442         // Finds the next word boundary strictly after 'position'.
01443         // Note that there is a valid word boundary at 'srcIdx + srcCount'.
01444         // If there is no such word boundary, it returns 'false' and sets 'position' to 'srcIdx + srcCount'.
01445         template<typename TSrcVec>
01446         bool FindNextWordBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const;
01447         // Creates, in 'dest', a vector of 'srcCount + 1' elements, where 'dest[i]' tells if there is a word
01448         // boundary between 'src[srcIdx + i - 1]' and 'src[srcIdx + i]'.  Note that 'dest[0]' and 'dest[srcCount]' are
01449         // always set to 'true'.
01450         template<typename TSrcVec>
01451         void FindWordBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const;
01452 protected:
01453         void TestFindNextWordOrSentenceBoundary(const TStr& basePath, bool sentence);
01454 
01455         //-------------------------------------------------------------------------
01456         // Sentence boundaries (UAX #29)
01457         //-------------------------------------------------------------------------
01458 
01459 protected:
01460         TUniTrie<TInt> sbExTrie;
01461 
01462         // Checks whether a sentence that ended at src[position - 1]
01463         // would end in one of the suffixes from sbExTrie.
01464         template<typename TSrcVec>
01465         bool CanSentenceEndHere(const TSrcVec& src, const size_t srcIdx, const size_t position) const;
01466 
01467 public:
01468         // Finds the next sentence boundary strictly after 'position'.
01469         // Note that there is a valid sentence boundary at 'srcIdx + srcCount'.
01470         // If there is no such sentence boundary, it returns 'false' and sets 'position' to 'srcIdx + srcCount'.
01471         template<typename TSrcVec>
01472         bool FindNextSentenceBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const;
01473         // Creates, in 'dest', a vector of 'srcCount + 1' elements, where 'dest[i]' tells if there is a sentence
01474         // boundary between 'src[srcIdx + i - 1]' and 'src[srcIdx + i]'.  Note that 'dest[0]' and 'dest[srcCount]' are
01475         // always set to 'true'.
01476         template<typename TSrcVec>
01477         void FindSentenceBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const;
01478 
01479         // These methods allow the user to define a set of sentence boundary exceptions.
01480         // This is a set of strings, stored in 'sbExTrie'.  If the Unicode rules require
01481         // a sentence boundary in a position that would cause the sentence to end with
01482         // 'x (STerm | ATerm) Close* Sp* Sep?', where 'x' is a word from 'sbExTrie',
01483         // we will *not* place a sentence boundary there.
01484         //
01485         // NOTE: sbExTrie is not saved or loaded by the Save() and Load() methods.
01486         // By default, it is empty.  Use SbEx_Clr() to clear it, and SbEx_SetStdEnglish() to obtain
01487         // a standard set of English-language exceptions.
01488         void SbEx_Clr() { sbExTrie.Clr(); }
01489         template<class TSrcVec> void SbEx_Add(const TSrcVec& v) { sbExTrie.Add(v); }
01490         // template<> void SbEx_Add(const TStr& s) {
01491         void SbEx_Add(const TStr& s) {
01492           TIntV v; int n = s.Len(); v.Gen(n); for (int i = 0; i < n; i++) v[i] = int(uchar(s[i])); SbEx_Add(v); }
01493         void SbEx_AddUtf8(const TStr& s) { TUniCodec codec; TIntV v; codec.DecodeUtf8(s, v); SbEx_Add(v); }
01494         int SbEx_AddMulti(const TStr& words, const bool wordsAreUtf8 = true) { TStrV vec; words.SplitOnAllCh('|', vec);
01495                 for (int i = 0; i < vec.Len(); i++) if (wordsAreUtf8) SbEx_AddUtf8(vec[i]); else SbEx_Add(vec[i]);
01496                 return vec.Len(); }
01497         void SbEx_Set(const TUniTrie<TInt>& newTrie) { sbExTrie = newTrie; }
01498         int SbEx_SetStdEnglish() {
01499                 static const TStr data = "Ms|Mrs|Mr|Rev|Dr|Prof|Gov|Sen|Rep|Gen|Brig|Col|Capt|Lieut|Lt|Sgt|Pvt|Cmdr|Adm|Corp|St|Mt|Ft|e.g|e. g.|i.e|i. e|ib|ibid|s.v|s. v|s.vv|s. vv";
01500                 SbEx_Clr(); return SbEx_AddMulti(data, false); }
01501 
01502         //-------------------------------------------------------------------------
01503         // Normalization, decomposition, etc. (UAX #15)
01504         //-------------------------------------------------------------------------
01505 
01506 protected:
01507         // Adds, to 'dest', the decomposition of 'codePoint' (calling itself recursively if necessary).
01508         // If 'compatibility == false', only canonical decompositions are used.
01509         template<typename TDestCh>
01510         void AddDecomposition(const int codePoint, TVec<TDestCh>& dest, const bool compatibility) const;
01511 public:
01512         // This appends, to 'dest', the decomposed form of the source string.
01513         // - for normalization form D (NFD), i.e. canonical decomposition: use compatibility == false;
01514         // - for normalization form KD (NFKD), i.e. compatibility decomposition: use compatibility == true.
01515         template<typename TSrcVec, typename TDestCh>
01516         void Decompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
01517                         TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const;
01518         template<typename TSrcVec, typename TDestCh>
01519         void Decompose(const TSrcVec& src, TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const {
01520                 Decompose(src, 0, src.Len(), dest, compatibility, clrDest); }
01521         // This performs canonical composition on the source string, and appends
01522         // the result to the destination string.  The source string should be the
01523         // result of a (canonical or compatibility) decomposition; if this is the
01524         // case, the composition will lead to a normalization form C (NFC) or
01525         // normalization form KC (NFKC), depending on whether canonical or compatibility
01526         // decomposition was used.
01527         template<typename TSrcVec, typename TDestCh>
01528         void Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
01529                         TVec<TDestCh>& dest, bool clrDest = true) const;
01530         template<typename TSrcVec, typename TDestCh>
01531         void Compose(const TSrcVec& src, TVec<TDestCh>& dest, bool clrDest = true) const {
01532                 Compose(src, 0, src.Len(), dest, clrDest); }
01533         // Calls Decompose, followed by Compose; thus the result is the NFC (if
01534         // compatibility == false) or NFKC (if compatibility == true) of the source string.
01535         // A temporary TIntV is used to contain the intermediate NF(K)D form of the
01536         // source string.
01537         template<typename TSrcVec, typename TDestCh>
01538         void DecomposeAndCompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
01539                         TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const;
01540         template<typename TSrcVec, typename TDestCh>
01541         void DecomposeAndCompose(const TSrcVec& src, TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const {
01542                 DecomposeAndCompose(src, 0, src.Len(), dest, compatibility, clrDest); }
01543         // Copies the starter characters from 'src' to 'dest'; the other
01544         // characters are skipped.  'src' should already have been decomposed.
01545         // Returns the number of characters extracted.
01546         template<typename TSrcVec, typename TDestCh>
01547         size_t ExtractStarters(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
01548                         TVec<TDestCh>& dest, bool clrDest = true) const;
01549         template<typename TSrcVec, typename TDestCh>
01550         size_t ExtractStarters(const TSrcVec& src, TVec<TDestCh>& dest, bool clrDest = true) const {
01551                 return ExtractStarters(src, 0, src.Len(), dest, clrDest); }
01552         // In-place variant: the starters of 'src' are gathered in a scratch TIntV and then written back over 'src'.
01553         template<typename TSrcVec>
01554         size_t ExtractStarters(TSrcVec& src) const {
01555                 TIntV starters; const size_t nStarters = ExtractStarters(src, starters);
01556                 src.Clr(); for (int k = 0, n = starters.Len(); k < n; k++) { src.Add(starters[k]); }
01557                 return nStarters; }
01558 
01559 protected:
01560         void TestComposition(const TStr& basePath);
01561 
01562         //-------------------------------------------------------------------------
01563         // Initialization from the text files
01564         //-------------------------------------------------------------------------
01565 
01566 protected:
01567         void InitWordAndSentenceBoundaryFlags(const TStr& basePath);
01568         void InitScripts(const TStr& basePath);
01569         void InitLineBreaks(const TStr& basePath);
01570         void InitDerivedCoreProperties(const TStr& basePath);
01571         void InitPropList(const TStr& basePath);
01572         void InitSpecialCasing(const TStr& basePath);
01573         void LoadTxt_ProcessDecomposition(TUniChInfo& ci, TStr s);
01574 public:
01575         void LoadTxt(const TStr& basePath);
01576         void SaveBin(const TStr& fnBinUcd);
01577 
01578         //-------------------------------------------------------------------------
01579         // Case conversions
01580         //-------------------------------------------------------------------------
01581 
01582 public:
01583         typedef enum TCaseConversion_ { ccLower = 0, ccUpper = 1, ccTitle = 2, ccMax = 3 } TCaseConversion;
01584         // Appends the case-converted form of 'src' to 'dest'.  The GetXxxCase methods below forward to GetCaseConverted.
01585         // 'how' defines what kind of case conversion is required (ccLower, ccUpper or ccTitle).
01586         // 'turkic' should be set to true iff the text is in Turkish ('tr') or Azeri ('az').
01587         // 'lithuanian' should be set to true iff the text is in Lithuanian ('lt').
01588         template<typename TSrcVec, typename TDestCh> void GetCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const;
01589         template<typename TSrcVec, typename TDestCh> void GetLowerCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower, turkic, lithuanian); }
01590         template<typename TSrcVec, typename TDestCh> void GetUpperCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper, turkic, lithuanian); }
01591         template<typename TSrcVec, typename TDestCh> void GetTitleCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle, turkic, lithuanian); }
01592         template<typename TSrcVec, typename TDestCh> void GetLowerCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetLowerCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
01593         template<typename TSrcVec, typename TDestCh> void GetUpperCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetUpperCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
01594         template<typename TSrcVec, typename TDestCh> void GetTitleCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetTitleCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
01595 
01596         // GetSimpleCaseConverted uses only the simple case mappings (from UnicodeData.txt).
01597         // This is simpler and faster.  Since each character now maps into exactly one
01598         // character, case conversion can also be done in place (see ToSimpleCaseConverted, etc.).
01599         template<typename TSrcVec, typename TDestCh> void GetSimpleCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how) const;
01600         template<typename TSrcVec, typename TDestCh> void GetSimpleLowerCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower); }
01601         template<typename TSrcVec, typename TDestCh> void GetSimpleUpperCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper); }
01602         template<typename TSrcVec, typename TDestCh> void GetSimpleTitleCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle); }
01603         template<typename TSrcVec, typename TDestCh> void GetSimpleLowerCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleLowerCase(src, 0, src.Len(), dest, clrDest); }
01604         template<typename TSrcVec, typename TDestCh> void GetSimpleUpperCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleUpperCase(src, 0, src.Len(), dest, clrDest); }
01605         template<typename TSrcVec, typename TDestCh> void GetSimpleTitleCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleTitleCase(src, 0, src.Len(), dest, clrDest); }
01606 
01607         template<typename TSrcVec> void ToSimpleCaseConverted(TSrcVec& src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const;
01608         template<typename TSrcVec> void ToSimpleUpperCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccUpper); }
01609         template<typename TSrcVec> void ToSimpleLowerCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccLower); }
01610         template<typename TSrcVec> void ToSimpleTitleCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccTitle); }
01611         template<typename TSrcVec> void ToSimpleUpperCase(TSrcVec& src) const { ToSimpleUpperCase(src, 0, src.Len()); }
01612         template<typename TSrcVec> void ToSimpleLowerCase(TSrcVec& src) const { ToSimpleLowerCase(src, 0, src.Len()); }
01613         template<typename TSrcVec> void ToSimpleTitleCase(TSrcVec& src) const { ToSimpleTitleCase(src, 0, src.Len()); }
01614 
01615 public:
01616         friend class TUniCaseFolding;
01617 
01618         // Case folding is an alternative to the above functions.  It is intended primarily
01619         // to produce strings that are suitable for comparisons.  For example,
01620         // ToLowerCase(sigma) = sigma, ToLowerCase(final-sigma) = final-sigma;
01621         // but ToCaseFolded(sigma) = sigma, ToCaseFolded(final-sigma) = sigma.
01622         // - 'turkic' enables special processing for Turkic languages (I-dot and i-dotless).
01623         // - 'full' enables full case mappings -- i.e. sometimes a character may be mapped
01624         //   into a string of two or more characters.
01625         // - Note: For best results, perform NFD(CaseFold(NFD(x))) or NFKD(CaseFold(NFKD(x))) on
01626         //   each string before comparing them (see sec. 3.13 of the standard).
01627         template<typename TSrcVec, typename TDestCh>
01628         void GetCaseFolded(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
01629                 TVec<TDestCh>& dest, const bool clrDest, const bool full, const bool turkic = false) const { caseFolding.Fold(src, srcIdx, srcCount, dest, clrDest, full, turkic); }
01630         template<typename TSrcVec, typename TDestCh>
01631         void GetCaseFolded(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool full = true, const bool turkic = false) const {
01632                 GetCaseFolded(src, 0, src.Len(), dest, clrDest, full, turkic); }
01633         // ToCaseFolded folds the string in place (via caseFolding.FoldInPlace).  However, this means that only the simple
01634         // case foldings can be used (the full ones could increase the length of the string).
01635         template<typename TSrcVec> void ToCaseFolded(TSrcVec& src, size_t srcIdx, const size_t srcCount, const bool turkic = false) const { caseFolding.FoldInPlace(src, srcIdx, srcCount, turkic); }
01636         template<typename TSrcVec> void ToCaseFolded(TSrcVec& src, const bool turkic = false) const { ToCaseFolded(src, 0, src.Len(), turkic); }
01637 
01638 protected:
01639         void TestCaseConversion(const TStr& source, const TStr& trueLc, const TStr& trueTc, const TStr& trueUc, bool turkic, bool lithuanian);
01640         void TestCaseConversions();
01641 
01642         //-------------------------------------------------------------------------
01643         // Text file reader for the Unicode character database
01644         //-------------------------------------------------------------------------
01645 
01646 protected:
01647 
01648         class TUcdFileReader // Reads a UCD text file line by line, separating '#' comments from the ';'-delimited fields.
01649         {
01650         protected:
01651                 TChA buf; // the part of the current line that precedes any '#'
01652         public:
01653                 TChA comment; // contains '#' and everything after it
01654         protected:
01655                 FILE *f;
01656                 int putBackCh; // one-character lookahead; EOF means that nothing has been put back
01657                 int GetCh() {
01658                         if (putBackCh >= 0) { int c = putBackCh; putBackCh = EOF; return c; }
01659                         return fgetc(f); }
01660                 void PutBack(int c) { Assert(putBackCh == EOF); putBackCh = c; }
01661                 // Reads one line (ended by CR, LF, CR LF or EOF) into 'buf'/'comment'.  Returns 'false' iff the EOF was encountered before anything was read.
01662                 bool ReadNextLine() {
01663                         buf.Clr(); comment.Clr();
01664                         bool inComment = false, first = true;
01665                         while (true) {
01666                                 int c = GetCh();
01667                                 if (c == EOF) return ! first; // a last line without a trailing newline is still reported
01668                                 else if (c == 13) {
01669                                         c = GetCh(); if (c != 10) PutBack(c); // swallow the LF of a CR LF pair
01670                                         return true; }
01671                                 else if (c == 10) return true;
01672                                 else if (c == '#') inComment = true;
01673                                 if (! inComment) buf += char(c);
01674                                 else comment += char(c);
01675                                 first = false; } } // at least one character of this line has been stored
01676         private:
01677                 TUcdFileReader& operator = (const TUcdFileReader& r) { Fail; return *this; } // copying is not supported
01678                 TUcdFileReader(const TUcdFileReader& r) { Fail; }
01679         public:
01680                 TUcdFileReader() : f(0), putBackCh(EOF) { }
01681                 TUcdFileReader(const TStr& fileName) : f(0), putBackCh(EOF) { Open(fileName); }
01682                 void Open(const TStr& fileName) { Close(); f = fopen(fileName.CStr(), "rt"); IAssertR(f, fileName); putBackCh = EOF; }
01683                 void Close() { putBackCh = EOF; if (f) { fclose(f); f = 0; }}
01684                 ~TUcdFileReader() { Close(); }
01685                 bool GetNextLine(TStrV& dest) { // fills 'dest' with the trimmed ';'-separated fields of the next non-blank line; 'false' at EOF
01686                         dest.Clr();
01687                         while (true) {
01688                                 if (! ReadNextLine()) return false;
01689                                 TStr line = buf; line.ToTrunc();
01690                                 if (line.Len() <= 0) continue;
01691                                 line.SplitOnAllCh(';', dest, false);
01692                                 for (int i = 0; i < dest.Len(); i++) dest[i].ToTrunc();
01693                                 return true; }}
01694                 static int ParseCodePoint(const TStr& s) {
01695                         int c; bool ok = s.IsHexInt(true, 0, 0x10ffff, c); IAssertR(ok, s); return c; }
01696                 static void ParseCodePointList(const TStr& s, TIntV& dest, bool ClrDestP = true) { // space-separated list
01697                         if (ClrDestP) dest.Clr();
01698                         TStrV parts; s.SplitOnWs(parts);
01699                         for (int i = 0; i < parts.Len(); i++) {
01700                                 int c; bool ok = parts[i].IsHexInt(true, 0, 0x10ffff, c); IAssertR(ok, s);
01701                                 dest.Add(c); } }
01702                 static void ParseCodePointRange(const TStr& s, int& from, int &to) { // xxxx or xxxx..yyyy
01703                         int i = s.SearchStr(".."); if (i < 0) { from = ParseCodePoint(s); to = from; return; }
01704                         from = ParseCodePoint(s.GetSubStr(0, i - 1));
01705                         to = ParseCodePoint(s.GetSubStr(i + 2, s.Len() - 1)); }
01706         };
01707 
01708         //-------------------------------------------------------------------------
01709         // Helper class for processing the text files
01710         //-------------------------------------------------------------------------
01711         // Files such as DerivedCoreProps.txt often refer to ranges of codepoints,
01712         // and not all codepoints from the range have also been listed in
01713         // UnicodeData.txt.  Thus, new TUniChInfo instances will be created
01714         // when processing DerivedCoreProps.txt and similar files.
01715         // To assign the correct (sub)categories to these new codepoints,
01716         // the following class will extract the subcategory info from the
01717         // comments in DerivedCoreProps.txt and similar files.
01718 
01719         class TSubcatHelper // Remembers the subcategory code found in the comment of the most recently processed line.
01720         {
01721         public:
01722                 bool hasCat; TUniChSubCategory subCat; // outcome of the last ProcessComment call
01723                 TStrH invalidCatCodes; // two-character codes rejected by TUniChInfo::IsValidSubCat
01724                 TUniChDb &owner;
01725 
01726                 TSubcatHelper(TUniChDb &owner_) : hasCat(false), subCat(ucOtherNotAssigned), owner(owner_) { } // starts out with no category seen
01727 
01728                 void ProcessComment(TUniChDb::TUcdFileReader &reader) // expects a comment of the form "# Xy ...", where Xy is the two-character code
01729                 {
01730                         hasCat = false; subCat = ucOtherNotAssigned;
01731                         if (reader.comment.Len() > 3)
01732                         {
01733                                 IAssert(reader.comment[0] == '#');
01734                                 IAssert(reader.comment[1] == ' ');
01735                                 char chCat = reader.comment[2], chSubCat = reader.comment[3];
01736                                 if (reader.comment.Len() > 4) IAssert(isspace(uchar(reader.comment[4])));
01737                                 if (TUniChInfo::IsValidSubCat(chCat, chSubCat)) {
01738                                         hasCat = true; subCat = (TUniChSubCategory) ((int(uchar(chCat)) << 8) | (int(uchar(chSubCat)))); } // category char in the high byte, subcategory char in the low byte
01739                                 else invalidCatCodes.AddKey(TStr(chCat) + TStr(chSubCat));
01740                         }
01741                 }
01742 
01743                 void SetCat(const int cp) { // stores subCat for 'cp', which must exist in owner.h and still be ucOtherNotAssigned
01744                         int i = owner.h.GetKeyId(cp); IAssert(i >= 0);
01745                         IAssert(owner.h[i].subCat == ucOtherNotAssigned);
01746                         IAssert(hasCat);
01747                         owner.h[i].SetCatAndSubCat(subCat); }
01748                 void TestCat(const int cp) { // asserts that the subcategory stored for 'cp' equals subCat; does nothing if no category was seen
01749                         if (! hasCat) return;
01750                         int i = owner.h.GetKeyId(cp); IAssert(i >= 0);
01751                         IAssert(owner.h[i].subCat == subCat); }
01752 
01753                 ~TSubcatHelper()
01754                 {
01755                         if (invalidCatCodes.IsKey("L&")) invalidCatCodes.DelKey("L&"); // "L&" is never reported (presumably the UCD shorthand for cased letters -- TODO confirm)
01756                         // Output any unexpected ones (there shouldn't be any).
01757                         if (! invalidCatCodes.Empty()) {
01758                                 printf("Invalid cat code(s) in the comments: ");
01759                                 for (int i = invalidCatCodes.FFirstKeyId(); invalidCatCodes.FNextKeyId(i); )
01760                                         printf(" \"%s\"", invalidCatCodes.GetKey(i).CStr());
01761                                 printf("\n"); }
01762                 }
01763         };
01764 };
01765 
01766 //-----------------------------------------------------------------------------
01767 // TUnicode -- a sadly emasculated wrapper around TUniCodec and TUniChDb
01768 //-----------------------------------------------------------------------------
01769 
01770 class TUnicode
01771 {
01772 public:
01773         TUniCodec codec;
01774         TUniChDb ucd;
01775 
01776         TUnicode() { Init(); } // only runs Init(); nothing is loaded into 'ucd' here
01777         explicit TUnicode(const TStr& fnBinUcd) { ucd.LoadBin(fnBinUcd); Init(); } // loads 'ucd' from the binary file 'fnBinUcd', then runs Init()
01778         void Init() { InitCodecs(); } // currently just calls InitCodecs()
01779 
01780         //-----------------------------------------------------------------------
01781         // UTF-8
01782         //-----------------------------------------------------------------------
01783 
01784         // Returns the number of characters that have been successfully decoded.
01785         // This does not include any replacement characters that may have been inserted into 'dest'.
01786         int DecodeUtf8(const TIntV& src, TIntV& dest) const { return (int) codec.DecodeUtf8(src, dest); }
01787         int DecodeUtf8(const TStr& src, TIntV& dest) const { return (int) codec.DecodeUtf8(src, dest); }
01788 
01789         // Returns the number of characters that have been successfully encoded.
01790         // This does not include any replacement characters that may have been inserted into 'dest'.
01791         int EncodeUtf8(const TIntV& src, TIntV& dest) const { return (int) codec.EncodeUtf8(src, dest); }
01792 
01793         // The following wrapper around the UTF-8 encoder returns a TStr containing
01794         // the UTF-8-encoded version of the input string.
01795         TStr EncodeUtf8Str(const TIntV& src) const { return codec.EncodeUtf8Str(src); }
01796 
01797         //-----------------------------------------------------------------------
01798         // UTF-16 Decoder
01799         //-----------------------------------------------------------------------
01800 
01801         // Returns the number of characters that have been successfully decoded.
01802         // This does not include any replacement characters that may have been inserted into 'dest'.
01803         // Each element of 'src' is assumed to contain one byte of data.
01804         // The codec is called on the range (0, src.Len()); its srcCount must be even, so src.Len() must be even.
01805         int DecodeUtf16FromBytes(const TIntV& src, TIntV& dest,
01806                 const TUtf16BomHandling bomHandling = bomAllowed,
01807                 const TUniByteOrder defaultByteOrder = boMachineEndian) const {
01808                         return (int) codec.DecodeUtf16FromBytes(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }
01809 
01810         // Here, each element of 'src' is treated as a 16-bit word.  The byte-order settings
01811         // are used to determine if the two bytes of each word should be swapped before further
01812         // processing.  For example, if a BOM is present, it must have the value 0xfeff; if it
01813         // actually has the value 0xfffe, this means that the two bytes of each word must be swapped.
01814         // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the
01815         // beginning of the source data is used to determine the "original" byte order of the data;
01816         // if this doesn't match the byte order of the local machine, the two bytes of each word will
01817         // be swapped during the decoding process.
01818         int DecodeUtf16FromWords(const TIntV& src, TIntV& dest,
01819                 const TUtf16BomHandling bomHandling = bomAllowed,
01820                 const TUniByteOrder defaultByteOrder = boMachineEndian) const {
01821                         return (int) codec.DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }
01822 
01823         //-----------------------------------------------------------------------
01824         // UTF-16 Encoder
01825         //-----------------------------------------------------------------------
01826 
01827         // Returns the number of characters that have been successfully encoded.
01828         // This does not include any replacement characters that may have been inserted into 'dest'.
01829         int EncodeUtf16ToWords(const TIntV& src, TIntV& dest, const bool insertBom,
01830                 const TUniByteOrder destByteOrder = boMachineEndian) const {
01831                         return (int) codec.EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }
01832 
01833         int EncodeUtf16ToBytes(const TIntV& src, TIntV& dest, const bool insertBom,
01834                 const TUniByteOrder destByteOrder = boMachineEndian) const {
01835                         return (int) codec.EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }
01836 
01837         //-----------------------------------------------------------------------
01838         // 8-bit codecs
01839         //-----------------------------------------------------------------------
01840 
01841         T8BitCodec<TEncoding_ISO8859_1> iso8859_1;
01842         T8BitCodec<TEncoding_ISO8859_2> iso8859_2;
01843         T8BitCodec<TEncoding_ISO8859_3> iso8859_3;
01844         T8BitCodec<TEncoding_ISO8859_4> iso8859_4;
01845         T8BitCodec<TEncoding_YuAscii> yuAscii;
01846         T8BitCodec<TEncoding_CP1250> cp1250;
01847         T8BitCodec<TEncoding_CP852> cp852;
01848         T8BitCodec<TEncoding_CP437> cp437;
01849 
01850         //-----------------------------------------------------------------------
01851         // Codec registry
01852         //-----------------------------------------------------------------------
01853         // If you know you'll need ISO-8859-2, just use
01854         //   TUnicode unicode;
01855         //   unicode.iso8859_2.Encode(...);
01856         // If you don't know what you'll need, use:
01857         //   TUnicode unicode;
01858         //   PCodecBase myCodec = unicode.GetCodec(myCodecName);
01859         //   myCodec->Encode(...);
01860         // Note that the first approach is slightly more efficient because there
01861         // aren't any virtual method calls involved.
01862 
01863 protected:
01864         THash<TStr, PCodecBase> codecs;
01865         static inline TStr NormalizeCodecName(const TStr& name) { // lowercases 'name' and removes every '_' and '-'
01866                 TStr key = name.GetLc(); key.ChangeStrAll("_", ""); key.ChangeStrAll("-", ""); return key; }
01867 public:
01868         void RegisterCodec(const TStr& nameList, const PCodecBase& codec) { // 'nameList' holds whitespace-separated names for the same codec
01869                 TStrV aliases; nameList.SplitOnWs(aliases);
01870                 for (int k = 0, n = aliases.Len(); k < n; k++) {
01871                         codecs.AddDat(NormalizeCodecName(aliases[k]), codec); } }
01872         void UnregisterCodec(const TStr& nameList) { // removes each whitespace-separated name in 'nameList' from the registry
01873                 TStrV aliases; nameList.SplitOnWs(aliases);
01874                 for (int k = 0, n = aliases.Len(); k < n; k++) {
01875                         codecs.DelKey(NormalizeCodecName(aliases[k])); } }
01876         void ClrCodecs() { codecs.Clr(); }
01877         void InitCodecs();
01878         PCodecBase GetCodec(const TStr& name) const { // looks 'name' up after normalization; the result is cleared if it is not registered
01879                 const TStr key = NormalizeCodecName(name);
01880                 PCodecBase result; const bool known = codecs.IsKeyGetDat(key, result); if (! known) { result.Clr(); }
01881                 return result; }
01882         void GetAllCodecs(TCodecBaseV& dest) const { // each codec object is listed once, even if it is registered under several names
01883                 dest.Clr();
01884                 for (int keyId = codecs.FFirstKeyId(); codecs.FNextKeyId(keyId); ) {
01885                         PCodecBase candidate = codecs[keyId]; bool seen = false;
01886                         for (int j = 0; j < dest.Len() && ! seen; j++) { seen = (dest[j]() == candidate()); }
01887                         if (! seen) { dest.Add(candidate); } }}
01888 
01889         //-------------------------------------------------------------------------
01890         // Word boundaries (UAX #29)
01891         //-------------------------------------------------------------------------
01892 
01893         // Finds the next word boundary strictly after 'position' and stores it back into 'position'.
01894         // Note that there are valid word boundaries at 0 and at 'src.Len()'; a negative 'position' yields the boundary at 0.
01895         // If there is no such word boundary, it returns 'false' and sets 'position' to 'src.Len()'.
01896         bool FindNextWordBoundary(const TIntV& src, int &position) const {
01897                 if (position < 0) { position = 0; return true; }
01898                 size_t position_ = size_t(position); bool retVal = ucd.FindNextWordBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; } // the search is seeded with the caller's (non-negative) position
01899         // Creates, in 'dest', a vector of 'src.Len() + 1' elements, where 'dest[i]' tells if there is a word
01900         // boundary between 'src[i - 1]' and 'src[i]'.  Note that 'dest[0]' and 'dest[src.Len()]' are
01901         // always set to 'true'.
01902         void FindWordBoundaries(const TIntV& src, TBoolV& dest) const { ucd.FindWordBoundaries(src, 0, src.Len(), dest); }
01903 
01904         //-------------------------------------------------------------------------
01905         // Sentence boundaries (UAX #29)
01906         //-------------------------------------------------------------------------
01907 
01908         // Finds the next sentence boundary strictly after 'position' and stores it back into 'position'.
01909         // Note that there are valid sentence boundaries at 0 and at 'src.Len()'; a negative 'position' yields the boundary at 0.
01910         // If there is no such sentence boundary, it returns 'false' and sets 'position' to 'src.Len()'.
01911         bool FindNextSentenceBoundary(const TIntV& src, int &position) const {
01912                 if (position < 0) { position = 0; return true; }
01913                 size_t position_ = size_t(position); bool retVal = ucd.FindNextSentenceBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; } // the search is seeded with the caller's (non-negative) position
01914         // Creates, in 'dest', a vector of 'src.Len() + 1' elements, where 'dest[i]' tells if there is a sentence
01915         // boundary between 'src[i - 1]' and 'src[i]'.  Note that 'dest[0]' and 'dest[src.Len()]' are
01916         // always set to 'true'.
01917         void FindSentenceBoundaries(const TIntV& src, TBoolV& dest) const { ucd.FindSentenceBoundaries(src, 0, src.Len(), dest); }
01918 
01919         void ClrSentenceBoundaryExceptions() { ucd.SbEx_Clr(); }
01920         void UseEnglishSentenceBoundaryExceptions() { ucd.SbEx_SetStdEnglish(); }
01921 
01922         //-------------------------------------------------------------------------
01923         // Normalization, decomposition, etc. (UAX #15)
01924         //-------------------------------------------------------------------------
01925 
01926         // This sets 'dest' to the decomposed form of the source string.
01927         // - for normalization form D (NFD), i.e. canonical decomposition: use compatibility == false;
01928         // - for normalization form KD (NFKD), i.e. compatibility decomposition: use compatibility == true.
01929         void Decompose(const TIntV& src, TIntV& dest, bool compatibility) const { ucd.Decompose(src, dest, compatibility, true); }
01930         // Canonical composition: 'dest' is overwritten with the composed form of 'src'
01931         // (the call is forwarded to ucd.Compose with clrDest == true).  'src' is expected
01932         // to be the output of a canonical or compatibility decomposition; in that
01933         // case the result is normalization form C (NFC) or normalization form KC
01934         // (NFKC) respectively, i.e. it depends on which kind of decomposition
01935         // produced 'src'.
01936         void Compose(const TIntV& src, TIntV& dest) const { ucd.Compose(src, dest, true); }
01937         // Decompose followed by Compose, forwarded to ucd.DecomposeAndCompose: the result
01938         // is the NFC (compatibility == false) or the NFKC (compatibility == true) of 'src'.
01939         // The intermediate NF(K)D form of the source string is held in a
01940         // temporary TIntV.
01941         void DecomposeAndCompose(const TIntV& src, TIntV& dest, bool compatibility) const { ucd.DecomposeAndCompose(src, dest, compatibility); }
01942         // Copies the starter characters from 'src' to 'dest'; the other
01943         // characters are skipped.  'src' should already have been decomposed.
01944         // Returns the number of characters extracted.  This function can be
01945         // used to remove diacritical marks from a string (after it has been decomposed!).
01946         int ExtractStarters(const TIntV& src, TIntV& dest) const { return (int) ucd.ExtractStarters(src, dest); }
01947         // Extracts the starters into a temporary vector and then copies it into 'src'.
01948         int ExtractStarters(TIntV& src) const { return (int) ucd.ExtractStarters(src); }
01949 
01950         //-------------------------------------------------------------------------
01951         // Case conversions
01952         //-------------------------------------------------------------------------
01953         // NOTE: if you will be dealing with Turkish, Azeri or Lithuanian text,
01954         // use the case-conversion methods in TUniChDb, which allow the caller
01955         // to request language-specific case mappings for these languages.
01956 
01957 public:
01958         typedef TUniChDb::TCaseConversion TCaseConversion;
01959         // Sets 'dest' to the case-converted form of 'src'.
01960         void GetLowerCase(const TIntV& src, TIntV& dest) const { ucd.GetLowerCase(src, dest, true, false, false); }
01961         void GetUpperCase(const TIntV& src, TIntV& dest) const { ucd.GetUpperCase(src, dest, true, false, false); }
01962         void GetTitleCase(const TIntV& src, TIntV& dest) const { ucd.GetTitleCase(src, dest, true, false, false); }
01963 
01964         // GetSimpleCaseConverted uses only the simple case mappings (from UnicodeData.txt).
01965         // This is simpler and faster.  Since each character now maps into exactly one
01966         // character, case conversion can also be done in place (see ToSimpleCaseConverted, etc.).
01967         void GetSimpleLowerCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleLowerCase(src, dest, true); }
01968         void GetSimpleUpperCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleUpperCase(src, dest, true); }
01969         void GetSimpleTitleCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleTitleCase(src, dest, true); }
01970 
01971         // These functions perform simple case-conversions in-place.
01972         void ToSimpleUpperCase(TIntV& src) const { ucd.ToSimpleUpperCase(src); }
01973         void ToSimpleLowerCase(TIntV& src) const { ucd.ToSimpleLowerCase(src); }
01974         void ToSimpleTitleCase(TIntV& src) const { ucd.ToSimpleTitleCase(src); }
01975 
01976         // Case folding is an alternative to the above functions.  It is intended primarily
01977         // to produce strings that are suitable for comparisons.  For example,
01978         // ToLowerCase(sigma) = sigma, ToLowerCase(final-sigma) = final-sigma;
01979         // but ToCaseFolded(sigma) = sigma, ToCaseFolded(final-sigma) = sigma.
01980         // - 'full' enables full case mappings -- i.e. sometimes a character may be mapped
01981         //   into a string of two or more characters.
01982         // - Note: For best results, perform NFD(CaseFold(NFD(x))) or NFKD(CaseFold(NFKD(x))) on
01983         //   each string before comparing them (see sec. 3.13 of the standard).
01984         void GetCaseFolded(const TIntV& src, TIntV& dest, const bool full = true) const { return ucd.GetCaseFolded(src, dest, true, full, false); }
01985         // ToCaseFolded folds the string in place.  However, this means that only the simple
01986         // case foldings can be used (the full ones could increase the length of the string).
01987         void ToCaseFolded(TIntV& src) const { return ucd.ToCaseFolded(src, false); }
01988 
01989         TStr GetUtf8CaseFolded(const TStr& s) const {
01990                 bool isAscii = true;
01991                 for (int i = 0, n = s.Len(); i < n; i++) if (uchar(s[i]) >= 128) { isAscii = false; break; }
01992                 if (isAscii) return s.GetLc();
01993                 TIntV src; DecodeUtf8(s, src);
01994                 TIntV dest; GetCaseFolded(src, dest);
01995                 return EncodeUtf8Str(dest); }
01996 
01997         //-------------------------------------------------------------------------
01998         // Character properties
01999         //-------------------------------------------------------------------------
02000         // These methods simply call the corresponding TUniChDb method
02001         // (which typically calls the corresponding method of TUniChInfo).
02002         // See the declaration for DECLARE_FORWARDED_PROPERTY_METHODS for a complete list.
02003         // They are all of the form        bool IsXxxx(const int cp) const
02004         // Some of the more notable ones include:
02005         // - IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsAsciiHexDigit
02006         //   IsDash, IsDeprecated, IsDiacritic, IsHexDigit, IsHyphen, IsIdeographic
02007         //   IsNoncharacter, IsQuotationMark, IsSoftDotted, IsTerminalPunctuation, IsWhiteSpace
02008 
02009 #define ___UniFwd1(name) bool name(const int cp) const { return ucd.name(cp); }
02010         DECLARE_FORWARDED_PROPERTY_METHODS
02011 #undef DECLARE_FORWARDED_PROPERTY_METHODS
02012 #undef __UniFwd1
02013         ___UniFwd2(IsPrivateUse, IsSurrogate)
02014 
02015         TUniChCategory GetCat(const int cp) const { return ucd.GetCat(cp); }
02016         TUniChSubCategory GetSubCat(const int cp) const { return ucd.GetSubCat(cp); }
02017 
02018         // GetCharName returns 0 if the name is unknown; GetCharNameS returns a string of the form "U+1234".
02019         const char *GetCharName(const int cp) const { return ucd.GetCharName(cp); }
02020         TStr GetCharNameS(const int cp) const { return ucd.GetCharNameS(cp); }
02021 
02022 };
02023 
02024 //-----------------------------------------------------------------------------
02025 // TUniCodec -- UTF-8 Decoder
02026 //-----------------------------------------------------------------------------
02027 
02028 // Returns the number of characters that have been successfully decoded.
02029 // This does not include any replacement characters that may have been inserted into 'dest'.
02030 template<typename TSrcVec, typename TDestCh>
02031 size_t TUniCodec::DecodeUtf8(
02032         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02033         TVec<TDestCh>& dest, const bool clrDest) const
02034 {
02035         size_t nDecoded = 0;
02036         if (clrDest) dest.Clr();
02037         const size_t origSrcIdx = srcIdx;
02038         const size_t srcEnd = srcIdx + srcCount;
02039         while (srcIdx < srcEnd)
02040         {
02041                 const size_t charSrcIdx = srcIdx;
02042                 uint c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;
02043                 if ((c & _1000_0000) == 0) {
02044                         // c is one of the characters 0..0x7f, encoded as a single byte.
02045                         dest.Add(TDestCh(c)); nDecoded++; continue; }
02046                 else if ((c & _1100_0000) == _1000_0000) {
02047                         // No character in a valid UTF-8-encoded string should begin with a byte of the form 10xxxxxx.
02048                         // We must have been thrown into the middle of a multi-byte character.
02049                         switch (errorHandling) {
02050                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 10xxxxxx.");
02051                         case uehAbort: return nDecoded;
02052                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02053                         case uehIgnore: continue;
02054                         default: Fail; } }
02055                 else
02056                 {
02057                         // c introduces a sequence of 2..6 bytes, depending on how many
02058                         // of the most significant bits of c are set.
02059                         uint nMoreBytes = 0, nBits = 0, minVal = 0;
02060                         if ((c & _1110_0000) == _1100_0000) nMoreBytes = 1, nBits = 5, minVal = 0x80;
02061                         else if ((c & _1111_0000) == _1110_0000) nMoreBytes = 2, nBits = 4, minVal = 0x800;
02062                         else if ((c & _1111_1000) == _1111_0000) nMoreBytes = 3, nBits = 3, minVal = 0x10000;
02063                         else if ((c & _1111_1100) == _1111_1000) nMoreBytes = 4, nBits = 2, minVal = 0x200000;
02064                         else if ((c & _1111_1110) == _1111_1100) nMoreBytes = 5, nBits = 1, minVal = 0x4000000;
02065                         else {
02066                                 // c is of the form 1111111x, which is invalid even in the early definitions of UTF-8
02067                                 // (which allowed the encoding of codepoints up to 2^31 - 1).  However, in principle this
02068                                 // could be used to encode 32-bit integers with the msb set: 1aaabbbbccccddddeeeeffffgggghhhh
02069                                 // could be encoded as 1111111a 10aabbbb 10ccccdd 10ddeeee 10ffffgg 10gghhhh.
02070                                 if (strict)  {
02071                                         switch (errorHandling) {
02072                                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 1111111x.");
02073                                         case uehAbort: return nDecoded;
02074                                         // In the case of uehReplace and uehIgnore, we'll read the next 5 bytes
02075                                         // and try to decode the character.  Then, since 'strict' is true and
02076                                         // the codepoint is clearly >= 2^31, we'll notice this as an error later
02077                                         // and (in the case of uehReplace) insert a replacement character then.
02078                                         // This is probably better than inserting a replacement character right
02079                                         // away and then trying to read the next byte as if a new character
02080                                         // was beginning there -- if the current byte is really followed by five
02081                                         // 10xxxxxx bytes, we'll just get six replacement characters in a row.
02082                                         case uehReplace: break; //dest.Add(TDestCh(replacementChar)); continue;
02083                                         case uehIgnore: break; // continue;
02084                                         default: Fail; } }
02085                                 nMoreBytes = 5; nBits = 2; minVal = 0x80000000u; }
02086                         // Decode this multi-byte sequence.
02087                         uint cOut = c & ((1 << nBits) - 1); // First extract the nBits least significant bits from c.
02088                         bool cancel = false;
02089                         for (uint i = 0; i < nMoreBytes && ! cancel; i++) {
02090                                 // See if there are enough bytes left in the source vector.
02091                                 if (! (srcIdx < srcEnd)) {
02092                                         switch (errorHandling) {
02093                                         case uehThrow: throw TUnicodeException(charSrcIdx, c, TInt::GetStr(nMoreBytes) + " more bytes expected, only " + TInt::GetStr(int(srcEnd - charSrcIdx - 1)) + " available.");
02094                                         case uehAbort: return nDecoded;
02095                                         case uehReplace: dest.Add(TDestCh(replacementChar)); cancel = true; continue;
02096                                         case uehIgnore: cancel = true; continue;
02097                                         default: Fail; } }
02098                                 // Read the next byte.
02099                                 c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;
02100                                 if ((c & _1100_0000) != _1000_0000) { // Each subsequent byte should be of the form 10xxxxxx.
02101                                         switch (errorHandling) {
02102                                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "Byte " + TInt::GetStr(i) + " of " + TInt::GetStr(nMoreBytes) + " extra bytes should begin with 10xxxxxx.");
02103                                         case uehAbort: return nDecoded;
02104                                         case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx--; cancel = true; continue;
02105                                         case uehIgnore: srcIdx--; cancel = true; continue;
02106                                         default: Fail; } }
02107                                 cOut <<= 6; cOut |= (c & _0011_1111); }
02108                         if (cancel) continue;
02109                         if (strict) {
02110                                 // err1: This codepoint has been represented by more bytes than it should have been.
02111                                 // For example, cOut in the range 0..127 should be represented by a single byte,
02112                                 // not by two or more bytes.
02113                                 // - For example, this may happen in the "modified UTF-8" sometimes used for Java
02114                                 // serialization, where the codepoint 0 is encoded as 11000000 10000000 to avoid
02115                                 // the appearance of null bytes in the encoded stream.
02116                                 bool err1 = (cOut < minVal);
02117                                 // err2: Early definitions of UTF-8 allowed any 31-bit integer to be encoded, using up to 6 bytes.
02118                                 // However, later this was restricted to the codepoints 0..0x10ffff only, because only these
02119                                 // are valid Unicode codepoints.  Thus, no more than 4 bytes are ever necessary.
02120                                 bool err2 = (nMoreBytes > 3 || (nMoreBytes == 3 && cOut > 0x10ffff));
02121                                 if (err1 || err2) switch (errorHandling) {
02122                                         case uehThrow:
02123                                                 if (err1) throw TUnicodeException(charSrcIdx, c, "The codepoint 0x" + TInt::GetStr(cOut, "%08x") + " has been represented by too many bytes (" + TInt::GetStr(nMoreBytes + 1) + ").");
02124                                                 else if (err2) throw TUnicodeException(charSrcIdx, c, "Invalid multibyte sequence: it decodes into 0x" + TInt::GetStr(cOut, "%08x") + ", but only codepoints 0..0x10ffff are valid.");
02125                                                 else { Fail; break; }
02126                                         case uehAbort: return nDecoded;
02127                                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02128                                         case uehIgnore: continue;
02129                                         default: Fail; } }
02130                         // Add the decoded codepoint to the destination vector.
02131                         // If this is the first decoded character, and it's one of the byte-order marks
02132                         // (0xfffe and 0xfeff), we will skip it (unless skipBom is false).
02133                         if (! (skipBom && (cOut == 0xfffe || cOut == 0xfeff) && charSrcIdx == origSrcIdx)) {
02134                                 dest.Add(cOut); nDecoded++; }
02135                 } // else (multi-byte sequence)
02136         } // while
02137         return nDecoded;
02138 }
02139 
02140 //-----------------------------------------------------------------------
02141 // TUniCodec -- UTF-8 Encoder
02142 //-----------------------------------------------------------------------
02143 
02144 // Returns the number of characters that have been successfully encoded.
02145 // This does not include any replacement characters that may have been inserted into 'dest'.
02146 template<typename TSrcVec, typename TDestCh>
02147 size_t TUniCodec::EncodeUtf8(
02148         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02149         TVec<TDestCh>& dest, const bool clrDest) const
02150 {
02151         size_t nEncoded = 0;
02152         for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
02153         {
02154                 uint c = uint(src[TVecIdx(srcIdx)]);
02155                 bool err = false;
02156                 if (strict && c > 0x10ffff) {
02157                         err = true;
02158                         switch (errorHandling) {
02159                         case uehThrow: throw TUnicodeException(srcIdx, c, "Invalid character (0x" + TInt::GetStr(c, "%x") + "; only characters in the range 0..0x10ffff are allowed).");
02160                         case uehAbort: return nEncoded;
02161                         case uehReplace: c = replacementChar; break;
02162                         case uehIgnore: continue;
02163                         default: Fail; } }
02164                 if (c < 0x80u)
02165                         dest.Add(TDestCh(c & 0xffu));
02166                 else if (c < 0x800u) {
02167                         dest.Add(TDestCh(_1100_0000 | ((c >> 6) & _0001_1111)));
02168                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02169                 else if (c < 0x10000u) {
02170                         dest.Add(TDestCh(_1110_0000 | ((c >> 12) & _0000_1111)));
02171                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02172                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02173                 else if (c < 0x200000u) {
02174                         dest.Add(TDestCh(_1111_0000 | ((c >> 18) & _0000_0111)));
02175                         dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
02176                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02177                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02178                 else if (c < 0x4000000u) {
02179                         dest.Add(TDestCh(_1111_1000 | ((c >> 24) & _0000_0011)));
02180                         dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
02181                         dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
02182                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02183                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02184                 else {
02185                         dest.Add(TDestCh(_1111_1100 | ((c >> 30) & _0000_0011)));
02186                         dest.Add(TDestCh(_1000_0000 | ((c >> 24) & _0011_1111)));
02187                         dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
02188                         dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
02189                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
02190                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
02191                 if (! err) nEncoded++;
02192         }
02193         return nEncoded;
02194 }
02195 
02196 //-----------------------------------------------------------------------
02197 // TUniCodec -- UTF-16 Decoder
02198 //-----------------------------------------------------------------------
02199 
02200 // Returns the number of characters that have been successfully decoded.
02201 // This does not include any replacement characters that may have been inserted into 'dest'.
02202 // Each element of 'src' is assumed to contain one byte of data.
02203 // srcCount must be even (though srcIdx doesn't need to be).
02204 template<typename TSrcVec, typename TDestCh>
02205 size_t TUniCodec::DecodeUtf16FromBytes(
02206         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02207         TVec<TDestCh>& dest, const bool clrDest,
02208         const TUtf16BomHandling bomHandling,
02209         const TUniByteOrder defaultByteOrder) const
02210 {
02211         IAssert(srcCount % 2 == 0);
02212         IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);
02213         IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);
02214         if (clrDest) dest.Clr();
02215         size_t nDecoded = 0;
02216         if (srcCount <= 0) return nDecoded;
02217         const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
02218         bool littleEndian = false;
02219   bool leDefault = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && IsMachineLittleEndian()));
02220         if (bomHandling == bomIgnored) littleEndian = leDefault;
02221         else if (bomHandling == bomAllowed || bomHandling == bomRequired)
02222         {
02223                 int byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff;
02224                 if (byte1 == 0xfe && byte2 == 0xff) { littleEndian = false; if (skipBom) srcIdx += 2; }
02225                 else if (byte1 == 0xff && byte2 == 0xfe) { littleEndian = true; if (skipBom) srcIdx += 2; }
02226                 else if (bomHandling == bomAllowed) littleEndian = leDefault;
02227                 else { // Report an error.
02228                         switch (errorHandling) {
02229                         case uehThrow: throw TUnicodeException(srcIdx, byte1, "BOM expected at the beginning of the input vector (" + TInt::GetStr(byte1, "%02x") + " " + TInt::GetStr(byte2, "%02x") + " found instead).");
02230                         case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);
02231                         default: Fail; } }
02232         }
02233         else Fail;
02234         while (srcIdx < srcEnd)
02235         {
02236                 const size_t charSrcIdx = srcIdx;
02237                 uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;
02238                 uint c = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
02239                 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)
02240                 {
02241                         // c is the first character in a surrogate pair.  Read the next character.
02242                         if (! (srcIdx + 2 <= srcEnd)) {
02243                                 switch (errorHandling) {
02244                                 case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");
02245                                 case uehAbort: return nDecoded;
02246                                 case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02247                                 case uehIgnore: continue;
02248                                 default: Fail; } }
02249                         uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;
02250                         uint c2 = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
02251                         // c2 should be the second character of the surrogate pair.
02252                         if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {
02253                                 switch (errorHandling) {
02254                                 case uehThrow: throw TUnicodeException(charSrcIdx + 2, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + ".");
02255                                 case uehAbort: return nDecoded;
02256                                 // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character
02257                                 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 2; continue;
02258                                 case uehIgnore: srcIdx -= 2; continue;
02259                                 default: Fail; } }
02260                         // c and c2 each contain 10 bits of information.
02261                         uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);
02262                         cc += 0x10000;
02263                         dest.Add(TDestCh(cc)); nDecoded++; continue;
02264                 }
02265                 else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
02266                         switch (errorHandling) {
02267                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");
02268                         case uehAbort: return nDecoded;
02269                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02270                         case uehIgnore: continue;
02271                         default: Fail; } }
02272                 // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.
02273                 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;
02274                 // Otherwise, store 'c' to the destination vector.
02275                 dest.Add(TDestCh(c)); nDecoded++;
02276         }
02277         return nDecoded;
02278 }
02279 
02280 // Here, each element of 'src' is treated as a 16-bit word.  The byte-order settings
02281 // are used to determine if the two bytes of each word should be swapped before further
02282 // processing.  For example, if a BOM is present, it must have the value 0xfeff; if it
02283 // actually has the value 0xfffe, this means that the two bytes of each word must be swapped.
02284 // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the
02285 // beginning of the source data is used to determine the "original" byte order of the data;
02286 // if this doesn't match the byte order of the local machine, the two bytes of each word will
02287 // be swapped during the decoding process.
02288 template<typename TSrcVec, typename TDestCh>
02289 size_t TUniCodec::DecodeUtf16FromWords(
02290         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02291         TVec<TDestCh>& dest, bool clrDest,
02292         const TUtf16BomHandling bomHandling,
02293         const TUniByteOrder defaultByteOrder) const
02294 {
02295         IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);
02296         IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);
02297         if (clrDest) dest.Clr();
02298         size_t nDecoded = 0;
02299         if (srcCount <= 0) return nDecoded;
02300         const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
02301         bool swap = false;
02302   bool isMachineLe = IsMachineLittleEndian();
02303         bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe));
02304         if (bomHandling == bomIgnored) swap = (isDefaultLe != isMachineLe);
02305         else if (bomHandling == bomAllowed || bomHandling == bomRequired)
02306         {
02307                 int c = uint(src[TVecIdx(srcIdx)]) & 0xffff;
02308                 if (c == 0xfeff) { swap = false; if (skipBom) srcIdx += 1; }
02309                 else if (c == 0xfffe) { swap = true; if (skipBom) srcIdx += 1; }
02310                 else if (bomHandling == bomAllowed) swap = (isMachineLe != isDefaultLe);
02311                 else { // Report an error.
02312                         switch (errorHandling) {
02313                         case uehThrow: throw TUnicodeException(srcIdx, c, "BOM expected at the beginning of the input vector (" + TInt::GetStr(c, "%04x") + " found instead).");
02314                         case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);
02315                         default: Fail; } }
02316         }
02317         else Fail;
02318         while (srcIdx < srcEnd)
02319         {
02320                 const size_t charSrcIdx = srcIdx;
02321                 uint c = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;
02322                 if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
02323                 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)
02324                 {
02325                         // c is the first character in a surrogate pair.  Read the next character.
02326                         if (! (srcIdx < srcEnd)) {
02327                                 switch (errorHandling) {
02328                                 case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");
02329                                 case uehAbort: return nDecoded;
02330                                 case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02331                                 case uehIgnore: continue;
02332                                 default: Fail; } }
02333                         uint c2 = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;
02334                         if (swap) c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8);
02335                         // c2 should be the second character of the surrogate pair.
02336                         if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {
02337                                 switch (errorHandling) {
02338                                 case uehThrow: throw TUnicodeException(charSrcIdx + 1, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + ".");
02339                                 case uehAbort: return nDecoded;
02340                                 // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character
02341                                 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 1; continue;
02342                                 case uehIgnore: srcIdx -= 1; continue;
02343                                 default: Fail; } }
02344                         // c and c2 each contain 10 bits of information.
02345                         uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);
02346                         cc += 0x10000;
02347                         dest.Add(TDestCh(cc)); nDecoded++; continue;
02348                 }
02349                 else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
02350                         switch (errorHandling) {
02351                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");
02352                         case uehAbort: return nDecoded;
02353                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
02354                         case uehIgnore: continue;
02355                         default: Fail; } }
02356                 // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.
02357                 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;
02358                 // Otherwise, store 'c' to the destination vector.
02359                 dest.Add(TDestCh(c)); nDecoded++;
02360         }
02361         return nDecoded;
02362 }
02363 
02364 //-----------------------------------------------------------------------
02365 // TUniCodec -- UTF-16 Encoder
02366 //-----------------------------------------------------------------------
02367 
02368 // Returns the number of characters that have been successfully encoded.
02369 // This does not include any replacement characters that may have been inserted into 'dest'.
02370 template<typename TSrcVec, typename TDestCh>
02371 size_t TUniCodec::EncodeUtf16ToWords(
02372         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02373         TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
02374         const TUniByteOrder destByteOrder) const
02375 {
02376         bool isMachineLe = IsMachineLittleEndian();
02377         bool swap = (destByteOrder == boLittleEndian && ! isMachineLe) || (destByteOrder == boBigEndian && isMachineLe);
02378         size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
02379         if (insertBom) { dest.Add(TDestCh(swap ? 0xfffeu : 0xfeffu)); nEncoded++; }
02380         while (srcIdx < srcEnd)
02381         {
02382                 uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;
02383                 if (! (c <= 0x10ffffu)) {
02384                         switch (errorHandling) {
02385                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");
02386                         case uehAbort: return nEncoded;
02387                         case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
02388                         case uehIgnore: continue;
02389                         default: Fail; } }
02390                 if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) {
02391                         switch (errorHandling) {
02392                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");
02393                         case uehAbort: return nEncoded;
02394                         case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
02395                         case uehIgnore: continue;
02396                         default: Fail; } }
02397                 if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) {
02398                         switch (errorHandling) {
02399                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");
02400                         case uehAbort: return nEncoded;
02401                         case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
02402                         case uehIgnore: continue;
02403                         default: Fail; } }
02404                 // If c is <= 0xffff, it can be stored directly.
02405                 if (c <= 0xffffu) {
02406                         if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
02407                         dest.Add(TDestCh(c)); nEncoded++; continue; }
02408                 // Otherwise, represent c by a pair of surrogate characters.
02409                 c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);
02410                 uint c1 = (c >> 10) & 1023, c2 = c & 1023;
02411                 c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate;
02412                 if (swap) {
02413                         c1 = ((c1 >> 8) & 0xff) | ((c1 & 0xff) << 8);
02414                         c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); }
02415                 dest.Add(TDestCh(c1));
02416                 dest.Add(TDestCh(c2));
02417                 nEncoded++; continue;
02418         }
02419         return nEncoded;
02420 }
02421 
02422 template<typename TSrcVec, typename TDestCh>
02423 size_t TUniCodec::EncodeUtf16ToBytes(
02424         const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02425         TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
02426         const TUniByteOrder destByteOrder) const
02427 {
02428         bool isDestLe = (destByteOrder == boLittleEndian || (destByteOrder == boMachineEndian && IsMachineLittleEndian()));
02429         size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
02430         if (insertBom) { dest.Add(isDestLe ? 0xff : 0xfe); dest.Add(isDestLe ? 0xfe : 0xff); nEncoded++; }
02431         while (srcIdx < srcEnd)
02432         {
02433                 uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;
02434                 if (! (c <= 0x10ffffu)) {
02435                         switch (errorHandling) {
02436                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");
02437                         case uehAbort: return nEncoded;
02438 #define ___OutRepl if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); }
02439                         case uehReplace: ___OutRepl; continue;
02440                         case uehIgnore: continue;
02441                         default: Fail; } }
02442                 if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) {
02443                         switch (errorHandling) {
02444                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");
02445                         case uehAbort: return nEncoded;
02446                         case uehReplace: ___OutRepl; continue;
02447                         case uehIgnore: continue;
02448                         default: Fail; } }
02449                 if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) {
02450                         switch (errorHandling) {
02451                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");
02452                         case uehAbort: return nEncoded;
02453                         case uehReplace: ___OutRepl; continue;
02454                         case uehIgnore: continue;
02455                         default: Fail; } }
02456 #undef ___OutRepl
02457                 // If c is <= 0xffff, it can be stored directly.
02458                 if (c <= 0xffffu) {
02459                         if (isDestLe) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); }
02460                         else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); }
02461                         nEncoded++; continue; }
02462                 // Otherwise, represent c by a pair of surrogate characters.
02463                 c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);
02464                 uint c1 = (c >> 10) & 1023, c2 = c & 1023;
02465                 c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate;
02466                 if (isDestLe) { dest.Add(c1 & 0xff); dest.Add((c1 >> 8) & 0xff); dest.Add(c2 & 0xff); dest.Add((c2 >> 8) & 0xff); }
02467                 else { dest.Add((c1 >> 8) & 0xff); dest.Add(c1 & 0xff); dest.Add((c2 >> 8) & 0xff); dest.Add(c2 & 0xff); }
02468                 nEncoded++; continue;
02469         }
02470         return nEncoded;
02471 }
02472 
02473 //-----------------------------------------------------------------------------
02474 // TUniChDb -- word boundaries
02475 //-----------------------------------------------------------------------------
02476 
// Advances 'position' to the next word boundary in src[srcIdx], ..., src[srcIdx + srcCount - 1].
// - If 'position' lies before srcIdx, it is set to srcIdx and true is returned (rule WB1).
// - If 'position' is at or beyond srcIdx + srcCount, it is left unchanged and false is returned.
// - Otherwise the rules WB3..WB14 below are tried on successive nonignored characters until a
//   break is allowed; 'position' is then set to the index following the break (or to the end of
//   the range, rule WB2) and true is returned.
// Characters for which IsWbIgnored() holds are treated as belonging to the preceding nonignored character.
02477 template<typename TSrcVec>
02478 bool TUniChDb::FindNextWordBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const
02479 {
02480         // WB1.  Break at the start of text.
02481         if (position < srcIdx) { position = srcIdx; return true; }
02482         // If we are beyond the end of the text, there aren't any word breaks left.
02483         const size_t srcEnd = srcIdx + srcCount;
02484         if (position >= srcEnd) return false;
02485         // If 'position' is currently at an ignored character, move it back to the last nonignored character
              // (or leave it where it was if there is no nonignored character before it).
02486         size_t origPos = position;
02487         if (IsWbIgnored(src[TVecIdx(position)])) {
02488                 if (! WbFindPrevNonIgnored(src, srcIdx, position))
02489                         position = origPos;
02490         }
02491         // Determine the previous nonignored character (before 'position').
              // If there is none, posPrev == position, which makes cPrev == -1 below.
02492         size_t posPrev = position;
02493         if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position;
02494         // Sec 6.2.  Allow a break between Sep and an ignored character.
02495         if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; }
02496         // Determine the next nonignored character (after 'position').
02497         size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd);
02498         size_t posNext2;
              // Code points (or -1 if there is no such character) and word-break flags of the
              // previous, current and next nonignored characters.
02499         int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1);
02500         int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1);
02501         int wbfPrev = GetWbFlags(cPrev), wbfCur = GetWbFlags(cCur), wbfNext = GetWbFlags(cNext);
02502         int cNext2, wbfNext2;
02503         // Slide the window (prev, cur, next, next2) forward by one nonignored character per iteration.
02504         for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2,
02505                                                            cPrev = cCur, cCur = cNext, cNext = cNext2,
02506                                                            wbfPrev = wbfCur, wbfCur = wbfNext, wbfNext = wbfNext2)
02507         {
02508                 // Should there be a word boundary between 'position' and 'posNext' (or, more accurately,
02509                 // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position'
02510                 // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)?
02511                 posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd);
02512                 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1);
02513                 wbfNext2 = GetWbFlags(cNext2);
                      // Each Test* macro moves on to the next character (i.e. forbids a break here)
                      // if all of the listed characters carry the given flags.
02514 #define TestCurNext(curFlag, nextFlag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
02515 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue
02516 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
02517                 // WB3.  Do not break within CRLF.
02518                 if (cCur == 13 && cNext == 10) continue;
02519                 // WB5.  Do not break between most letters:  ALetter [no break] ALetter.
02520                 TestCurNext(ucfWbALetter, ucfWbALetter);
02521                 // WB6.  Do not break letters across certain punctuation:  ALetter [no break] MidLetter ALetter.
02522                 TestCurNext2(ucfWbALetter, ucfWbMidLetter, ucfWbALetter);
02523                 // WB7.  Do not break letters across certain punctuation:  ALetter MidLetter [no break] ALetter.
02524                 TestPrevCurNext(ucfWbALetter, ucfWbMidLetter, ucfWbALetter);
02525                 // WB8.  Do not break within sequences of digits:  Numeric [no break] Numeric.
02526                 TestCurNext(ucfWbNumeric, ucfWbNumeric);
02527                 // WB9.  Do not break between a letter and a following digit:  ALetter [no break] Numeric.
02528                 TestCurNext(ucfWbALetter, ucfWbNumeric);
02529                 // WB10.  Do not break between a digit and a following letter:  Numeric [no break] ALetter.
02530                 TestCurNext(ucfWbNumeric, ucfWbALetter);
02531                 // WB11.  Do not break within sequences, such as "3.2" or "3.456,789":  Numeric MidNum [no break] Numeric.
02532                 TestPrevCurNext(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric);
02533                 // WB12.  Do not break within sequences, such as "3.2" or "3.456,789":  Numeric [no break] MidNum Numeric.
02534                 TestCurNext2(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric);
02535                 // WB13.  Do not break between Katakana.
02536                 TestCurNext(ucfWbKatakana, ucfWbKatakana);
02537                 // WB13a.  Do not break from extenders:  (ALetter | Numeric | Katakana | ExtendNumLet) [no break] ExtendNumLet.
02538                 if ((wbfCur & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana | ucfWbExtendNumLet)) != 0 &&
02539                         (wbfNext & ucfWbExtendNumLet) == ucfWbExtendNumLet) continue;
02540                 // WB13b.  Do not break from extenders:  ExtendNumLet [no break] (ALetter | Numeric | Katakana).
02541                 if ((wbfCur & ucfWbExtendNumLet) == ucfWbExtendNumLet &&
02542                         (wbfNext & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana)) != 0) continue;
02543                 // WB14.  Otherwise, break everywhere.
02544                 position = posNext; return true;
02545 #undef TestCurNext
02546 #undef TestCurNext2
02547 #undef TestPrevCurNext
02548         }
02549         // WB2.  Break at the end of text.
02550         IAssert(position == srcEnd);
02551         return true;
02552 }
02553 
02554 // ToDo: provide a more efficient implementation of this.
02555 template<typename TSrcVec>
02556 void TUniChDb::FindWordBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const
02557 {
02558         if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1));
02559         dest.PutAll(false);
02560         size_t position = srcIdx;
02561         dest[TVecIdx(position - srcIdx)] = true;
02562         while (position < srcIdx + srcCount)
02563         {
02564                 size_t oldPos = position;
02565                 FindNextWordBoundary(src, srcIdx, srcCount, position);
02566                 Assert(oldPos < position); Assert(position <= srcIdx + srcCount);
02567                 dest[TVecIdx(position - srcIdx)] = true;
02568         }
02569         Assert(dest[TVecIdx(srcCount)]);
02570 }
02571 
02572 //-----------------------------------------------------------------------------
02573 // TUniChDb -- sentence boundaries
02574 //-----------------------------------------------------------------------------
02575 
02576 template<typename TSrcVec>
02577 bool TUniChDb::CanSentenceEndHere(const TSrcVec& src, const size_t srcIdx, const size_t position) const
02578 {
02579         if (sbExTrie.Empty()) return true;
02580         // We'll move back from the position where a sentence-boundary is being considered.
02581         size_t pos = position;
02582         if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02583         int c = (int) src[TVecIdx(pos)]; int sfb = GetSbFlags(c);
02584         // - Skip the Sep, if there is one.
02585         if ((c & ucfSbSep) == ucfSbSep) {
02586                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02587                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
02588         // - Skip any Sp characters.
02589         while ((sfb & ucfSbSp) == ucfSbSp) {
02590                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02591                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
02592         // - Skip any Close characters.
02593         while ((sfb & ucfSbSp) == ucfSbSp) {
02594                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02595                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
02596         // - Skip any ATerm | STerm characters.
02597         while ((sfb & (ucfSbATerm | ucfSbSTerm)) != 0) {
02598                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
02599                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
02600         // Now start moving through the trie.
02601         int cLast = c, cButLast = -1, cButButLast = -1, len = 1, node = -1;
02602         while (true)
02603         {
02604                 bool atEnd = (! WbFindPrevNonIgnored(src, srcIdx, pos));
02605                 c = (atEnd ? -1 : (int) src[TVecIdx(pos)]);
02606                 TUniChCategory cat = GetCat(c);
02607                 if (atEnd || ! (cat == ucLetter || cat == ucNumber || cat == ucSymbol)) {
02608                         // Check if the suffix we've read so far is one of those that appear in the trie.
02609                         if (len == 1) return ! sbExTrie.Has1Gram(cLast);
02610                         if (len == 2) return ! sbExTrie.Has2Gram(cLast, cButLast);
02611                         IAssert(len >= 3); IAssert(node >= 0);
02612                         if (sbExTrie.IsNodeTerminal(node)) return false;
02613                         if (atEnd) return true; }
02614                 if (len == 1) { cButLast = c; len++; }
02615                 else if (len == 2) { cButButLast = c; len++;
02616                         // Now we have read the last three characters; start descending the suitable subtrie.
02617                         node = sbExTrie.Get3GramRoot(cLast, cButLast, cButButLast);
02618                         if (node < 0) return true; }
02619                 else {
02620                         // Descend down the trie.
02621                         node = sbExTrie.GetChild(node, c);
02622                         if (node < 0) return true; }
02623         }
02624         //return true;
02625 }
02626 
// Advances 'position' to the next sentence boundary in src[srcIdx], ..., src[srcIdx + srcCount - 1].
// - If 'position' lies before srcIdx, it is set to srcIdx and true is returned (rule SB1).
// - If 'position' is at or beyond srcIdx + srcCount, it is left unchanged and false is returned.
// - Otherwise the SB rules below are tried on successive nonignored characters until a break
//   is allowed and CanSentenceEndHere() accepts it; 'position' is then set to the index following
//   the break (or to the end of the range, rule SB2) and true is returned.
// Two small automata accompany the scan: 'backState' summarizes the
// "(STerm | ATerm) Close* Sp* Sep?" context ending at the current character, and 'aheadState'
// tells whether the next non-skippable character ahead is Lower.
02627 template<typename TSrcVec>
02628 bool TUniChDb::FindNextSentenceBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const
02629 {
02630         // SB1.  Break at the start of text.
02631         if (position < srcIdx) { position = srcIdx; return true; }
02632         // If we are beyond the end of the text, there aren't any sentence breaks left.
02633         const size_t srcEnd = srcIdx + srcCount;
02634         if (position >= srcEnd) return false;
02635         // If 'position' is currently at an ignored character, move it back to the last nonignored character
              // (or leave it where it was if there is no nonignored character before it).
02636         size_t origPos = position;
02637         if (IsWbIgnored(src[TVecIdx(position)])) {
02638                 if (! WbFindPrevNonIgnored(src, srcIdx, position))
02639                         position = origPos;
02640         }
02641         // Determine the previous nonignored character (before 'position').
              // If there is none, posPrev == position, which makes cPrev == -1 below.
02642         size_t posPrev = position;
02643         if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position;
02644         // Sec 6.2.  Allow a break between Sep and an ignored character.
02645         if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; }
02646         // Determine the next nonignored character (after 'position').
02647         size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd);
02648         size_t posNext2;
              // Code points (or -1 if there is no such character) and sentence-break flags of the
              // previous, current and next nonignored characters.
02649         int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1);
02650         int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1);
02651         int sbfPrev = GetSbFlags(cPrev), sbfCur = GetSbFlags(cCur), sbfNext = GetSbFlags(cNext);
02652         int cNext2, sbfNext2;
02653         // Initialize the state of the peek-back automaton by scanning backwards from 'position'
              // over at most one Sep, then Sp's, then Close's, and checking for an ATerm or STerm before them.
02654         typedef enum { stInit, stATerm, stATermSp, stATermSep, stSTerm, stSTermSp, stSTermSep } TPeekBackState;
02655         TPeekBackState backState;
02656         {
02657                 size_t pos = position;
02658                 bool wasSep = false, wasSp = false, wasATerm = false, wasSTerm = false;
                      // This loop runs at most once; it is only used so that 'break' can leave the scan early.
02659                 while (true)
02660                 {
02661                         if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break;
02662                         // Skip at most one Sep.
02663                         int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp);
02664                         if ((sbf & ucfSbSep) == ucfSbSep) {
02665                                 wasSep = true;
02666                                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break;
02667                                 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
02668                         // Skip zero or more Sp's.
02669                         bool stop = false;
02670                         while ((sbf & ucfSbSp) == ucfSbSp) {
02671                                 wasSp = true;
02672                                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; }
02673                                 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
02674                         if (stop) break;
02675                         // Skip zero or more Close's.
02676                         while ((sbf & ucfSbClose) == ucfSbClose) {
02677                                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; }
02678                                 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
02679                         if (stop) break;
02680                         // Process an ATerm or STerm.
02681                         wasATerm = ((sbf & ucfSbATerm) == ucfSbATerm);
02682                         wasSTerm = ((sbf & ucfSbSTerm) == ucfSbSTerm);
02683                         break;
02684                 }
02685                 if (wasATerm) backState = (wasSep ? stATermSep : wasSp ? stATermSp : stATerm);
02686                 else if (wasSTerm) backState = (wasSep ? stSTermSep : wasSp ? stSTermSp : stSTerm);
02687                 else backState = stInit;
02688         }
02689         // Initialize the state of the peek-ahead automaton.  This state tells us what follows
02690         // after we skip all contiguous characters from the complement of the set {OLetter, Upper, Lower, Sep, STerm, ATerm}.
02691         // Thus, the next character is either OLetter, Upper, Lower, Sep, STerm, ATerm, or the end of the input string.
02692         // Our peek-ahead automaton must tell us whether it is Lower or something else.
02693         typedef enum { stUnknown, stLower, stNotLower } TPeekAheadState;
02694         TPeekAheadState aheadState = stUnknown;
02695         // Slide the window (prev, cur, next, next2) forward by one nonignored character per iteration.
02696         for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2,
02697                                                            cPrev = cCur, cCur = cNext, cNext = cNext2,
02698                                                            sbfPrev = sbfCur, sbfCur = sbfNext, sbfNext = sbfNext2)
02699         {
02700                 // Should there be a sentence boundary between 'position' and 'posNext' (or, more accurately,
02701                 // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position'
02702                 // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)?
02703                 posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd);
02704                 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1);
02705                 sbfNext2 = GetSbFlags(cNext2);
02706                 // Update the peek-back automaton with the current character.  Trans(flag, state) switches
                      // to 'state' and leaves the switch if the current character has 'flag'; if no transition
                      // applies, every state other than stInit falls back to stInit.
02707 #define TestCur(curFlag) ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag)
02708 #define Trans(curFlag, newState) if (TestCur(curFlag)) { backState = st##newState; break; }
02709                 switch (backState) {
02710                         case stInit: Trans(ATerm, ATerm); Trans(STerm, STerm); break;
02711                         case stATerm: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, ATerm); backState = stInit; break;
02712                         case stSTerm: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, STerm); backState = stInit; break;
02713                         case stATermSp: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
02714                         case stSTermSp: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
02715                         case stATermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
02716                         case stSTermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
02717                         default: IAssert(false); }
02718 #undef Trans
02719 #undef TestCur
02720                 // Update the peek-ahead automaton.
02721 #define IsPeekAheadSkippable(sbf) ((sbf & (ucfSbOLetter | ucfSbUpper | ucfSbLower | ucfSbSep | ucfSbSTerm | ucfSbATerm)) == 0)
02722                 if (! IsPeekAheadSkippable(sbfCur)) {
02723                         bool isLower = ((sbfCur & ucfSbLower) == ucfSbLower);
02724                         if (aheadState == stLower) IAssert(isLower);
02725                         else if (aheadState == stNotLower) IAssert(! isLower);
02726                         // We haven't peeked ahead farther than this so far -- invalidate the state.
02727                         aheadState = stUnknown; }
02728                 if (aheadState == stUnknown)
02729                 {
02730                         // Peek ahead to the next non-peekahead-skippable character.
02731                         size_t pos = posNext;
02732                         while (pos < srcEnd) {
02733                                 int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp);
02734                                 if (! IsPeekAheadSkippable(sbf)) {
02735                                         if ((sbf & ucfSbLower) == ucfSbLower) aheadState = stLower;
02736                                         else aheadState = stNotLower;
02737                                         break; }
02738                                 WbFindNextNonIgnored(src, pos, srcEnd); }
                              // Reaching the end of the text counts as "not Lower".
02739                         if (! (pos < srcEnd)) aheadState = stNotLower;
02740                 }
02741 #undef IsPeekAheadSkippable
02742                 // Each Test* macro moves on to the next character (i.e. forbids a break here) if all of the
                      // listed characters carry the given flags.  TestCurNext2 is defined but not used below.
02743 #define TestCurNext(curFlag, nextFlag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
02744 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue
02745 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
02746                 // SB3.  Do not break within CRLF.
02747                 if (cCur == 13 && cNext == 10) continue;
02748                 // SB4.  Break after paragraph separators (unless CanSentenceEndHere() vetoes the break).
02749                 if ((sbfCur & ucfSbSep) == ucfSbSep) {
02750                         if (! CanSentenceEndHere(src, srcIdx, position)) continue;
02751                         position = posNext; return true; }
02752                 // Do not break after ambiguous terminators like period, if they are immediately followed by a number
02753                 // or lowercase letter, if they are between uppercase letters, or if the first following letter
02754                 // (optionally after certain punctuation) is lowercase.  For example, a period may be an abbreviation
02755                 // or numeric period, and thus may not mark the end of a sentence.
02756                 TestCurNext(ucfSbATerm, ucfSbNumeric); // SB6
02757                 TestPrevCurNext(ucfSbUpper, ucfSbATerm, ucfSbUpper); // SB7
02758                 // SB8a.  (STerm | ATerm) Close* Sp* [do not break] (STerm | ATerm)
02759                 if ((backState == stATerm || backState == stATermSp || backState == stSTerm || backState == stSTermSp) &&
02760                         (sbfNext & (ucfSbSTerm | ucfSbATerm)) != 0) continue;
02761                 // SB8*.  ATerm Close* Sp* [do not break] ( ! (OLetter | Upper | Lower | Sep | STerm | ATerm) )* Lower
02762                 if ((backState == stATerm || backState == stATermSp) && aheadState == stLower) continue;
02763                 // Break after sentence terminators, but include closing punctuation, trailing spaces, and a paragraph separator (if present).
02764                 // SB9. ( STerm | ATerm ) Close* [do not break] ( Close | Sp | Sep )
02765                 if ((backState == stATerm || backState == stSTerm) && (sbfNext & (ucfSbClose | ucfSbSp | ucfSbSep)) != 0) continue;
02766                 // SB10. ( STerm | ATerm ) Close* Sp* [do not break] ( Sp | Sep )
02767                 // SB11*. ( STerm | ATerm ) Close* Sp* Sep? [do break]
02768                 if (backState == stATerm || backState == stATermSp || backState == stATermSep || backState == stSTerm || backState == stSTermSp || backState == stSTermSep) {
02769                         if ((sbfNext & (ucfSbSp | ucfSbSep)) != 0) continue; // SB10
02770                         if (! CanSentenceEndHere(src, srcIdx, position)) continue;
02771                         position = posNext; return true; } // SB11
02772                 // SB12.  Otherwise, do not break.
02773                 continue;
02774 #undef TestCurNext
02775 #undef TestCurNext2
02776 #undef TestPrevCurNext
02777         }
02778         // SB2.  Break at the end of text.
02779         IAssert(position == srcEnd);
02780         return true;
02781 }
02782 
02783 // ToDo: provide a more efficient implementation of this.
02784 template<typename TSrcVec>
02785 void TUniChDb::FindSentenceBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const
02786 {
02787         if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1));
02788         dest.PutAll(false);
02789         size_t position = srcIdx;
02790         dest[TVecIdx(position - srcIdx)] = true;
02791         while (position < srcIdx + srcCount)
02792         {
02793                 size_t oldPos = position;
02794                 FindNextSentenceBoundary(src, srcIdx, srcCount, position);
02795                 Assert(oldPos < position); Assert(position <= srcIdx + srcCount);
02796                 dest[TVecIdx(position - srcIdx)] = true;
02797         }
02798         Assert(dest[TVecIdx(srcCount)]);
02799 }
02800 
02801 //-----------------------------------------------------------------------------
02802 // TUniChDb -- case conversions
02803 //-----------------------------------------------------------------------------
02804 
02805 template<typename TSrcVec, typename TDestCh>
02806 void TUniChDb::GetCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
02807                                                                 TVec<TDestCh>& dest, const bool clrDest,
02808                                                                 const TUniChDb::TCaseConversion how,
02809                                                                 const bool turkic, const bool lithuanian) const
02810 {
02811         const TIntIntVH &specials = (how == ccUpper ? specialCasingUpper : how == ccLower ? specialCasingLower : how == ccTitle ? specialCasingTitle : *((TIntIntVH *) 0));
02812         if (clrDest) dest.Clr();
02813         enum {
02814                 GreekCapitalLetterSigma = 0x3a3,
02815                 GreekSmallLetterSigma = 0x3c3,
02816                 GreekSmallLetterFinalSigma = 0x3c2,
02817                 LatinCapitalLetterI = 0x49,
02818                 LatinCapitalLetterJ = 0x4a,
02819                 LatinCapitalLetterIWithOgonek = 0x12e,
02820                 LatinCapitalLetterIWithGrave = 0xcc,
02821                 LatinCapitalLetterIWithAcute = 0xcd,
02822                 LatinCapitalLetterIWithTilde = 0x128,
02823                 LatinCapitalLetterIWithDotAbove = 0x130,
02824                 LatinSmallLetterI = 0x69,
02825                 CombiningDotAbove = 0x307
02826         };
02827         //
02828         bool seenCased = false, seenTwoCased = false; int cpFirstCased = -1;
02829         size_t nextWordBoundary = srcIdx;
02830         TBoolV wordBoundaries; bool wbsKnown = false;
02831         for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
02832         {
02833                 int cp = src[TVecIdx(srcIdx)]; srcIdx++;
02834                 //if (turkic && cp == 0x130 && how == ccLower) printf("!");
02835                 // For conversion to titlecase, the first cased character of each word
02836                 // must be converted to titlecase; everything else must be converted
02837                 // to lowercase.
02838                 TUniChDb::TCaseConversion howHere;
02839                 if (how != ccTitle) howHere = how;
02840                 else {
02841                         if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here.
02842                                 seenCased = false; seenTwoCased = false; cpFirstCased = -1;
02843                                 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
02844                                 IAssert(next > nextWordBoundary); nextWordBoundary = next; }
02845                         bool isCased = IsCased(cp);
02846                         if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; cpFirstCased = cp; }
02847                         else { howHere = ccLower;
02848                                 if (isCased && seenCased) seenTwoCased = true; }
02849                 }
02850                 // First, process the conditional mappings from SpecialCasing.txt.
02851                 // These will be processed in code -- they were ignored while
02852                 // we were reading SpecialCasing.txt itself.
02853                 if (cp == GreekCapitalLetterSigma && howHere == ccLower)
02854                 {
02855                         // SpecialCasing.txt mentions the 'FinalSigma' condition, but sec. 3.13 of
02856                         // the standard doesn't define it.  We'll use FinalCased instead.
02857                         // FinalCased: within the closest word boundaries containing C,
02858                         // there is a cased letter before C, and there is no cased letter after C.
02859                         //size_t nextBoundary = srcIdx - 1; FindNextWordBoundary(src, srcIdx, srcCount, nextBoundary);
02860                         if (! wbsKnown) { FindWordBoundaries(src, origSrcIdx, srcCount, wordBoundaries); wbsKnown = true; }
02861                         size_t srcIdx2 = srcIdx; bool casedAfter = false;
02862                         if (how == ccTitle)
02863                                 printf("!");
02864                         //while (srcIdx2 < nextBoundary)
02865                         while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)])
02866                         {
02867                                 int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
02868                                 if (IsCased(cp2)) { casedAfter = true; break; }
02869                         }
02870                         if (! casedAfter)
02871                         {
02872                                 //size_t prevBoundary = srcIdx - 1;
02873                                 //FindPreviousWordBoundary(src, srcIdx, srcCount, prevBoundary);
02874                                 srcIdx2 = srcIdx - 1; bool casedBefore = false;
02875                                 //while (prevBoundary < srcIdx2)
02876                                 while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)])
02877                                 {
02878                                         --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
02879                                         if (IsCased(cp2)) { casedBefore = true; break; }
02880                                 }
02881                                 if (casedBefore) {
02882                                         // Now we have a FinalCased character.
02883                                         dest.Add(GreekSmallLetterFinalSigma); Assert(howHere == ccLower); continue; }
02884                         }
02885                         // If we got here, add a non-final sigma.
02886                         dest.Add(GreekSmallLetterSigma); continue;
02887                 }
02888                 else if (lithuanian)
02889                 {
02890                         if (howHere == ccLower)
02891                         {
02892                                 if (cp == LatinCapitalLetterI || cp == LatinCapitalLetterJ || cp == LatinCapitalLetterIWithOgonek)
02893                                 {
02894                                         bool moreAbove = false;
02895                                         for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; )
02896                                         {
02897                                                 const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
02898                                                 const int cc2 = GetCombiningClass(cp2);
02899                                                 if (cc2 == TUniChInfo::ccStarter) break;
02900                                                 if (cc2 == TUniChInfo::ccAbove) { moreAbove = true; break; }
02901                                         }
02902                                         if (moreAbove)
02903                                         {
02904                                                 if (cp == LatinCapitalLetterI) { dest.Add(0x69); dest.Add(0x307); continue; }
02905                                                 if (cp == LatinCapitalLetterJ) { dest.Add(0x6a); dest.Add(0x307); continue; }
02906                                                 if (cp == LatinCapitalLetterIWithOgonek) { dest.Add(0x12f); dest.Add(0x307); continue; }
02907                                         }
02908                                 }
02909                                 else if (cp == LatinCapitalLetterIWithGrave) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x300); continue; }
02910                                 else if (cp == LatinCapitalLetterIWithAcute) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x301); continue; }
02911                                 else if (cp == LatinCapitalLetterIWithTilde) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x303); continue; }
02912                         }
02913                         if (cp == CombiningDotAbove)
02914                         {
02915                                 // Lithuanian, howHere != ccLower.
02916                                 // AfterSoftDotted := the last preceding character with a combining class
02917                                 // of zero before C was Soft_Dotted, and there is no intervening combining
02918                                 // character class 230 (ABOVE).
02919                                 bool afterSoftDotted = false;
02920                                 size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp'
02921                                 while (origSrcIdx < srcIdx2)
02922                                 {
02923                                         --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
02924                                         int cc2 = GetCombiningClass(cp2);
02925                                         if (cc2 == TUniChInfo::ccAbove) break;
02926                                         if (cc2 == TUniChInfo::ccStarter) {
02927                                                 afterSoftDotted = IsSoftDotted(cp2); break; }
02928                                 }
02929                                 if (afterSoftDotted)
02930                                 {
02931                                         Assert(lithuanian);
02932                                         // Remove DOT ABOVE after "i" with upper or titlecase.
02933                                         // - Note: but this must only be done if that "i" was actually placed into uppercase (if how == ccTitle,
02934                                         //   the "i" may have been kept lowercase and thus we shouldn't remove the dot).
02935                                         if (how == ccLower) { dest.Add(0x307); continue; }
02936                                         if (how == ccUpper) continue;
02937                                         Assert(how == ccTitle);
02938                                         Assert(howHere == ccLower); // because CombiningDotAbove is not a cased character
02939                                         if (seenCased && ! seenTwoCased) continue; // The "i" has been placed into uppercase; thus, remove the dot.
02940                                         dest.Add(0x307); continue;
02941                                 }
02942                         }
02943                 }
02944                 else if (turkic) // language code 'tr' (Turkish) and 'az' (Azeri)
02945                 {
02946                         // I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
02947                         // The following rules handle those cases.
02948                         if (cp == LatinCapitalLetterIWithDotAbove) {
02949                                 dest.Add(howHere == ccLower ? 0x69 : 0x130); continue; }
02950                         // When lowercasing, remove dot_above in the sequence I + dot_above,
02951                         // which will turn into i.  This matches the behavior of the
02952                         // canonically equivalent I-dot_above.
02953                         else if (cp == CombiningDotAbove)
02954                         {
02955                                 // AfterI: the last preceding base character was an uppercase I,
02956                                 // and there is no intervening combining character class 230 (ABOVE).
02957                                 bool afterI = false;
02958                                 size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp'
02959                                 while (origSrcIdx < srcIdx2)
02960                                 {
02961                                         --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
02962                                         if (cp2 == LatinCapitalLetterI) { afterI = true; break; }
02963                                         int cc2 = GetCombiningClass(cp2);
02964                                         if (cc2 == TUniChInfo::ccAbove || cc2 == TUniChInfo::ccStarter) break;
02965                                 }
02966                                 if (afterI) {
02967                                         if (how == ccTitle && seenCased && ! seenTwoCased) {
02968                                                 // Sec. 3.13 defines title-casing in an unusual way: find the first cased character in each word;
02969                                                 // if found, map it to titlecase; otherwise, map all characters in that word to lowercase.
02970                                                 // This suggests that if a cased character is found, others in that word should be left alone.
02971                                                 // This seems unusual; we map all other characters to lowercase instead.
02972                                                 // But this leads to problems with e.g. I followed by dot-above (U+0307): since the dot-above
02973                                                 // is not the first cased character (it isn't even cased), we attempt to set it to lowercase;
02974                                                 // but since afterI is also true here, this would mean deleting it.  Thus our titlecased
02975                                                 // form of "I followed by dot-above" would be just "I", which is clearly wrong.
02976                                                 // So we treat this as a special case here.
02977                                                 IAssert(cpFirstCased == LatinCapitalLetterI);
02978                                                 dest.Add(0x307); continue; }
02979                                         if (howHere != ccLower) dest.Add(0x307);
02980                                         continue; }
02981                         }
02982                         // When lowercasing, unless an I is before a dot_above,
02983                         // it turns into a dotless i.
02984                         else if (cp == LatinCapitalLetterI)
02985                         {
02986                                 // BeforeDot: C is followed by U+0307 (combining dot above).
02987                                 // Any sequence of characters with a combining class that is
02988                                 // neither 0 nor 230 may intervene between the current character
02989                                 // and the combining dot above.
02990                                 bool beforeDot = false;
02991                                 for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; )
02992                                 {
02993                                         const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
02994                                         if (cp2 == 0x307) { beforeDot = true; break; }
02995                                         const int cc2 = GetCombiningClass(cp2);
02996                                         if (cc2 == TUniChInfo::ccStarter || cc2 == TUniChInfo::ccAbove) break;
02997                                 }
02998                                 if (! beforeDot) {
02999                                         dest.Add(howHere == ccLower ? 0x131 : 0x49); continue; }
03000                         }
03001                         // When uppercasing, i turns into a dotted capital I.
03002                         else if (cp == LatinSmallLetterI)
03003                         {
03004                                 dest.Add(howHere == ccLower ? 0x69 : 0x130); continue;
03005                         }
03006                 }
03007                 // Try to use the unconditional mappings.
03008                 const TIntIntVH &specHere = (
03009                         howHere == how ? specials :
03010                         howHere == ccLower ? specialCasingLower :
03011                         howHere == ccTitle ? specialCasingTitle :
03012                         howHere == ccUpper ? specialCasingUpper : *((TIntIntVH *) 0));
03013                 int i = specHere.GetKeyId(cp);
03014                 if (i >= 0) { TUniCaseFolding::AppendVector(specHere[i], dest); continue; }
03015                 // Try to use the simple (one-character) mappings.
03016                 i = h.GetKeyId(cp);
03017                 if (i >= 0) {
03018                         const TUniChInfo &ci = h[i];
03019                         int cpNew = (
03020                                 howHere == ccLower ? ci.simpleLowerCaseMapping :
03021                                 howHere == ccUpper ? ci.simpleUpperCaseMapping :
03022                                                                          ci.simpleTitleCaseMapping);
03023                         if (cpNew < 0) cpNew = cp;
03024                         dest.Add(cpNew); continue; }
03025                 // As a final resort, leave 'cp' unchanged.
03026                 dest.Add(cp);
03027         }
03028 }
03029 
03030 template<typename TSrcVec, typename TDestCh>
03031 void TUniChDb::GetSimpleCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
03032         TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how) const
03033 {
03034         if (clrDest) dest.Clr();
03035         bool seenCased = false; size_t nextWordBoundary = srcIdx;
03036         for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
03037         {
03038                 const int cp = src[TVecIdx(srcIdx)]; srcIdx++;
03039                 int i = h.GetKeyId(cp); if (i < 0) { dest.Add(cp); continue; }
03040                 const TUniChInfo &ci = h[i];
03041                 // With titlecasing, the first cased character of each word must be put into titlecase,
03042                 // all others into lowercase.  This is what the howHere variable is for.
03043                 TUniChDb::TCaseConversion howHere;
03044                 if (how != ccTitle) howHere = how;
03045                 else {
03046                         if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here.
03047                                 seenCased = false;
03048                                 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
03049                                 IAssert(next > nextWordBoundary); nextWordBoundary = next; }
03050                         bool isCased = IsCased(cp);
03051                         if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; }
03052                         else howHere = ccLower;
03053                 }
03054                 int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping);
03055                 if (cpNew < 0) cpNew = cp;
03056                 dest.Add(cpNew);
03057         }
03058 }
03059 
03060 template<typename TSrcVec>
03061 void TUniChDb::ToSimpleCaseConverted(TSrcVec& src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const
03062 {
03063         bool seenCased = false; size_t nextWordBoundary = srcIdx;
03064         for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
03065         {
03066                 const int cp = src[TVecIdx(srcIdx)];
03067                 int i = h.GetKeyId(cp); if (i < 0) continue;
03068                 const TUniChInfo &ci = h[i];
03069                 // With titlecasing, the first cased character of each word must be put into titlecase,
03070                 // all others into lowercase.  This is what the howHere variable is for.
03071                 TUniChDb::TCaseConversion howHere;
03072                 if (how != ccTitle) howHere = how;
03073                 else {
03074                         if (srcIdx == nextWordBoundary) { // A word starts/ends here.
03075                                 seenCased = false;
03076                                 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
03077                                 IAssert(next > nextWordBoundary); nextWordBoundary = next; }
03078                         bool isCased = IsCased(cp);
03079                         if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; }
03080                         else howHere = ccLower;
03081                 }
03082                 int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping);
03083                 if (cpNew >= 0) src[TVecIdx(srcIdx)] = cpNew;
03084         }
03085 }
03086 
03087 //-----------------------------------------------------------------------------
03088 // TUniChDb -- composition, decomposition, normal forms
03089 //-----------------------------------------------------------------------------
03090 
03091 template<typename TDestCh>
03092 void TUniChDb::AddDecomposition(const int codePoint, TVec<TDestCh>& dest, const bool compatibility) const
03093 {
03094         if (HangulSBase <= codePoint && codePoint < HangulSBase + HangulSCount)
03095         {
03096                 // UAX #15, sec. 16: Hangul decomposition
03097                 const int SIndex = codePoint - HangulSBase;
03098                 const int L = HangulLBase + SIndex / HangulNCount;
03099                 const int V = HangulVBase + (SIndex % HangulNCount) / HangulTCount;
03100                 const int T = HangulTBase + (SIndex % HangulTCount);
03101                 dest.Add(L); dest.Add(V);
03102                 if (T != HangulTBase) dest.Add(T);
03103                 return;
03104         }
03105         int i = h.GetKeyId(codePoint); if (i < 0) { dest.Add(codePoint); return; }
03106         const TUniChInfo &ci = h[i];
03107         int ofs = ci.decompOffset; if (ofs < 0) { dest.Add(codePoint); return; }
03108         if ((! compatibility) && ci.IsCompatibilityDecomposition()) { dest.Add(codePoint); return; }
03109         while (true) {
03110                 int cp = decompositions[ofs++]; if (cp < 0) return;
03111                 AddDecomposition(cp, dest, compatibility); }
03112 }
03113 
03114 template<typename TSrcVec, typename TDestCh>
03115 void TUniChDb::Decompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
03116                 TVec<TDestCh>& dest, const bool compatibility, bool clrDest) const
03117 {
03118         if (clrDest) dest.Clr();
03119         const size_t destStart = dest.Len()/*, srcEnd = srcIdx + srcCount*/;
03120         // Decompose the string.
03121         while (srcIdx < srcCount) {
03122                 AddDecomposition(src[TVecIdx(srcIdx)], dest, compatibility); srcIdx++; }
03123         // Rearrange the decomposed string into canonical order.
03124         for (size_t destIdx = destStart, destEnd = dest.Len(); destIdx < destEnd; )
03125         {
03126                 size_t j = destIdx;
03127                 int cp = dest[TVecIdx(destIdx)]; destIdx++;
03128                 int cpCls = GetCombiningClass(cp);
03129                 if (cpCls == TUniChInfo::ccStarter) continue;
03130                 while (destStart < j && GetCombiningClass(dest[TVecIdx(j - 1)]) > cpCls) {
03131                         dest[TVecIdx(j)] = dest[TVecIdx(j - 1)]; j--; }
03132                 dest[TVecIdx(j)] = cp;
03133         }
03134 }
03135 
03136 template<typename TSrcVec, typename TDestCh>
03137 void TUniChDb::DecomposeAndCompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
03138                 TVec<TDestCh>& dest, bool compatibility, bool clrDest) const
03139 {
03140         if (clrDest) dest.Clr();
03141         TIntV temp;
03142         Decompose(src, srcIdx, srcCount, temp, compatibility);
03143         Compose(temp, 0, temp.Len(), dest, clrDest);
03144 }
03145 
03146 template<typename TSrcVec, typename TDestCh>
03147 void TUniChDb::Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
03148                 TVec<TDestCh>& dest, bool clrDest) const
03149 {
03150         if (clrDest) dest.Clr();
03151         bool lastStarterKnown = false; // has a starter been encountered yet?
03152         size_t lastStarterPos = size_t(-1);  // the index (in 'dest') of the last starter
03153         int cpLastStarter = -1; // the codepoint of the last starter (i.e. cpLastStarter == dest[lastStarterPos])
03154         const size_t srcEnd = srcIdx + srcCount;
03155         int ccMax = -1; // The highest combining class among the characters since the last starter.
03156         while (srcIdx < srcEnd)
03157         {
03158                 const int cp = src[TVecIdx(srcIdx)]; srcIdx++;
03159                 const int cpClass = GetCombiningClass(cp);
03160                 //int cpCombined = -1;
03161                 // If there is a starter with which 'cp' can be combined, and from which it is not blocked
03162                 // by some intermediate character, we can try to combine them.
03163                 if (lastStarterKnown && ccMax < cpClass)
03164                 {
03165                         int j = inverseDec.GetKeyId(TIntPr(cpLastStarter, cp));
03166                         int cpCombined = -1;
03167                         do {
03168                                 // Try to look up a composition in the inverseDec table.
03169                                 if (j >= 0) { cpCombined = inverseDec[j]; break; }
03170                                 // UAX #15, sec. 16: Hangul composition
03171                                 // - Try to combine L and V.
03172                                 const int LIndex = cpLastStarter - HangulLBase;
03173                                 if (0 <= LIndex && LIndex < HangulLCount) {
03174                                         const int VIndex = cp - HangulVBase;
03175                                         if (0 <= VIndex && VIndex < HangulVCount) {
03176                                                 cpCombined = HangulSBase + (LIndex * HangulVCount + VIndex) * HangulTCount;
03177                                                 break; } }
03178                                 // - Try to combine LV and T.
03179                                 const int SIndex = cpLastStarter - HangulSBase;
03180                                 if (0 <= SIndex && SIndex < HangulSCount && (SIndex % HangulTCount) == 0)
03181                                 {
03182                                         const int TIndex = cp - HangulTBase;
03183                                         if (0 <= TIndex && TIndex < HangulTCount) {
03184                                                 cpCombined = cpLastStarter + TIndex;
03185                                                 break; }
03186                                 }
03187                         } while (false);
03188                         // If a combining character has been found, use it to replace the old cpStarter.
03189                         if (cpCombined >= 0) {
03190                                 dest[TVecIdx(lastStarterPos)] = cpCombined;
03191                                 Assert(GetCombiningClass(cpCombined) == TUniChInfo::ccStarter);
03192                                 // if (cpCombined is not a starter) { starterKnown = false; lastStarterPos = size_t(01); cpLastStarter = -1; } else
03193                                 cpLastStarter = cpCombined; continue; }
03194                 }
03195                 if (cpClass == TUniChInfo::ccStarter) { // 'cp' is a starter, remember it for later.  Set ccMax to -1 so that this starter can be combined with another starter.
03196                         lastStarterKnown = true; lastStarterPos = dest.Len(); cpLastStarter = cp; ccMax = cpClass - 1; }
03197                 else if (cpClass > ccMax) // Remember cp's class as the new maximum class since the last starter (for blocking).
03198                         ccMax = cpClass;
03199                 dest.Add(cp);
03200         }
03201 }
03202 
03203 template<typename TSrcVec, typename TDestCh>
03204 size_t TUniChDb::ExtractStarters(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
03205                 TVec<TDestCh>& dest, bool clrDest) const
03206 {
03207         if (clrDest) dest.Clr();
03208         size_t retVal = 0;
03209         for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
03210                 const int cp = src[TVecIdx(srcIdx)];
03211                 if (GetCombiningClass(cp) == TUniChInfo::ccStarter)
03212                         { dest.Add(cp); retVal++; } }
03213         return retVal;
03214 }
03215 
// Returns false.  The value is obtained from a small run-time computation
// (0+1+2+3+4 compared against 100) instead of being written as a literal.
// NOTE(review): presumably this keeps the result from being an obvious
// compile-time constant where it is used -- confirm against the callers.
inline bool AlwaysFalse()
{
	int total = 0;
	int k = 0;
	while (k < 5) {
		total += k;
		++k; }
	return ! (total <= 100);
}
03222 
// Returns true.  The value is obtained from a small run-time computation
// (0+1+2+3+4 compared against 100) instead of being written as a literal.
// NOTE(review): presumably this keeps the result from being an obvious
// compile-time constant where it is used -- confirm against the callers.
inline bool AlwaysTrue()
{
	int total = 0;
	int k = 0;
	while (k < 5) {
		total += k;
		++k; }
	return ! (total >= 100);
}
03229 
03230 /*
03231 
03232 Notes on decomposition:
03233 
03234 - In UnicodeData.txt, there is a field with the decomposition mapping.
03235   This field may also include a tag, <...>.
03236   If there is a tag, this is a compatibility mapping.
03237   Otherwise it is a canonical mapping.
03238 - Canonical decomposition uses only canonical mappings,
03239   compatibility decomposition uses both canonical and compatibility mappings.
03240 - Decomposition:
03241   1. Apply the decomposition mappings (canonical or canonical+compatibility), recursively.
03242   2. Put the string into canonical order, which means:
03243      while there exists a pair of characters, A immediately followed by B,
03244          such that combiningclass(A) > combiningclass(B) > 0  [an "exchangeable pair"]:
03245            swap A and B;
03246   This results in NFD (normalized form D, after canonical decomposition)
03247   or NFKD (normalized form KD, after compatibility decomposition).
03248 - Canonical composition:
03249   1. Before composition, the string should have been decomposed
03250      (using either canonical or compatibility decomposition).
03251   2. For each character C (from left to right):
03252      2.1.  Find the last starter S before C (if not found, continue).
03253          2.2.  If there is, between S and C, some character with a combining class >= than that of C, then continue.
03254          2.3.  If there exists a character L for which the canonical decomposition is S+L
03255                and L is not in the composition exclusion table [i.e. L is a "primary composite"],
03256                    then replace S by L, and remove C.
03257   This results in NFC (normalized form C, with canonical decomposition followed by canonical composition)
03258   or NFKC (normalized form KC, with compatibility decomposition followed by canonical composition).
03259 - Composition exclusion table:
03260   - Anything in CompositionExclusions.txt.
03261   - Singletons: characters whose canonical decomposition is a single character.
03262   - Non-starter decompositions: characters whose canonical decomposition begins with a non-starter.
03263 
03264 Example:
03265                  E-grave  (00c8; composition class 0; canonical decomposition: 0045 0300)
03266                                  E-macron (0112; composition class 0;                          0045 0304)
03267                                  grave   (0300; composition class 230)
03268                  macron  (0304; composition class 230)
03269   source string: 00c8 0304
03270   after canonical decomposition (or compatibility decomposition, they would be the same here): 0045 0300 0304
03271   after canonical composition: 00c8 0304
03272 
03273   cc(horn) = 216
03274   cc(dot below) = 220
03275   cc(dot above) = 230
03276 
03277 ToDos:
03278 - case folding - is intended mainly for comparing the strings obtained this way.
03279   The function f(s) = NFC(toCaseFold(s)) is idempotent.
03280   The function g(s) = NFKC(toCaseFold(s)), however, is not -- if we want that, the folding
03281   has to take a few additional mappings into account (see 5.18, last paragraph; DerivedNormalizationProps.txt).
03282 - It seems that CaseFolding.txt is essentially just plain folding to lowercase.
03283   Since we also want to have the other foldings, we should rather look at SpecialCasing.txt
03284   (+ the simple case mappings in UnicodeData.txt).
03285   I suggest that we simply ignore the conditional mappings while reading SpecialCasing.txt
03286   and then handle them separately, directly in the source code of our programs [for a
03287   detailed definition of the conditions, see table 3.13].
03288   - Postscript: it still seems to me that CaseFolding.txt is something slightly different from plain lowercase.
03289     For example, for small final sigma 03c2 it says there that it should be changed into the ordinary small sigma 03c3.
03290         This follows neither from UnicodeData.txt nor from SpecialCasing.txt, even though UCD.html says
03291         that CaseFolding.txt is derived from the two of them.  The main purpose of CaseFolding.txt is supposedly
03292         to serve "locale-independent case folding" (table 4.1 and sec. 5.18).
03293   - Before you start dealing with case conversions, have a look at section 3.13,
03294     and especially p. 90.
03295   - See p. 91 about the combination N[K]FD + caseFold + N[K]FD
03296   - the definition of cased etc. is on p. 89
03297 - isIdentifierStart(c), isIdentifierEnd(c) -- sec. 5.15
03298   See DerivedCoreProperties.txt, where a bunch of similar things are defined in a similar
03299   way, among them isLowerCase and isUpperCase.  isLetter, isAlphabetic etc. are there as well (sec. 4.9).
03300   These are best added to the flags of the individual character.
03301 - general category: sec. 4.5
03302 - motivation for titlecase: 5.18
03303 - compare our current computation of compositionExclusion with what is computed in DerivedNormalizationProps.txt
03304   under Full_Composition_Exclusion
03305 - script names: Scripts.txt and UAX #24.
03306 - block names: Blocks.txt
03307 - space characters: table 6.2 and reportedly also UCD.html
03308 - dash characters: table 6.3
03309 */
03310 
03311 //#endif
03312