dd/d90/unicode_8h_source.html

 #include "bd.h"


 //#ifndef unicode_h

 //#define unicode_h


 // Includes

 //#include "base.h"

 #include <new>


 typedef int TUniVecIdx;


 //-----------------------------------------------------------------------------

 // TUniCodec -- an UTF-8 and UTF-16 Encoder/Decoder

 //-----------------------------------------------------------------------------


 // Error handling modes for the TUniCodec class.

 typedef enum TUnicodeErrorHandling_

 {

         // What happens when an error occurs:

         uehIgnore = 0,  // - it is silently ignored (nothing is added to the output vector)

         uehThrow = 1,   // - an exception is thrown (TUnicodeException)

         uehReplace = 2, // - the replacement character is added to the output vector

         uehAbort = 3    // - the encoding/decoding process stops immediately

 }

 TUnicodeErrorHandling;


 class TUnicodeException

 {

 public:

         TStr message;  // error message

         size_t srcIdx; // the position in the source vector where the error occurred

         int srcChar;   // the source character at the position srcIdx

         TUnicodeException(size_t srcIdx_, int srcChar_, const TStr& message_) :

                 message(message_), srcIdx(srcIdx_), srcChar(srcChar_) { }

 };


 typedef enum TUniByteOrder_

 {

         boMachineEndian = 0,

         boLittleEndian = 1,

         boBigEndian = 2

 }

 TUniByteOrder;


 typedef enum TUtf16BomHandling_

 {

         bomAllowed = 0,   // if a BOM is present, it is used to determine the byte order; otherwise, the default byte order is used

         bomRequired = 1,  // if a BOM is present, it is used to determine the byte order; otherwise, an error is reported

         bomIgnored = 2    // the default byte order is used; if a BOM is present, it is treated like any other character

 }

 TUtf16BomHandling;


 class TUniCodec

 {

 public:

         // 0xfffd is defined as the replacement character by the Unicode standard.

         // By default, it is rendered as a question mark inside a diamond: "<?>".

         enum { DefaultReplacementChar = 0xfffd };


         // The replacement character is inserted into the destination vector

         // if an error occurs in the source vector.  By default, this is set

         // to DefaultReplacementChar.

         int replacementChar;

         // The error handling mode.

         TUnicodeErrorHandling errorHandling;

         // There are a number of situations where there is strictly speaking an error in

         // the source data although it can still be decoded in a reasonably meaningful way.

         // If strict == true, these situations are treated as errors.  Examples:

         // - when decoding UTF-8:

         //   - a codepoint represented by more bytes than necessary (e.g. one of the characters 0..127

         //     encoded as a two-byte sequence)

         //   - a codepoint > 0x10ffff

         // - when decoding UTF-16:

         //   - a codepoint from the range reserved for the second character of a surrogate pair

         //     is not preceded by a codepoint from the range reserved for the first character of a surrogate pair

         // - when encoding UTF-8:

         //   - a codepoint > 0x10ffff

         // - when encoding UTF-16:

         //   - a codepoint from the range reserved from the second character of a surrogate pair

         //     [note that a codepoint > 0x10ffff, or from the range reserved for the first character of a

         //     surrogate pair, is always an error, even with strict == false]

         bool strict;

         // skipBom == true means: If a byte-order-mark (0xfffe or 0xfeff) occurs at the beginning

         // of the source vector, it is skipped (when decoding).

         // - Note: a BOM is not really useful in UTF-8 encoded data.  However, the .NET UTF8Encoding

         //   emits 0xfeff by default as a kind of preamble.  It gets encoded as 3 bytes, ef bb bf,

         //   and can be helpful to make the data easier to recognize as UTF-8 encoded data.

         bool skipBom;


         TUniCodec() : replacementChar(DefaultReplacementChar), errorHandling(uehIgnore), strict(false), skipBom(true)

         {

         }


         TUniCodec(TUnicodeErrorHandling errorHandling_, bool strict_, int replacementChar_, bool skipBom_) :

                 replacementChar(replacementChar_), errorHandling(errorHandling_), strict(strict_), skipBom(skipBom_)

         {

         }


 protected:

         enum {

 #define DefineByte(b7, b6, b5, b4, b3, b2, b1, b0) _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0

                 DefineByte(1, 0, 0, 0, 0, 0, 0, 0),

                 DefineByte(1, 1, 0, 0, 0, 0, 0, 0),

                 DefineByte(1, 1, 1, 0, 0, 0, 0, 0),

                 DefineByte(1, 1, 1, 1, 0, 0, 0, 0),

                 DefineByte(1, 1, 1, 1, 1, 0, 0, 0),

                 DefineByte(1, 1, 1, 1, 1, 1, 0, 0),

                 DefineByte(1, 1, 1, 1, 1, 1, 1, 0),

                 DefineByte(0, 0, 1, 1, 1, 1, 1, 1),

                 DefineByte(0, 0, 0, 1, 1, 1, 1, 1),

                 DefineByte(0, 0, 0, 0, 1, 1, 1, 1),

                 DefineByte(0, 0, 0, 0, 0, 1, 1, 1),

                 DefineByte(0, 0, 0, 0, 0, 0, 1, 1)

 #undef DefineByte

         };


         typedef TUniVecIdx TVecIdx;

         //friend class TUniChDb;

         friend class TUniCaseFolding;

         friend class TUnicode;


 public:


         //-----------------------------------------------------------------------

         // UTF-8

         //-----------------------------------------------------------------------


         // Returns the number of characters that have been successfully decoded.

         // This does not include any replacement characters that may have been inserted into 'dest'.

         template<typename TSrcVec, typename TDestCh>

         size_t DecodeUtf8(

                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,

                 TVec<TDestCh>& dest, const bool clrDest = true) const;

         template<typename TSrcVec, typename TDestCh>

         size_t DecodeUtf8(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return DecodeUtf8(src, 0, src.Len(), dest, clrDest); }


         // Returns the number of characters that have been successfully encoded.

         // This does not include any replacement characters that may have been inserted into 'dest'.

         template<typename TSrcVec, typename TDestCh>

         size_t EncodeUtf8(

                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,

                 TVec<TDestCh>& dest, const bool clrDest = true) const;

         template<typename TSrcVec, typename TDestCh>

         size_t EncodeUtf8(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return EncodeUtf8(src, 0, src.Len(), dest, clrDest); }


         // The following wrappers around the UTF-8 encoder return a TStr containing

         // the UTF-8-encoded version of the input string.

         template<typename TSrcVec> TStr EncodeUtf8Str(const TSrcVec& src, size_t srcIdx, const size_t srcCount) const { TVec<char> temp; EncodeUtf8(src, srcIdx, srcCount, temp); TStr retVal = &(temp[0]); return retVal; }

         template<typename TSrcVec> TStr EncodeUtf8Str(const TSrcVec& src) const { TVec<char> temp; EncodeUtf8(src, temp); temp.Add(0); TStr retVal = &(temp[0]); return retVal; }


         //-----------------------------------------------------------------------

         // UTF-16 Decoder

         //-----------------------------------------------------------------------


 protected:

         enum {

                 Utf16FirstSurrogate = 0xd800,

                 Utf16SecondSurrogate = 0xdc00

         };


         static bool IsMachineLittleEndian();


 public:


         // Returns the number of characters that have been successfully decoded.

         // This does not include any replacement characters that may have been inserted into 'dest'.

         // Each element of 'src' is assumed to contain one byte of data.

         // srcCount must be even (though srcIdx doesn't need to be).

         template<typename TSrcVec, typename TDestCh>

         size_t DecodeUtf16FromBytes(

                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,

                 TVec<TDestCh>& dest, const bool clrDest,

                 const TUtf16BomHandling bomHandling = bomAllowed,

                 const TUniByteOrder defaultByteOrder = boMachineEndian) const;


         // Here, each element of 'src' is treated as a 16-bit word.  The byte-order settings

         // are used to determine if the two bytes of each word should be swapped before further

         // processing.  For example, if a BOM is present, it must have the value 0xfeff; if it

         // actually has the value 0xfffe, this means that the two bytes of each word must be swapped.

         // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the

         // beginning of the source data is used to determine the "original" byte order of the data;

         // if this doesn't match the byte order of the local machine, the two bytes of each word will

         // be swapped during the decoding process.

         template<typename TSrcVec, typename TDestCh>

         size_t DecodeUtf16FromWords(

                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,

                 TVec<TDestCh>& dest, bool clrDest,

                 const TUtf16BomHandling bomHandling = bomAllowed,

                 const TUniByteOrder defaultByteOrder = boMachineEndian) const;


         //-----------------------------------------------------------------------

         // UTF-16 Encoder

         //-----------------------------------------------------------------------


         // Returns the number of characters that have been successfully encoded.

         // This does not include any replacement characters that may have been inserted into 'dest'.

         //

         // Notes:

         // - UTF-16 *cannot* encode characters above 0x10ffff, so their presence is always

         //   treated as an error, regardless of the value of 'strict'.

         // - Characters from the range Utf16FirstSurrogate through Utf16FirstSurrogate + 1023

         //   cannot be encoded by UTF-16 either, as they would be misinterpreted during decoding

         //   as the first character of a surrogate pair.

         // - Characters from the range Utf16SecondSurrogate through Utf16SecondSurrogate + 1023

         //   can be encoded in principle; however, if strict == true, they are treated as errors.

         template<typename TSrcVec, typename TDestCh>

         size_t EncodeUtf16ToWords(

                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,

                 TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,

                 const TUniByteOrder destByteOrder = boMachineEndian) const;


         template<typename TSrcVec, typename TDestCh>

         size_t EncodeUtf16ToBytes(

                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,

                 TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,

                 const TUniByteOrder destByteOrder = boMachineEndian) const;


         //-----------------------------------------------------------------------

         // Helper declarations for the test drivers

         //-----------------------------------------------------------------------


 protected:


         static uint GetRndUint(TRnd& rnd);

         static uint GetRndUint(TRnd& rnd, uint minVal, uint maxVal);


         //-----------------------------------------------------------------------

         // UTF-8 Test Driver

         //-----------------------------------------------------------------------


 protected:

         void TestUtf8(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest, FILE *f);

         // Generates a random UTF-8-encoded stream according to the specifications in 'testCaseDesc',

         // then calls TestUtf8 to make sure that DecodeUtf8 reacts as expected.

         void TestDecodeUtf8(TRnd& rnd, const TStr& testCaseDesc);

 public:

         void TestUtf8();


         //-----------------------------------------------------------------------

         // UTF-16 Test Driver

         //-----------------------------------------------------------------------


 protected:

         void WordsToBytes(const TIntV& src, TIntV& dest);

         void TestUtf16(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest,

                 // Note: insertBom is only used with the encoder.  When encoding, 'defaultByteOrder' is used as the destination byte order.

                 const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom,

                 FILE *f);

         static inline int SwapBytes(int x) {

                 return ((x >> 8) & 0xff) | ((x & 0xff) << 8); }

         // Generates a random UTF-16-encoded stream according to the specifications in 'testCaseDesc',

         // then calls TestUtf16 to make sure that DecodeUtf16 reacts as expected.

         void TestDecodeUtf16(TRnd& rnd, const TStr& testCaseDesc,

                 const TUtf16BomHandling bomHandling,

                 const TUniByteOrder defaultByteOrder,

                 const bool insertBom);

 public:

         void TestUtf16();


 };


 //-----------------------------------------------------------------------------

 // Case folding

 //-----------------------------------------------------------------------------

 // Note: there's no need to access this class directly.

 // Use TUniChDb::GetCaseFolded() instead.


 typedef THash<TInt, TIntV> TIntIntVH;


 class TUniCaseFolding

 {

 protected:

         TIntH cfCommon, cfSimple, cfTurkic;

         TIntIntVH cfFull;


         template<typename TSrcDat, typename TDestDat>

         inline static void AppendVector(const TVec<TSrcDat>& src, TVec<TDestDat>& dest) {

                 for (int i = 0; i < src.Len(); i++) dest.Add(src[i]); }

         friend class TUniChDb;

         typedef TUniVecIdx TVecIdx;


 public:

         TUniCaseFolding() { }

         explicit TUniCaseFolding(TSIn& SIn) : cfCommon(SIn), cfSimple(SIn), cfTurkic(SIn), cfFull(SIn) { SIn.LoadCs(); }

         void Load(TSIn& SIn) { cfCommon.Load(SIn); cfSimple.Load(SIn); cfFull.Load(SIn); cfTurkic.Load(SIn); SIn.LoadCs(); }

         void Save(TSOut& SOut) const { cfCommon.Save(SOut); cfSimple.Save(SOut); cfFull.Save(SOut); cfTurkic.Save(SOut); SOut.SaveCs(); }

         void Clr() { cfCommon.Clr(); cfSimple.Clr(); cfFull.Clr(); cfTurkic.Clr(); }

         void LoadTxt(const TStr& fileName);


         // Use 'turkic' when processing text in a Turkic language (tr, az).  This only affects the uppercase I and I-with-dot-above.

         template<typename TSrcVec, typename TDestCh>

         void Fold(const TSrcVec& src, size_t srcIdx, const size_t srcCount,

                 TVec<TDestCh>& dest, const bool clrDest, const bool full, const bool turkic) const

         {

                 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )

                 {

                         int c = src[TVecIdx(srcIdx)], i; srcIdx++;

                         if (turkic && ((i = cfTurkic.GetKeyId(c)) >= 0)) { dest.Add(cfTurkic[i]); continue; }

                         if (full && ((i = cfFull.GetKeyId(c)) >= 0)) { AppendVector(cfFull[i], dest); continue; }

                         if ((! full) && ((i = cfSimple.GetKeyId(c)) >= 0)) { dest.Add(cfSimple[i]); continue; }

                         i = cfCommon.GetKeyId(c); if (i >= 0) dest.Add(cfCommon[i]); else dest.Add(c);

                 }

         }


         template<typename TSrcVec>

         void FoldInPlace(TSrcVec& src, size_t srcIdx, const size_t srcCount, const bool turkic) const

         {

                 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)

                 {

                         int c = src[TVecIdx(srcIdx)], i;

                         if (turkic && ((i = cfTurkic.GetKeyId(c)) >= 0)) { src[TVecIdx(srcIdx)] = cfTurkic[i]; continue; }

                         if ((i = cfSimple.GetKeyId(c)) >= 0) { src[TVecIdx(srcIdx)] = cfSimple[i]; continue; }

                         i = cfCommon.GetKeyId(c); if (i >= 0) src[TVecIdx(srcIdx)] = cfCommon[i];

                 }

         }


 protected:

         void Test(const TIntV& src, const TIntV& expectedDest, const bool full, const bool turkic, FILE *f);

 public:

         void Test();

 };


 //-----------------------------------------------------------------------------

 // TCodecBase -- an abstract base class for codecs

 //-----------------------------------------------------------------------------


 class TCodecBase;

 typedef TPt<TCodecBase> PCodecBase;

 typedef TVec<PCodecBase> TCodecBaseV;


 class TCodecBase

 {

 protected:

         TCRef CRef;

         friend class TPt<TCodecBase>;

 public:

         virtual ~TCodecBase() { }


         template<class TCodecImpl>

         static PCodecBase New(); /* {

                 return new TCodecWrapper<TCodecImpl>(); } */


         virtual TStr GetName() const = 0;

         virtual void Test() const { }


         // Returns the number of characters that have been successfully decoded.

         // This does not include any replacement characters that may have been inserted into 'dest'.

         virtual size_t ToUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;

         virtual size_t ToUnicode(const TStr& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;


         size_t ToUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }

         size_t ToUnicode(const TStr& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }


         // Returns the number of characters that have been successfully encoded.

         // This does not include any replacement characters that may have been inserted into 'dest'.

         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;

         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TChA& dest, const bool clrDest = true) const = 0;

         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const = 0;


         size_t FromUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }

         size_t FromUnicode(const TIntV& src, TChA& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }

         size_t FromUnicode(const TIntV& src, TStr& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }

 };


 //-----------------------------------------------------------------------------

 // TCodecWrapper -- a descendant of TCodecBase; relies on a template

 // parameter class for the actual implementation of the codec.

 //-----------------------------------------------------------------------------

 // Thus, if you know in advance that you'll need ISO-8859-2, just use

 // T8BitCodec<TEncoding_ISO8859_2>.  If you don't know the encoding

 // in advance, use a PCodecBase pointing to a suitable specialization

 // of TCodecWrapper<...>.  You can TUnicode::GetCodec(TStr& name)

 // to obtain a suitable pointer.


 template<class TCodecImpl_>

 class TCodecWrapper : public TCodecBase

 {

 public:

         typedef TCodecImpl_ TCodecImpl;

         TCodecImpl impl;

 public:


         virtual TStr GetName() const { return impl.GetName(); }


         virtual void Test() const { impl.Test(); }


         virtual size_t ToUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {

                 return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); }

         virtual size_t ToUnicode(const TStr& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {

                 return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); }


         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {

                 return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); }

         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TChA& dest, const bool clrDest = true) const {

                 return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); }

         virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const {

                 TChA buf; size_t retVal = impl.FromUnicode(src, srcIdx, srcCount, buf, false);

                 if (clrDest) dest += buf.CStr(); else dest = buf.CStr();

                 return retVal; }

 };


 template<class TCodecImpl>

 PCodecBase TCodecBase::New() {

   return new TCodecWrapper<TCodecImpl>();

 }


 //-----------------------------------------------------------------------------

 // TVecElt -- a template for determining the type of a vector's elements

 //-----------------------------------------------------------------------------


 template<class TVector_>

 class TVecElt

 {

 };


 template<class TDat>

 class TVecElt<TVec<TDat> >

 {

 public:

         typedef TVec<TDat> TVector;

         typedef TDat TElement;

         static inline void Add(TVector& vector, const TElement& element) { vector.Add(element); }

 };


 template<>

 class TVecElt<TChA>

 {

 public:

         typedef TChA TVector;

         typedef char TElement;

         static inline void Add(TVector& vector, const TElement& element) { vector += element; }

 };


 //-----------------------------------------------------------------------------

 // T8BitCodec -- a class for converting between 8-bit encodings and Unicode

 //-----------------------------------------------------------------------------


 class TEncoding_ISO8859_1

 {

 public:

         static inline TStr GetName() { return "ISO-8859-1"; }

         static int ToUnicode(int c) { Assert(0 <= c && c <= 255); return c; }

         static int FromUnicode(int c) { if (0 <= c && c <= 255) return c; else return -1; }

 };


 class TEncoding_ISO8859_2 // ISO Latin 2

 {

 public:

         static inline TStr GetName() { return "ISO-8859-2"; }

         static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16];

         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);

                 if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }

         static int FromUnicode(int c) {

                 if (0 <= c && c < 0xa0) return c;

                 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];

                 else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];

                 else return -1; }

 };


 class TEncoding_ISO8859_3

 {

 public:

         static inline TStr GetName() { return "ISO-8859-3"; }

         static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2];

         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);

                 if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }

         static int FromUnicode(int c) {

                 if (0 <= c && c < 0xa0) return c;

                 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];

                 else if (0x2d8 <= c && c < 0x2da) return fromUnicodeTable2[c - 0x2d8];

                 else return -1; }

 };


 class TEncoding_ISO8859_4

 {

 public:

         static inline TStr GetName() { return "ISO-8859-4"; }

         static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16];

         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);

                 if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }

         static int FromUnicode(int c) {

                 if (0 <= c && c < 0xa0) return c;

                 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];

                 else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];

                 else return -1; }

 };


 class TEncoding_YuAscii

 {

 public:

         static const int uniChars[10], yuAsciiChars[10];

         static inline TStr GetName() { return "YU-ASCII"; }

         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);

                 for (int i = 0; i < int(sizeof(yuAsciiChars) / sizeof(yuAsciiChars[0])); i++)

                         if (c == yuAsciiChars[i]) return uniChars[i];

                 return c; }

         static int FromUnicode(int c) {

                 for (int i = 0; i < int(sizeof(uniChars) / sizeof(uniChars[0])); i++)

                         if (c == uniChars[i]) return yuAsciiChars[i];

                         else if(c == yuAsciiChars[i]) return -1;

                 if (0 <= c && c <= 255) return c; else return -1; }

 };


 class TEncoding_CP437 // DOS US

 {

 public:

         static inline TStr GetName() { return "CP437"; }

         static const int toUnicodeTable[8 * 16], fromUnicodeTable1[6 * 16], fromUnicodeTable2[4 * 16], fromUnicodeTable3[6 * 16], fromUnicodeTable4[11 * 16];

         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);

                 if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }

         static int FromUnicode(int c) {

                 if (0 <= c && c < 0x80) return c;

                 else if (0xa0 <= c && c < 0x100) return fromUnicodeTable1[c - 0xa0];

                 else if (0x390 <= c && c < 0x3d0) return fromUnicodeTable2[c - 0x390];

                 else if (0x2210 <= c && c < 0x2270) return fromUnicodeTable3[c - 0x2210];

                 else if (0x2500 <= c && c < 0x25b0) return fromUnicodeTable4[c - 0x2500];

                 else if (c == 0x192) return 0x9f;

                 else if (c == 0x207f) return 0xfc;

                 else if (c == 0x20a7) return 0x9e;

                 else if (c == 0x2310) return 0xa9;

                 else if (c == 0x2320) return 0xf4;

                 else if (c == 0x2321) return 0xf5;

                 else return -1; }

 };


 class TEncoding_CP852 // DOS Latin 2

 {

 public:

         static inline TStr GetName() { return "CP852"; }

         static const int toUnicodeTable[8 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16], fromUnicodeTable3[11 * 16];

         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);

                 if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }

         static int FromUnicode(int c) {

                 if (0 <= c && c < 0x80) return c;

                 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];

                 else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];

                 else if (0x2500 <= c && c < 0x25b0) return fromUnicodeTable3[c - 0x2500];

                 else return -1; }

 };


 class TEncoding_CP1250 // Windows-1250, similar to ISO Latin 2

 {

 public:

         static inline TStr GetName() { return "CP1250"; }

         static const int toUnicodeTable[8 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16], fromUnicodeTable3[3 * 16];

         static int ToUnicode(int c) { Assert(0 <= c && c <= 255);

                 if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }

         static int FromUnicode(int c) {

                 if (0 <= c && c < 0x80) return c;

                 else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];

                 else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];

                 else if (0x2010 <= c && c < 0x2040) return fromUnicodeTable3[c - 0x2010];

                 else if (c == 0x20ac) return 0x80;

                 else if (c == 0x2122) return 0x99;

                 else return -1; }

 };


 template<class TEncoding_>

 class T8BitCodec

 {

 protected:

         typedef TUniVecIdx TVecIdx;

 public:

         typedef TEncoding_ TEncoding;

         TUnicodeErrorHandling errorHandling;

         int replacementChar;


         T8BitCodec() : errorHandling(uehIgnore), replacementChar(TUniCodec::DefaultReplacementChar) { }

         T8BitCodec(TUnicodeErrorHandling errorHandling_, int replacementChar_ = TUniCodec::DefaultReplacementChar) :

                 errorHandling(errorHandling_), replacementChar(replacementChar_) { }

         static TStr GetName() { return TEncoding::GetName(); }


         void Test() const

         {

                 int nDecoded = 0;

                 for (int c = 0; c <= 255; c++) {

                         int cu = TEncoding::ToUnicode(c); if (cu == -1) continue;

                         nDecoded++;

                         IAssert(0 <= cu && cu < 0x110000);

                         int c2 = TEncoding::FromUnicode(cu);

                         IAssert(c2 == c); }

                 int nEncoded = 0;

                 for (int cu = 0; cu < 0x110000; cu++) {

                         int c = TEncoding::FromUnicode(cu); if (c == -1) continue;

                         nEncoded++;

                         IAssert(0 <= c && c <= 255);

                         int cu2 = TEncoding::ToUnicode(c);

                         IAssert(cu2 == cu); }

                 IAssert(nDecoded == nEncoded);

         }


         // Returns the number of characters that have been successfully decoded.

         // This does not include any replacement characters that may have been inserted into 'dest'.

         template<typename TSrcVec, typename TDestCh>

         size_t ToUnicode(

                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,

                 TVec<TDestCh>& dest, const bool clrDest = true) const

         {

                 if (clrDest) dest.Clr();

                 size_t toDo = srcCount;

                 while (toDo-- > 0) {

                         int chSrc = ((int) src[TVecIdx(srcIdx)]) & 0xff; srcIdx++;

                         int chDest = TEncoding::ToUnicode(chSrc);

                         dest.Add(chDest); }

                 return srcCount;

         }

         template<typename TSrcVec, typename TDestCh>

         size_t ToUnicode(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }


         size_t ToUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }

         size_t ToUnicode(const TStr& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }


         // Returns the number of characters that have been successfully encoded.

         // This does not include any replacement characters that may have been inserted into 'dest'.

         template<typename TSrcVec, typename TDestVec>

         size_t FromUnicode(

                 const TSrcVec& src, size_t srcIdx, const size_t srcCount,

                 TDestVec& dest, const bool clrDest = true) const

         {

                 typedef typename TVecElt<TDestVec>::TElement TDestCh;

                 if (clrDest) dest.Clr();

                 size_t toDo = srcCount, nEncoded = 0;

                 while (toDo-- > 0) {

                         int chSrc = (int) src[TVecIdx(srcIdx)]; srcIdx++;

                         int chDest = TEncoding::FromUnicode(chSrc);

                         if (chDest < 0) {

                                 switch (errorHandling) {

                                 case uehThrow: throw TUnicodeException(srcIdx - 1, chSrc, "Invalid character for encoding into " + GetName() + ".");

                                 case uehAbort: return nEncoded;

                                 case uehReplace: TVecElt<TDestVec>::Add(dest, TDestCh(replacementChar)); continue;

                                 case uehIgnore: continue;

                                 default: Fail; } }

                         TVecElt<TDestVec>::Add(dest, TDestCh(chDest)); nEncoded++; }

                 return nEncoded;

         }


         template<typename TSrcVec, typename TDestVec>

         size_t FromUnicode(const TSrcVec& src, TDestVec& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }


         size_t UniToStr(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const {

                 TChA buf; size_t retVal = FromUnicode(src, srcIdx, srcCount, buf, false);

                 if (clrDest) dest += buf.CStr(); else dest = buf.CStr();

                 return retVal; }

         size_t UniToStr(const TIntV& src, TStr& dest, const bool clrDest = true) const { return UniToStr(src, 0, src.Len(), dest, clrDest); }

 };


 typedef T8BitCodec<TEncoding_ISO8859_1> TCodec_ISO8859_1;

 typedef T8BitCodec<TEncoding_ISO8859_2> TCodec_ISO8859_2;

 typedef T8BitCodec<TEncoding_ISO8859_3> TCodec_ISO8859_3;

 typedef T8BitCodec<TEncoding_ISO8859_4> TCodec_ISO8859_4;

 typedef T8BitCodec<TEncoding_CP852> TCodec_CP852;

 typedef T8BitCodec<TEncoding_CP437> TCodec_CP437;

 typedef T8BitCodec<TEncoding_CP1250> TCodec_CP1250;

 typedef T8BitCodec<TEncoding_YuAscii> TCodec_YuAscii;


 //-----------------------------------------------------------------------------

 // Various declarations used by the Unicode Character Database

 //-----------------------------------------------------------------------------


 typedef enum TUniChCategory_

 {

 #define DefineUniCat(cat, c) uc ## cat = (int(uchar(c)) & 0xff)

         DefineUniCat(Letter, 'L'),             // ucLetter

         DefineUniCat(Mark, 'M'),

         DefineUniCat(Number, 'N'),

         DefineUniCat(Punctuation, 'P'),

         DefineUniCat(Symbol, 'S'),

         DefineUniCat(Separator, 'Z'),

         DefineUniCat(Other, 'C')

 #undef DefineUniCat

 }

 TUniChCategory;


 typedef enum TUniChSubCategory_

 {

 #define DefineUniSubCat(cat, subCat, c) uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff)

         DefineUniSubCat(Letter, Uppercase, 'u'),            // ucLetterUppercase

         DefineUniSubCat(Letter, Lowercase, 'l'),

         DefineUniSubCat(Letter, Titlecase, 't'),

         DefineUniSubCat(Letter, Modifier, 'm'),

         DefineUniSubCat(Letter, Other, 'o'),

         DefineUniSubCat(Mark, Nonspacing, 'n'),

         DefineUniSubCat(Mark, SpacingCombining, 'c'),

         DefineUniSubCat(Mark, Enclosing, 'e'),

         DefineUniSubCat(Number, DecimalDigit, 'd'),

         DefineUniSubCat(Number, Letter, 'l'),

         DefineUniSubCat(Number, Other, 'o'),

         DefineUniSubCat(Punctuation, Connector, 'c'),

         DefineUniSubCat(Punctuation, Dash, 'd'),

         DefineUniSubCat(Punctuation, Open, 's'),

         DefineUniSubCat(Punctuation, Close, 'e'),

         DefineUniSubCat(Punctuation, InitialQuote, 'i'),

         DefineUniSubCat(Punctuation, FinalQuote, 'f'),

         DefineUniSubCat(Punctuation, Other, 'o'),

         DefineUniSubCat(Symbol, Math, 'm'),

         DefineUniSubCat(Symbol, Currency, 'c'),

         DefineUniSubCat(Symbol, Modifier, 'k'),

         DefineUniSubCat(Symbol, Other, 'o'),

         DefineUniSubCat(Separator, Space, 's'),

         DefineUniSubCat(Separator, Line, 'l'),

         DefineUniSubCat(Separator, Paragraph, 'p'),

         DefineUniSubCat(Other, Control, 'c'),

         DefineUniSubCat(Other, Format, 'f'),

         DefineUniSubCat(Other, Surrogate, 's'),

         DefineUniSubCat(Other, PrivateUse, 'o'),

         DefineUniSubCat(Other, NotAssigned, 'n')

 }

 TUniChSubCategory;


 typedef enum TUniChFlags_

 {

         ucfCompatibilityDecomposition = 1, // if this flag is not set, the decomposition is canonical

         ucfCompositionExclusion = 1 << 1,       // from CompositionExclusions.txt

         // Flags used when searching for word boundaries.  See UAX #29.

         ucfWbFormat = 1 << 2,

         ucfWbKatakana = 1 << 3,

         ucfWbALetter = 1 << 4,

         ucfWbMidLetter = 1 << 5,

         ucfWbMidNum = 1 << 6,

         ucfWbNumeric = 1 << 7,

         ucfWbExtendNumLet = 1 << 8,

         // Flags used with sentence boundaries (Sep is also used with word boundaries).  See UAX #29.

         ucfSbSep = 1 << 9,

         ucfSbFormat = 1 << 10,

         ucfSbSp = 1 << 11,

         ucfSbLower = 1 << 12,

         ucfSbUpper = 1 << 13,

         ucfSbOLetter = 1 << 14,

         ucfSbNumeric = 1 << 15,

         ucfSbATerm = 1 << 16,

         ucfSbSTerm = 1 << 17,

         ucfSbClose = 1 << 18,

         ucfSbMask = ucfSbSep | ucfSbFormat | ucfSbSp | ucfSbLower | ucfSbUpper | ucfSbOLetter | ucfSbNumeric | ucfSbATerm | ucfSbSTerm | ucfSbClose,

         ucfWbMask = ucfWbFormat | ucfWbKatakana | ucfWbALetter | ucfWbMidLetter | ucfWbMidNum | ucfWbNumeric | ucfWbExtendNumLet | ucfSbSep,

         // Flags from DerivedCoreProperties.txt.

         // [The comments are from UCD.html.]

         // - Characters with the Alphabetic property. For more information, see Chapter 4 in [Unicode].

         //   Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl

         ucfDcpAlphabetic = 1 << 19,

         // - For programmatic determination of default-ignorable code points.

         //   New characters that should be ignored in processing (unless explicitly supported)

         //   will be assigned in these ranges, permitting programs to correctly handle the default

         //   behavior of such characters when not otherwise supported.  For more information, see

         //   UAX #29: Text Boundaries [Breaks].

         //   Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs + Noncharacters - White_Space - annotation characters

         //   [Examples: soft hyphen, zero-width space, noncharacters (e.g. U+fffe, U+ffff, U+1fffe, U+1ffff, etc.), surrogates, language tags, variation selectors]

         ucfDcpDefaultIgnorableCodePoint = 1 << 20,

         // - Characters with the Lowercase property.  For more information, see Chapter 4 in [Unicode].

         //   Generated from: Other_Lowercase + Ll

         ucfDcpLowercase = 1 << 21,

         // - For programmatic determination of grapheme cluster boundaries.

         //   For more information, see UAX #29: Text Boundaries [Breaks].

         //   Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend

         ucfDcpGraphemeBase = 1 << 22,

         // - For programmatic determination of grapheme cluster boundaries.

         //   For more information, see UAX #29: Text Boundaries [Breaks].

         //   Generated from: Other_Grapheme_Extend + Me + Mn

         //   Note: depending on an application's interpretation of Co (private use), they may be either

         //         in Grapheme_Base, or in Grapheme_Extend, or in neither.

         ucfDcpGraphemeExtend = 1 << 23,

         // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.

         ucfDcpIdStart = 1 << 24,

         ucfDcpIdContinue = 1 << 25,

         // - Characters with the Math property. For more information, see Chapter 4 in [Unicode].

         //   Generated from: Sm + Other_Math

         ucfDcpMath = 1 << 26,

         // - Characters with the Uppercase property. For more information, see Chapter 4 in [Unicode].

         //   Generated from: Lu + Other_Uppercase

         ucfDcpUppercase = 1 << 27,

         // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.

         ucfDcpXidStart = 1 << 28,

         ucfDcpXidContinue = 1 << 29,

         ucfDcpMask = ucfDcpAlphabetic | ucfDcpDefaultIgnorableCodePoint | ucfDcpLowercase | ucfDcpGraphemeBase | ucfDcpGraphemeExtend |

                 ucfDcpIdStart | ucfDcpIdContinue | ucfDcpMath | ucfDcpUppercase | ucfDcpXidStart | ucfDcpXidContinue,

 }

 TUniChFlags;


 typedef enum TUniChProperties_

 {

         // The flags from PropList.txt.

         // [The comments are from UCD.html.]

         // - ASCII characters commonly used for the representation of hexadecimal numbers.

         //   [= 0123456789abcdefABCDEF]

         ucfPrAsciiHexDigit = 1,

         // - Those format control characters which have specific functions in the Bidirectional Algorithm.

         ucfPrBidiControl = 2,

         // - Those punctuation characters explicitly called out as dashes in the Unicode Standard,

         //   plus compatibility equivalents to those. Most of these have the Pd General Category,

         //   but some have the Sm General Category because of their use in mathematics.

         //     U+0002d  HYPHEN-MINUS

         //     U+0058a  ARMENIAN HYPHEN

         //     U+005be  HEBREW PUNCTUATION MAQAF

         //     U+01806  MONGOLIAN TODO SOFT HYPHEN

         //     U+02010  HYPHEN

         //     U+02011  NON-BREAKING HYPHEN

         //     U+02012  FIGURE DASH

         //     U+02013  EN DASH

         //     U+02014  EM DASH

         //     U+02015  HORIZONTAL BAR

         //     U+02053  SWUNG DASH

         //     U+0207b  SUPERSCRIPT MINUS

         //     U+0208b  SUBSCRIPT MINUS

         //     U+02212  MINUS SIGN

         //     U+02e17  DOUBLE OBLIQUE HYPHEN

         //     U+0301c  WAVE DASH

         //     U+03030  WAVY DASH

         //     U+030a0  KATAKANA-HIRAGANA DOUBLE HYPHEN

         //     U+0fe31  PRESENTATION FORM FOR VERTICAL EM DASH

         //     U+0fe32  PRESENTATION FORM FOR VERTICAL EN DASH

         //     U+0fe58  SMALL EM DASH

         //     U+0fe63  SMALL HYPHEN-MINUS

         //     U+0ff0d  FULLWIDTH HYPHEN-MINUS

         ucfPrDash = 4,

         // - For a machine-readable list of deprecated characters.  No characters will ever be removed

         //   from the standard, but the usage of deprecated characters is strongly discouraged.

         ucfPrDeprecated = 8,

         // - Characters that linguistically modify the meaning of another character to which they apply.

         //   Some diacritics are not combining characters, and some combining characters are not diacritics.

         ucfPrDiacritic = 0x10,

         // - Characters whose principal function is to extend the value or shape of a preceding alphabetic

         //   character.  Typical of these are length and iteration marks.

         ucfPrExtender = 0x20,

         // - Used in determining default grapheme cluster boundaries.  For more information, see UAX #29: Text Boundaries.

         ucfPrGraphemeLink = 0x40,

         // - Characters commonly used for the representation of hexadecimal numbers, plus their compatibility equivalents.

         //   [= AsciiHexDigit + fullwidth digit {0..9} + fullwidth latin {small|capital} letter {a..f}]

         ucfPrHexDigit = 0x80,

         // - Those dashes used to mark connections between pieces of words, plus the Katakana middle dot.

         //   The Katakana middle dot functions like a hyphen, but is shaped like a dot rather than a dash.

         //     U+0002d  HYPHEN-MINUS

         //     U+000ad  SOFT HYPHEN

         //     U+0058a  ARMENIAN HYPHEN

         //     U+01806  MONGOLIAN TODO SOFT HYPHEN

         //     U+02010  HYPHEN

         //     U+02011  NON-BREAKING HYPHEN

         //     U+02e17  DOUBLE OBLIQUE HYPHEN

         //     U+030fb  KATAKANA MIDDLE DOT

         //     U+0fe63  SMALL HYPHEN-MINUS

         //     U+0ff0d  FULLWIDTH HYPHEN-MINUS

         //     U+0ff65  HALFWIDTH KATAKANA MIDDLE DOT

         ucfPrHyphen = 0x100,

         // - Characters considered to be CJKV (Chinese, Japanese, Korean, and Vietnamese) ideographs.

         ucfPrIdeographic = 0x200,

         // - Those format control characters which have specific functions for control of cursive joining and ligation.

         ucfPrJoinControl = 0x400,

         // - There are a small number of characters that do not use logical order.

         //   These characters require special handling in most processing.

         ucfPrLogicalOrderException = 0x800,

         // - Code points that are permanently reserved for internal use.

         ucfPrNoncharacterCodePoint = 0x1000,

         // - Used for pattern syntax as described in UAX #31: Identifier and Pattern Syntax.

         ucfPrPatternSyntax = 0x2000,

         ucfPrPatternWhiteSpace = 0x4000,

         // - Those punctuation characters that function as quotation marks.

         //     U+00022  QUOTATION MARK

         //     U+00027  APOSTROPHE

         //     U+000ab  LEFT-POINTING DOUBLE ANGLE QUOTATION MARK

         //     U+000bb  RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK

         //     U+02018  LEFT SINGLE QUOTATION MARK

         //     U+02019  RIGHT SINGLE QUOTATION MARK

         //     U+0201a  SINGLE LOW-9 QUOTATION MARK

         //     U+0201b  SINGLE HIGH-REVERSED-9 QUOTATION MARK

         //     U+0201c  LEFT DOUBLE QUOTATION MARK

         //     U+0201d  RIGHT DOUBLE QUOTATION MARK

         //     U+0201e  DOUBLE LOW-9 QUOTATION MARK

         //     U+0201f  DOUBLE HIGH-REVERSED-9 QUOTATION MARK

         //     U+02039  SINGLE LEFT-POINTING ANGLE QUOTATION MARK

         //     U+0203a  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK

         //     U+0300c  LEFT CORNER BRACKET

         //     U+0300d  RIGHT CORNER BRACKET

         //     U+0300e  LEFT WHITE CORNER BRACKET

         //     U+0300f  RIGHT WHITE CORNER BRACKET

         //     U+0301d  REVERSED DOUBLE PRIME QUOTATION MARK

         //     U+0301e  DOUBLE PRIME QUOTATION MARK

         //     U+0301f  LOW DOUBLE PRIME QUOTATION MARK

         //     U+0fe41  PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET

         //     U+0fe42  PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET

         //     U+0fe43  PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET

         //     U+0fe44  PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET

         //     U+0ff02  FULLWIDTH QUOTATION MARK

         //     U+0ff07  FULLWIDTH APOSTROPHE

         //     U+0ff62  HALFWIDTH LEFT CORNER BRACKET

         //     U+0ff63  HALFWIDTH RIGHT CORNER BRACKET

         ucfPrQuotationMark = 0x8000,

         // - Characters with a "soft dot", like i or j. An accent placed on these characters causes the dot to disappear.

         //   An explicit _dot above_ can be added where required, such as in Lithuanian.

         ucfPrSoftDotted = 0x10000,

         // - Sentence Terminal. Used in UAX #29: Text Boundaries.

         //     U+00021  EXCLAMATION MARK

         //     U+0002e  FULL STOP

         //     U+0003f  QUESTION MARK

         //     U+0203c  DOUBLE EXCLAMATION MARK

         //     U+0203d  INTERROBANG

         //     U+02047  DOUBLE QUESTION MARK

         //     U+02048  QUESTION EXCLAMATION MARK

         //     U+02049  EXCLAMATION QUESTION MARK

         //     U+03002  IDEOGRAPHIC FULL STOP

         //     [plus many characters from other writing systems]

         ucfPrSTerm = 0x20000,

         // - Those punctuation characters that generally mark the end of textual units.

         //   [JB note: this set contains more character than STerm.  For example, it contains

         //   the comma, colon and semicolon, whereas STerm doesn't.]

         //     U+00021  EXCLAMATION MARK

         //     U+0002c  COMMA

         //     U+0002e  FULL STOP

         //     U+0003a  COLON

         //     U+0003b  SEMICOLON

         //     U+0003f  QUESTION MARK

         //     U+0203c  DOUBLE EXCLAMATION MARK

         //     U+0203d  INTERROBANG

         //     U+02047  DOUBLE QUESTION MARK

         //     U+02048  QUESTION EXCLAMATION MARK

         //     U+02049  EXCLAMATION QUESTION MARK

         //     [plus *lots* of charcters from other writing systems]

         ucfPrTerminalPunctuation = 0x40000,

         // - Indicates all those characters that qualify as Variation Selectors.

         //   For details on the behavior of these characters, see StandardizedVariants.html and

         //   Section 16.4, Variation Selectors in [Unicode].

         ucfPrVariationSelector = 0x80000,

         // - Those separator characters and control characters which should be treated by

         //   programming languages as "white space" for the purpose of parsing elements.

         //   Note: ZERO WIDTH SPACE and ZERO WIDTH NO-BREAK SPACE are not included,

         //         since their functions are restricted to line-break control.

         //         Their names are unfortunately misleading in this respect.

         //   Note: There are other senses of "whitespace" that encompass a different set of characters.

         //         [JB note: e.g. there's a BIDI class for whitespace ('WS') in UnicodeData.txt.

         //         There's also a "Sp" class in the sentence boundary algorithm, see UAX #29, sec. 5.1.]

         //   This includes the following characters:

         //     U+0009  <control>

         //     U+000a  <control>

         //     U+000b  <control>

         //     U+000c  <control>

         //     U+000d  <control>

         //     U+0020  SPACE

         //     U+0085  <control>

         //     U+00a0  NO-BREAK SPACE

         //     U+1680  OGHAM SPACE MARK

         //     U+180e  MONGOLIAN VOWEL SEPARATOR

         //     U+2000  EN QUAD

         //     U+2001  EM QUAD

         //     U+2002  EN SPACE

         //     U+2003  EM SPACE

         //     U+2004  THREE-PER-EM SPACE

         //     U+2005  FOUR-PER-EM SPACE

         //     U+2006  SIX-PER-EM SPACE

         //     U+2007  FIGURE SPACE

         //     U+2008  PUNCTUATION SPACE

         //     U+2009  THIN SPACE

         //     U+200a  HAIR SPACE

         //     U+2028  LINE SEPARATOR

         //     U+2029  PARAGRAPH SEPARATOR

         //     U+202f  NARROW NO-BREAK SPACE

         //     U+205f  MEDIUM MATHEMATICAL SPACE

         //     U+3000  IDEOGRAPHIC SPACE

         ucfPrWhiteSpace = 0x100000

 }

 TUniChProperties;


 typedef enum TUniChPropertiesX_

 {

         // More properties from PropList.txt.

         // - Used to derive the properties in DerivedCoreProperties.txt.

         ucfPxOtherAlphabetic = 1,

         ucfPxOtherDefaultIgnorableCodePoint = 2,

         ucfPxOtherGraphemeExtend = 4,

         ucfPxOtherIdContinue = 8,

         ucfPxOtherIdStart = 0x10,

         ucfPxOtherLowercase = 0x20,

         ucfPxOtherMath = 0x40,

         ucfPxOtherUppercase = 0x80,

         // - Used in ideographic description sequences.

         ucfPxIdsBinaryOperator = 0x100,

         ucfPxIdsTrinaryOperator = 0x200,

         ucfPxRadical = 0x400,

         ucfPxUnifiedIdeograph = 0x800

 }

 TUniChPropertiesX;


 //-----------------------------------------------------------------------------

 // TUniChInfo -- contains information about a single Unicode codepoint

 //-----------------------------------------------------------------------------


 class TUniChInfo

 {

 public:

         enum { // combining classes (for 'combClass'); from UnicodeData.txt

                 ccStarter = 0, // 0: Spacing, split, enclosing, reordrant, and Tibetan subjoined

                 ccOverlaysAndInterior = 1,

                 ccNuktas = 7,

                 ccHiraganaKatakanaVoicingMarks = 8,

                 ccViramas = 9,

                 ccFixedPositionStart = 10, // Start of fixed position classes

                 ccFixedPositionEnd = 199, // End of fixed position classes

                 ccBelowLeftAttached = 200,

                 ccBelowAttached = 202,

                 ccBelowRightAttached = 204,

                 ccLeftAttached = 208, // Left attached (reordrant around single base character)

                 ccRightAttached = 210,

                 ccAboveLeftAttached = 212,

                 ccAboveAttached = 214,

                 ccAboveRightAttached = 216,

                 ccBelowLeft = 218,

                 ccBelow = 220,

                 ccBelowRight = 222,

                 ccLeft = 224, // Left (reordrant around single base character)

                 ccRight = 226,

                 ccAboveLeft = 228,

                 ccAbove = 230,

                 ccAboveRight = 232,

                 ccDoubleBelow = 233,

                 ccDoubleAbove = 234,

                 ccBelowIotaSubscript = 240, // Below (iota subscript)

                 ccInvalid = 255 // not defined by Unicode

         };

         char chCat, chSubCat; // chCat + chSubCat together comprise the general category (from UnicodeData.txt)

         uchar combClass; // canonical combining class

         TUniChCategory cat; // = TUniChCategory(chCat)

         TUniChSubCategory subCat; // = TUniChSubCategory(cat << 8 | subCat)

         signed char script; // keyId into 'TUniChDb.scriptNames'; -1 if unknown

         int simpleUpperCaseMapping, simpleLowerCaseMapping, simpleTitleCaseMapping; // from UnicodeData.txt

         int decompOffset; // offset into 'TUniChDb.decompositions'; or -1 if the character doesn't change during decomposition

         int nameOffset; // offset into 'TUniChDb.charNames'

         int flags; // a combination of TUniChFlags

         int properties; // a combination of TUniChProperties

         int propertiesX; // a combination of TUniChPropertiesX

         ushort lineBreak; // from LineBreak.txt


         // Converts a 2-letter linebreak code into a 16-bit integer.

         static inline ushort GetLineBreakCode(char c1, char c2) { return ((static_cast<ushort>(static_cast<uchar>(c1)) & 0xff) << 8) | ((static_cast<ushort>(static_cast<uchar>(c2)) & 0xff)); }

         static const ushort LineBreak_Unknown, LineBreak_ComplexContext, LineBreak_Numeric, LineBreak_InfixNumeric, LineBreak_Quotation;


 public:

         void InitAfterLoad() {

                 cat = (TUniChCategory) chCat;

                 subCat = (TUniChSubCategory) (((static_cast<int>(static_cast<uchar>(chCat)) & 0xff) << 8) | (static_cast<int>(static_cast<uchar>(chSubCat)) & 0xff)); }

         void SetCatAndSubCat(const TUniChSubCategory catAndSubCat) {

                 cat = (TUniChCategory) ((int(catAndSubCat) >> 8) & 0xff);

                 subCat = catAndSubCat;

                 chCat = (char) cat; chSubCat = (char) (int(subCat) & 0xff); }

         friend class TUniChDb;


         // Inexplicably missing from TSIn/TSOut...

         static inline void LoadUShort(TSIn& SIn, ushort& u) { SIn.LoadBf(&u, sizeof(u)); }

         static inline void LoadSChar(TSIn& SIn, signed char& u) { SIn.LoadBf(&u, sizeof(u)); }

         static inline void SaveUShort(TSOut& SOut, ushort u) { SOut.SaveBf(&u, sizeof(u)); }

         static inline void SaveSChar(TSOut& SOut, signed char u) { SOut.SaveBf(&u, sizeof(u)); }


 public:

         void Save(TSOut& SOut) const {

                 SOut.Save(chCat); SOut.Save(chSubCat); SOut.Save(combClass); SaveSChar(SOut, script);

                 SOut.Save(simpleUpperCaseMapping); SOut.Save(simpleLowerCaseMapping); SOut.Save(simpleTitleCaseMapping);

                 SOut.Save(decompOffset); SOut.Save(nameOffset);

                 SOut.Save(flags); SOut.Save(properties); SOut.Save(propertiesX); SaveUShort(SOut, lineBreak); }

         void Load(TSIn& SIn) {

                 SIn.Load(chCat); SIn.Load(chSubCat); SIn.Load(combClass); LoadSChar(SIn, script);

                 SIn.Load(simpleUpperCaseMapping); SIn.Load(simpleLowerCaseMapping); SIn.Load(simpleTitleCaseMapping);

                 SIn.Load(decompOffset); SIn.Load(nameOffset);

                 SIn.Load(flags); SIn.Load(properties); SIn.Load(propertiesX); LoadUShort(SIn, lineBreak); InitAfterLoad(); }

         explicit TUniChInfo(TSIn& SIn) { Load(SIn); }

         TUniChInfo() : chCat(char(ucOther)), chSubCat(char(ucOtherNotAssigned & 0xff)), combClass(ccInvalid),

                 script(-1),simpleUpperCaseMapping(-1), simpleLowerCaseMapping(-1), simpleTitleCaseMapping(-1),

                 decompOffset(-1), nameOffset(-1), flags(0), properties(0), propertiesX(0), lineBreak(LineBreak_Unknown) {

                 InitAfterLoad(); }


         // DerivedCoreProperties flags.

         bool IsDcpFlag(const TUniChFlags flag) const { Assert((flag & ucfDcpMask) == flag); return (flags & flag) == flag; }

         void ClrDcpFlags() { flags = flags & ~ucfDcpMask; }

         void SetDcpFlag(const TUniChFlags flag) { Assert((flag & ucfDcpMask) == flag); flags |= flag; }

         bool IsAlphabetic() const { return IsDcpFlag(ucfDcpAlphabetic); }

         bool IsUppercase() const { return IsDcpFlag(ucfDcpUppercase); }

         bool IsLowercase() const { return IsDcpFlag(ucfDcpLowercase); }

         bool IsMath() const { return IsDcpFlag(ucfDcpMath); }

         bool IsDefaultIgnorable() const { return IsDcpFlag(ucfDcpDefaultIgnorableCodePoint); }

         bool IsGraphemeBase() const { return IsDcpFlag(ucfDcpGraphemeBase); }

         bool IsGraphemeExtend() const { return IsDcpFlag(ucfDcpGraphemeExtend); }

         bool IsIdStart() const { return IsDcpFlag(ucfDcpIdStart); }

         bool IsIdContinue() const { return IsDcpFlag(ucfDcpIdContinue); }

         bool IsXidStart() const { return IsDcpFlag(ucfDcpXidStart); }

         bool IsXidContinue() const { return IsDcpFlag(ucfDcpXidContinue); }


         // PropList.txt flags.

         bool IsProperty(const TUniChProperties flag) const { return (properties & flag) == flag; }

         void SetProperty(const TUniChProperties flag) { properties |= flag; }

         bool IsAsciiHexDigit() const { return IsProperty(ucfPrAsciiHexDigit); }

         bool IsBidiControl() const { return IsProperty(ucfPrBidiControl); }

         bool IsDash() const { return IsProperty(ucfPrDash); }

         bool IsDeprecated() const { return IsProperty(ucfPrDeprecated); }

         bool IsDiacritic() const { return IsProperty(ucfPrDiacritic); }

         bool IsExtender() const { return IsProperty(ucfPrExtender); }

         bool IsGraphemeLink() const { return IsProperty(ucfPrGraphemeLink); }

         bool IsHexDigit() const { return IsProperty(ucfPrHexDigit); }

         bool IsHyphen() const { return IsProperty(ucfPrHyphen); }

         bool IsIdeographic() const { return IsProperty(ucfPrIdeographic); }

         bool IsJoinControl() const { return IsProperty(ucfPrJoinControl); }

         bool IsLogicalOrderException() const { return IsProperty(ucfPrLogicalOrderException); }

         bool IsNoncharacter() const { return IsProperty(ucfPrNoncharacterCodePoint); }

         bool IsQuotationMark() const { return IsProperty(ucfPrQuotationMark); }

         bool IsSoftDotted() const { return IsProperty(ucfPrSoftDotted); }

         bool IsSTerminal() const { return IsProperty(ucfPrSTerm); }

         bool IsTerminalPunctuation() const { return IsProperty(ucfPrTerminalPunctuation); }

         bool IsVariationSelector() const { return IsProperty(ucfPrVariationSelector); }

         bool IsWhiteSpace() const { return IsProperty(ucfPrWhiteSpace); }


         // Additional PropList.txt flags.

         bool IsPropertyX(const TUniChPropertiesX flag) const { return (propertiesX & flag) == flag; }

         void SetPropertyX(const TUniChPropertiesX flag) { propertiesX |= flag; }


         // Miscellaneous flags.

         bool IsCompositionExclusion() const { return (flags & ucfCompositionExclusion) == ucfCompositionExclusion; }

         bool IsCompatibilityDecomposition() const { return (flags & ucfCompatibilityDecomposition) == ucfCompatibilityDecomposition; }


         // Word-boundary flags.

         bool IsWbFlag(const TUniChFlags flag) const { Assert((flag & ucfWbMask) == flag); return (flags & flag) == flag; }

         void ClrWbAndSbFlags() { flags = flags & ~(ucfWbMask | ucfSbMask); }

         void SetWbFlag(const TUniChFlags flag) { Assert((flag & ucfWbMask) == flag); flags |= flag; }

         int GetWbFlags() const { return flags & ucfWbMask; }

         bool IsWbFormat() const { return IsWbFlag(ucfWbFormat); }

         TStr GetWbFlagsStr() const { return GetWbFlagsStr(GetWbFlags()); }

         static TStr GetWbFlagsStr(const int flags) { return TStr("") + (flags & ucfWbALetter ? "A" : "") +

                 (flags & ucfWbFormat ? "F" : "") + (flags & ucfWbKatakana ? "K" : "") + (flags & ucfWbMidLetter ? "M" : "") +

                 (flags & ucfWbMidNum ? "m" : "") + (flags & ucfWbNumeric ? "N" : "") + (flags & ucfWbExtendNumLet ? "E" : ""); }


         // Sentence-boundary flags.

         bool IsSbFlag(const TUniChFlags flag) const { Assert((flag & ucfSbMask) == flag); return (flags & flag) == flag; }

         void SetSbFlag(const TUniChFlags flag) { Assert((flag & ucfSbMask) == flag); flags |= flag; }

         int GetSbFlags() const { return flags & ucfSbMask; }

         bool IsSbFormat() const { return IsSbFlag(ucfSbFormat); }

         TStr GetSbFlagsStr() const { return GetSbFlagsStr(GetSbFlags()); }

         static TStr GetSbFlagsStr(const int flags) { return TStr("") + (flags & ucfSbSep ? "S" : "") +

                 (flags & ucfSbFormat ? "F" : "") + (flags & ucfSbSp ? "_" : "") + (flags & ucfSbLower ? "L" : "") +

                 (flags & ucfSbUpper ? "U" : "") + (flags & ucfSbOLetter ? "O" : "") + (flags & ucfSbNumeric ? "N" : "") +

                 (flags & ucfSbATerm ? "A" : "") + (flags & ucfSbSTerm ? "T" : "") + (flags & ucfSbClose ? "C" : ""); }


         bool IsSbSep() const { return (flags & ucfSbSep) == ucfSbSep; }


         // Grapheme-boundary flags.

         bool IsGbExtend() const { return IsGraphemeExtend(); }


         // Sec. 3.13, D47: C is cased iff it is uppercase, lowercase, or general_category == titlecase_letter.

         bool IsCased() const { return IsUppercase() || IsLowercase() || (subCat == ucLetterTitlecase); }


         // Character categories.

         TUniChCategory GetCat() const { return (TUniChCategory) cat; }

         TUniChSubCategory GetSubCat() const { return (TUniChSubCategory) subCat; }

         // The following characters belong to the 'symbol/currency' subcategory:

         //     U+00024  DOLLAR SIGN

         //     U+000a2  CENT SIGN

         //     U+000a3  POUND SIGN

         //     U+000a4  CURRENCY SIGN

         //     U+000a5  YEN SIGN

         //     U+020a3  FRENCH FRANC SIGN

         //     U+020a4  LIRA SIGN

         //     U+020ac  EURO SIGN

         //     [and plenty of others]

         bool IsCurrency() const { return subCat == ucSymbolCurrency; }

         // Note: most private-use and surrogate characters aren't listed explicitly in UnicodeData.txt.

         // Thus, it's better to call TUniChDb's versions of these methods, which are aware of

         // the full ranges of private-use and surrogate characters.

         bool IsPrivateUse() const { return subCat == ucOtherPrivateUse; }

         bool IsSurrogate() const { return subCat == ucOtherSurrogate; }


         inline static bool IsValidSubCat(const char chCat, const char chSubCat) {

                 static const char s[] = "LuLlLtLmLoMnMcMeNdNlNoPcPdPsPePiPfPoSmScSkSoZsZlZpCcCfCsCoCn";

                 for (const char *p = s; *p; p += 2)

                         if (chCat == p[0] && chSubCat == p[1]) return true;

                 return false; }

 };


 //-----------------------------------------------------------------------------

 // TUniTrie -- a trie for suffixes that should not appear at the end

 // of a sentence

 //-----------------------------------------------------------------------------


 template<typename TItem_>

 class TUniTrie

 {

 public:

         typedef TItem_ TItem;

 protected:

         class TNode {

         public:

                 TItem item;

                 int child, sib;

                 bool terminal;

                 TNode() : child(-1), sib(-1), terminal(false) { }

                 TNode(const TItem& item_, const int child_, const int sib_, const bool terminal_) : item(item_), child(child_), sib(sib_), terminal(terminal_) { }

         };

         typedef TVec<TNode> TNodeV;

         typedef TPair<TItem, TItem> TItemPr;

         typedef TTriple<TItem, TItem, TItem> TItemTr;

         typedef TUniVecIdx TVecIdx;

         THash<TItem, TVoid> singles; //

         THash<TItemPr, TVoid> pairs;

         THash<TItemTr, TInt> roots;

         TNodeV nodes;

 public:

         TUniTrie() { }

         void Clr() { singles.Clr(); pairs.Clr(); roots.Clr(); nodes.Clr(); }


         bool Empty() const { return singles.Empty() && pairs.Empty() && roots.Empty(); }


         bool Has1Gram(const TItem& item) const { return singles.IsKey(item); }

         bool Has2Gram(const TItem& last, const TItem& butLast) const { return pairs.IsKey(TItemPr(last, butLast)); }

         int Get3GramRoot(const TItem& last, const TItem& butLast, const TItem& butButLast) const {

                 int keyId = roots.GetKeyId(TItemTr(last, butLast, butButLast));

                 if (keyId < 0) return 0; else return roots[keyId]; }

         int GetChild(const int parentIdx, const TItem& item) const {

                 for (int childIdx = nodes[parentIdx].child; childIdx >= 0; ) {

                         const TNode &node = nodes[childIdx];

                         if (node.item == item) return childIdx;

                         childIdx = node.sib; }

                 return -1; }

         bool IsNodeTerminal(const int nodeIdx) const { return nodes[nodeIdx].terminal; }


         // Adds a new string to the trie.  Note that the last characters appear

         // closer to the root of the trie.

         template<typename TSrcVec>

         void Add(const TSrcVec& src, const size_t srcIdx, const size_t srcCount)

         {

                 IAssert(srcCount > 0);

                 if (srcCount == 1) { singles.AddKey(TItem(src[TVecIdx(srcIdx)])); return; }

                 if (srcCount == 2) { pairs.AddKey(TItemPr(TItem(src[TVecIdx(srcIdx + 1)]), TItem(src[TVecIdx(srcIdx)]))); return; }

                 size_t srcLast = srcIdx + (srcCount - 1);

                 TItemTr tr = TItemTr(TItem(src[TVecIdx(srcLast)]), TItem(src[TVecIdx(srcLast - 1)]), TItem(src[TVecIdx(srcLast - 2)]));

                 int keyId = roots.GetKeyId(tr), curNodeIdx = -1;

                 if (keyId >= 0) curNodeIdx = roots[keyId];

                 else { curNodeIdx = nodes.Add(TNode(TItem(0), -1, -1, false)); roots.AddDat(tr, curNodeIdx); }

                 //

                 if (srcCount > 3) for (size_t srcPos = srcLast - 3; ; )

                 {

                         const TItem curItem = src[TVecIdx(srcPos)];

                         int childNodeIdx = nodes[curNodeIdx].child;

                         while (childNodeIdx >= 0) {

                                 TNode &childNode = nodes[childNodeIdx];

                                 if (childNode.item == curItem) break;

                                 childNodeIdx = childNode.sib; }

                         if (childNodeIdx < 0) {

                                 childNodeIdx = nodes.Add(TNode(curItem, -1, nodes[curNodeIdx].child, false));

                                 nodes[curNodeIdx].child = childNodeIdx; }

                         curNodeIdx = childNodeIdx;

                         if (srcPos == srcIdx) break; else srcPos--;

                 }

                 nodes[curNodeIdx].terminal = true;

         }


         template<typename TSrcVec>

         void Add(const TSrcVec& src) { Add(src, 0, (size_t) src.Len()); }

 };


 //-----------------------------------------------------------------------------

 // TUniChDb -- provides access to the Unicode Character Database

 //-----------------------------------------------------------------------------


 class TUniChDb

 {

 protected:

         void InitAfterLoad();

         typedef TUniVecIdx TVecIdx;


 public:

         THash<TInt, TUniChInfo> h; // key: codepoint

         TStrPool charNames;

         TStrIntH scripts; // keyID = used in TUniChInfo.script; key = script name; dat = number of characters (informative only)

         TIntV decompositions;

         THash<TIntPr, TInt> inverseDec;

         TUniCaseFolding caseFolding;

         // These hash tables contain only the unconditional mappings from SpecialCasing.txt.

         // The conditional mappings are hardcoded into GetCaseConverted().

         TIntIntVH specialCasingLower, specialCasingUpper, specialCasingTitle;

         int scriptUnknown; // = scripts.GetKey("Unknown")


         TUniChDb() : scriptUnknown(-1) { }

         explicit TUniChDb(TSIn& SIn) { Load(SIn); }

         void Clr() {

                 h.Clr(); charNames.Clr(); decompositions.Clr(); inverseDec.Clr(); caseFolding.Clr();

                 specialCasingLower.Clr(); specialCasingUpper.Clr(); specialCasingTitle.Clr();

                 scripts.Clr(); }

         void Save(TSOut& SOut) const {

                 h.Save(SOut); charNames.Save(SOut); decompositions.Save(SOut);

                 inverseDec.Save(SOut); caseFolding.Save(SOut); scripts.Save(SOut);

                 specialCasingLower.Save(SOut); specialCasingUpper.Save(SOut); specialCasingTitle.Save(SOut);

                 SOut.SaveCs(); }

         void Load(TSIn& SIn) {

                 h.Load(SIn); charNames.~TStrPool(); new (&charNames) TStrPool(SIn);

                 decompositions.Load(SIn);

                 inverseDec.Load(SIn); caseFolding.Load(SIn); scripts.Load(SIn);

                 specialCasingLower.Load(SIn); specialCasingUpper.Load(SIn); specialCasingTitle.Load(SIn);

                 SIn.LoadCs(); InitAfterLoad(); }

         void LoadBin(const TStr& fnBin) {

                 PSIn SIn = TFIn::New(fnBin); Load(*SIn); }

         void Test(const TStr& basePath);


         // File names used by LoadTxt() and its subroutines.

         static TStr GetCaseFoldingFn() { return "CaseFolding.txt"; }

         static TStr GetSpecialCasingFn() { return "SpecialCasing.txt"; }

         static TStr GetUnicodeDataFn() { return "UnicodeData.txt"; }

         static TStr GetCompositionExclusionsFn() { return "CompositionExclusions.txt"; }

         static TStr GetScriptsFn() { return "Scripts.txt"; }

         static TStr GetDerivedCorePropsFn() { return "DerivedCoreProperties.txt"; }

         static TStr GetLineBreakFn() { return "LineBreak.txt"; }

         static TStr GetPropListFn() { return "PropList.txt"; }

         static TStr GetAuxiliaryDir() { return "auxiliary"; }

         static TStr GetWordBreakTestFn() { return "WordBreakTest.txt"; }

         static TStr GetWordBreakPropertyFn() { return "WordBreakProperty.txt"; }

         static TStr GetSentenceBreakTestFn() { return "SentenceBreakTest.txt"; }

         static TStr GetSentenceBreakPropertyFn() { return "SentenceBreakProperty.txt"; }

         static TStr GetNormalizationTestFn() { return "NormalizationTest.txt"; }

         static TStr GetBinFn() { return "UniChDb.bin"; } // used only by Test()


         //-------------------------------------------------------------------------

         // Script names

         //-------------------------------------------------------------------------


         // These constants are used when initializing from the text files.

         static TStr GetScriptNameUnknown() { return "Unknown"; }

         static TStr GetScriptNameKatakana() { return "Katakana"; }

         static TStr GetScriptNameHiragana() { return "Hiragana"; }

         //

         const TStr& GetScriptName(const int scriptId) const { return scripts.GetKey(scriptId); }

         int GetScriptByName(const TStr& scriptName) const { return scripts.GetKeyId(scriptName); }

         int GetScript(const TUniChInfo& ci) const { int s = ci.script; if (s < 0) s = scriptUnknown; return s; }

         int GetScript(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return scriptUnknown; else return GetScript(h[i]); }


         //-------------------------------------------------------------------------

         // Character namesnames

         //-------------------------------------------------------------------------


         // GetCharName returns 0 if the name is unknown; GetCharNameS returns a string of the form "U+1234".

         const char *GetCharName(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; int ofs = h[i].nameOffset; return ofs < 0 ? 0 : charNames.GetCStr(ofs); }

         TStr GetCharNameS(const int cp) const {

                 // ToDo: Add special processing for precomposed Hangul syllables (UAX #15, sec. 16).

                 const char *p = GetCharName(cp); if (p) return p;

                 char buf[20]; sprintf(buf, "U+%04x", cp); return TStr(buf); }

         template<class TSrcVec> void PrintCharNames(FILE *f, const TSrcVec& src, size_t srcIdx, const size_t srcCount, const TStr& prefix) const {

                 if (! f) f = stdout;

                 for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {

                         fprintf(f, "%s", prefix.CStr());

                         int cp = src[TVecIdx(srcIdx)]; fprintf(f, (cp >= 0x10000 ? "U+%05x" : "U+%04x "), cp);

                         fprintf(f, " %s\n", GetCharNameS(cp).CStr()); }}

         template<class TSrcVec> void PrintCharNames(FILE *f, const TSrcVec& src, const TStr& prefix) const { PrintCharNames(f, src, 0, src.Len(), prefix); }


         //-------------------------------------------------------------------------

         // Character information

         //-------------------------------------------------------------------------

         // These methods provide access to a subset of the functionality

         // available in TUniChInfo.


         bool IsGetChInfo(const int cp, TUniChInfo& ChInfo) {

                 int i = h.GetKeyId(cp);

                 if (i < 0) return false; else { ChInfo=h[i]; return true; }}

         TUniChCategory GetCat(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return ucOther; else return h[i].cat; }

         TUniChSubCategory GetSubCat(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return ucOtherNotAssigned; else return h[i].subCat; }


         bool IsWbFlag(const int cp, const TUniChFlags flag) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].IsWbFlag(flag); }

         int GetWbFlags(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetWbFlags(); }

         bool IsSbFlag(const int cp, const TUniChFlags flag) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].IsSbFlag(flag); }

         int GetSbFlags(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetSbFlags(); }


 #define ___UniFwd1(name) bool name(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].name(); }

 #define ___UniFwd2(name1, name2) ___UniFwd1(name1) ___UniFwd1(name2)

 #define ___UniFwd3(name1, name2, name3) ___UniFwd2(name1, name2) ___UniFwd1(name3)

 #define ___UniFwd4(name1, name2, name3, name4) ___UniFwd3(name1, name2, name3) ___UniFwd1(name4)

 #define ___UniFwd5(name1, name2, name3, name4, name5) ___UniFwd4(name1, name2, name3, name4) ___UniFwd1(name5)


 #define DECLARE_FORWARDED_PROPERTY_METHODS \

         ___UniFwd5(IsAsciiHexDigit, IsBidiControl, IsDash, IsDeprecated, IsDiacritic) \

         ___UniFwd5(IsExtender, IsGraphemeLink, IsHexDigit, IsHyphen, IsIdeographic)  \

         ___UniFwd5(IsJoinControl, IsLogicalOrderException, IsNoncharacter, IsQuotationMark, IsSoftDotted)  \

         ___UniFwd4(IsSTerminal, IsTerminalPunctuation, IsVariationSelector, IsWhiteSpace)  \

         ___UniFwd5(IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsDefaultIgnorable)  \

         ___UniFwd4(IsGraphemeBase, IsGraphemeExtend, IsIdStart, IsIdContinue)  \

         ___UniFwd2(IsXidStart, IsXidContinue)  \

         ___UniFwd3(IsCompositionExclusion, IsCompatibilityDecomposition, IsSbSep)  \

         ___UniFwd1(IsGbExtend)  \

         ___UniFwd2(IsCased, IsCurrency)


         DECLARE_FORWARDED_PROPERTY_METHODS


 #undef ___UniFwd1


         bool IsPrivateUse(const int cp) const {

                 int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsPrivateUse();

                 return (0xe000 <= cp && cp <= 0xf8ff) ||  // plane 0 private-use area

                         // Planes 15 and 16 are entirely for private use.

                         (0xf0000 <= cp && cp <= 0xffffd) || (0x100000 <= cp && cp <= 0x10fffd); }

         // Note: d800..dbff are high surrogates, dc00..dfff are low surrogates.

         // For db80..dbff it is clear that the surrogate pair containing this high surrogate

         // will refer to a private-use codepoint, but IsPrivateUse nevertheless returns false

         // for db80..dbff.  This is consistent with the category codes assigned in UnicodeData.txt.

         bool IsSurrogate(const int cp) const {

                 int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsSurrogate();

                 return 0xd800 <= cp && cp <= 0xdcff; }


         // Note: in particular, all Hangul characters (HangulLBase..HangulLBase + HangulLCount - 1

         // and HangulSBase..HangulSBase + HangulSCount - 1) should be treated as starters

         // for composition to work correctly.

         int GetCombiningClass(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return TUniChInfo::ccStarter; else return h[i].combClass; }


         //-------------------------------------------------------------------------

         // Hangul constants

         //-------------------------------------------------------------------------


         enum {

         HangulSBase = 0xAC00, HangulLBase = 0x1100, HangulVBase = 0x1161, HangulTBase = 0x11A7,

         HangulLCount = 19, HangulVCount = 21, HangulTCount = 28,

         HangulNCount = HangulVCount * HangulTCount,   // 588

         HangulSCount = HangulLCount * HangulNCount   // 11172

         };


         //-------------------------------------------------------------------------

         // Word boundaries (UAX #29)

         //-------------------------------------------------------------------------


 protected:

         // UAX #29, rule WB3: ignore Format and Extend characters.

         // [Note: rule SB5 for sentence boundaries is identical, and thus these methods will also be used for sentence-boundary detection.]

         static bool IsWbIgnored(const TUniChInfo& ci) { return ci.IsGbExtend() || ci.IsWbFormat(); }

         bool IsWbIgnored(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return IsWbIgnored(h[i]); }

         // Sets 'position' to the smallest index from 'position..srcEnd-1' that contains a non-ignored character.

         template<typename TSrcVec> void WbFindCurOrNextNonIgnored(const TSrcVec& src, size_t& position, const size_t srcEnd) const {

                 while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }

         // Sets 'position' to the smallest index from 'position+1..srcEnd-1' that contains a non-ignored character.

         template<typename TSrcVec> void WbFindNextNonIgnored(const TSrcVec& src, size_t& position, const size_t srcEnd) const {

                 if (position >= srcEnd) return;

                 position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }

         // Sets 'position' to the smallest index from 'position+1..srcEnd-1' that contains a non-ignored character.

         template<typename TSrcVec> void WbFindNextNonIgnoredS(const TSrcVec& src, size_t& position, const size_t srcEnd) const {

                 if (position >= srcEnd) return;

                 if (IsSbSep(src[TVecIdx(position)])) { position++; return; }

                 position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }

         // Sets 'position' to the largest index from 'srcStart..position-1' that contains a non-ignored character.

         template<typename TSrcVec> bool WbFindPrevNonIgnored(const TSrcVec& src, const size_t srcStart, size_t& position) const {

                 if (position <= srcStart) return false;

                 while (position > srcStart) {

                         position--; if (! IsWbIgnored(src[TVecIdx(position)])) return true; }

                 return false; }

         // Test driver for WbFind*NonIgnored.

         void TestWbFindNonIgnored(const TIntV& src) const;

         void TestWbFindNonIgnored() const;

 public:

         // Finds the next word boundary strictly after 'position'.

         // Note that there is a valid word boundary at 'srcIdx + srcCount'.

         // If there is no such word boundary, it returns 'false' and sets 'position' to 'srcIdx + srcCount'.

         template<typename TSrcVec>

         bool FindNextWordBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const;

         // Creates, in 'dest', a vector of 'srcCount + 1' elements, where 'dest[i]' tells if there is a word

         // boundary between 'src[srcIdx + i - 1]' and 'src[srcIdx + i]'.  Note that 'dest[0]' and 'dest[srcCount]' are

         // always set to 'true'.

         template<typename TSrcVec>

         void FindWordBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const;

 protected:

         void TestFindNextWordOrSentenceBoundary(const TStr& basePath, bool sentence);


         //-------------------------------------------------------------------------

         // Sentence boundaries (UAX #29)

         //-------------------------------------------------------------------------


 protected:

         TUniTrie<TInt> sbExTrie;


         // Checks whether a sentence that ended at src[position - 1]

         // would end in one of the suffixes from sbExTrie.

         template<typename TSrcVec>

         bool CanSentenceEndHere(const TSrcVec& src, const size_t srcIdx, const size_t position) const;


 public:

         // Finds the next sentence boundary strictly after 'position'.

         // Note that there is a valid sentence boundary at 'srcIdx + srcCount'.

         // If there is no such sentence boundary, it returns 'false' and sets 'position' to 'srcIdx + srcCount'.

         template<typename TSrcVec>

         bool FindNextSentenceBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const;

         // Creates, in 'dest', a vector of 'srcCount + 1' elements, where 'dest[i]' tells if there is a sentence

         // boundary between 'src[srcIdx + i - 1]' and 'src[srcIdx + i]'.  Note that 'dest[0]' and 'dest[srcCount]' are

         // always set to 'true'.

         template<typename TSrcVec>

         void FindSentenceBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const;


         // These methods allow the user to define a set of sentence boundary exceptions.

         // This is a set of strings, stored in 'sbExTrie'.  If the Unicode rules require

         // a sentence boundary in a position that would cause the sentence to end with

         // 'x (STerm | ATerm) Close* Sp* Sep?', where 'x' is a word from 'sbExTrie',

         // we will *not* place a sentence boundary there.

         //

         // NOTE: sbExTrie is not saved or loaded by the Save() and Load() methods.

         // By default, it is empty.  Use SbEx_Clr() to clear it, and SbEx_SetStdEnglish() to obtain

         // a standard set of English-language exceptions.

         void SbEx_Clr() { sbExTrie.Clr(); }

         template<class TSrcVec> void SbEx_Add(const TSrcVec& v) { sbExTrie.Add(v); }

         // template<> void SbEx_Add(const TStr& s) {

         void SbEx_Add(const TStr& s) {

           TIntV v; int n = s.Len(); v.Gen(n); for (int i = 0; i < n; i++) v[i] = int(uchar(s[i])); SbEx_Add(v); }

         void SbEx_AddUtf8(const TStr& s) { TUniCodec codec; TIntV v; codec.DecodeUtf8(s, v); SbEx_Add(v); }

         int SbEx_AddMulti(const TStr& words, const bool wordsAreUtf8 = true) { TStrV vec; words.SplitOnAllCh('|', vec);

                 for (int i = 0; i < vec.Len(); i++) if (wordsAreUtf8) SbEx_AddUtf8(vec[i]); else SbEx_Add(vec[i]);

                 return vec.Len(); }

         void SbEx_Set(const TUniTrie<TInt>& newTrie) { sbExTrie = newTrie; }

         int SbEx_SetStdEnglish() {

                 static const TStr data = "Ms|Mrs|Mr|Rev|Dr|Prof|Gov|Sen|Rep|Gen|Brig|Col|Capt|Lieut|Lt|Sgt|Pvt|Cmdr|Adm|Corp|St|Mt|Ft|e.g|e. g.|i.e|i. e|ib|ibid|s.v|s. v|s.vv|s. vv";

                 SbEx_Clr(); return SbEx_AddMulti(data, false); }


         //-------------------------------------------------------------------------

         // Normalization, decomposition, etc. (UAX #15)

         //-------------------------------------------------------------------------


 protected:

         // Adds, to 'dest', the decomposition of 'codePoint' (calling itself recursively if necessary).

         // If 'compatibility == false', only canonical decompositions are used.

         template<typename TDestCh>

         void AddDecomposition(const int codePoint, TVec<TDestCh>& dest, const bool compatibility) const;

 public:

         // This appends, to 'dest', the decomposed form of the source string.

         // - for normalization form D (NFD), i.e. canonical decomposition: use compatibility == false;

         // - for normalization form KD (NFKD), i.e. compatibility decomposition: use compatibility == true.

         template<typename TSrcVec, typename TDestCh>

         void Decompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,

                         TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const;

         template<typename TSrcVec, typename TDestCh>

         void Decompose(const TSrcVec& src, TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const {

                 Decompose(src, 0, src.Len(), dest, compatibility, clrDest); }

         // This performs canonical composition on the source string, and appends

         // the result to the destination string.  The source string should be the

         // result of a (canonical or compatibility) decomposition; if this is the

         // case, the composition will lead to a normalization form C (NFC) or

         // normalization form KC (NFKC), depending on whether canonical or compatibility

         // decomposition was used.

         template<typename TSrcVec, typename TDestCh>

         void Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,

                         TVec<TDestCh>& dest, bool clrDest = true) const;

         template<typename TSrcVec, typename TDestCh>

         void Compose(const TSrcVec& src, TVec<TDestCh>& dest, bool clrDest = true) const {

                 Compose(src, 0, src.Len(), dest, clrDest); }

         // Calls Decompose, followed by Compose; thus the result is the NFC (if

         // compatibility == false) or NFKC (if compatibility == true) of the source string.

         // A temporary TIntV is used to contain the intermediate NF(K)D form of the

         // source string.

         template<typename TSrcVec, typename TDestCh>

         void DecomposeAndCompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,

                         TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const;

         template<typename TSrcVec, typename TDestCh>

         void DecomposeAndCompose(const TSrcVec& src, TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const {

                 DecomposeAndCompose(src, 0, src.Len(), dest, compatibility, clrDest); }

         // Copies the starter characters from 'src' to 'dest'; the other

         // characters are skipped.  'src' should already have been decomposed.

         // Returns the number of characters extracted.

         template<typename TSrcVec, typename TDestCh>

         size_t ExtractStarters(const TSrcVec& src, size_t srcIdx, const size_t srcCount,

                         TVec<TDestCh>& dest, bool clrDest = true) const;

         template<typename TSrcVec, typename TDestCh>

         size_t ExtractStarters(const TSrcVec& src, TVec<TDestCh>& dest, bool clrDest = true) const {

                 return ExtractStarters(src, 0, src.Len(), dest, clrDest); }

         // Extracts the starters into a temporary vector and then copies it into 'src'.

         template<typename TSrcVec>

         size_t ExtractStarters(TSrcVec& src) const {

                 TIntV temp; size_t retVal = ExtractStarters(src, temp);

                 src.Clr(); for (int i = 0; i < temp.Len(); i++) src.Add(temp[i]);

                 return retVal; }


 protected:

         void TestComposition(const TStr& basePath);


         //-------------------------------------------------------------------------

         // Initialization from the text files

         //-------------------------------------------------------------------------


 protected:

         void InitWordAndSentenceBoundaryFlags(const TStr& basePath);

         void InitScripts(const TStr& basePath);

         void InitLineBreaks(const TStr& basePath);

         void InitDerivedCoreProperties(const TStr& basePath);

         void InitPropList(const TStr& basePath);

         void InitSpecialCasing(const TStr& basePath);

         void LoadTxt_ProcessDecomposition(TUniChInfo& ci, TStr s);

 public:

         void LoadTxt(const TStr& basePath);

         void SaveBin(const TStr& fnBinUcd);


         //-------------------------------------------------------------------------

         // Case conversions

         //-------------------------------------------------------------------------


 public:

         typedef enum TCaseConversion_ { ccLower = 0, ccUpper = 1, ccTitle = 2, ccMax = 3 } TCaseConversion;

         // Appends the case-converted form of 'src' to 'dest'.

         // 'how' defines what kind of case conversion is required.

         // 'turkic' should be set to true iff the text is in Turkic ('tr') or Azeri ('ar').

         // 'lithuanian' should be set to true iff the text is in Lithuanian ('lt').

         template<typename TSrcVec, typename TDestCh> void GetCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const;

         template<typename TSrcVec, typename TDestCh> void GetLowerCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower, turkic, lithuanian); }

         template<typename TSrcVec, typename TDestCh> void GetUpperCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper, turkic, lithuanian); }

         template<typename TSrcVec, typename TDestCh> void GetTitleCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle, turkic, lithuanian); }

         template<typename TSrcVec, typename TDestCh> void GetLowerCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetLowerCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }

         template<typename TSrcVec, typename TDestCh> void GetUpperCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetUpperCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }

         template<typename TSrcVec, typename TDestCh> void GetTitleCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetTitleCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }


         // GetSimpleCaseConverted uses only the simple case mappings (from UnicodeData.txt).

         // This is simpler and faster.  Since each character now maps into exactly one

         // character, case conversion can also be done in place (see ToSimpleCaseConverted, etc.).

         template<typename TSrcVec, typename TDestCh> void GetSimpleCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how) const;

         template<typename TSrcVec, typename TDestCh> void GetSimpleLowerCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower); }

         template<typename TSrcVec, typename TDestCh> void GetSimpleUpperCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper); }

         template<typename TSrcVec, typename TDestCh> void GetSimpleTitleCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle); }

         template<typename TSrcVec, typename TDestCh> void GetSimpleLowerCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleLowerCase(src, 0, src.Len(), dest, clrDest); }

         template<typename TSrcVec, typename TDestCh> void GetSimpleUpperCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleUpperCase(src, 0, src.Len(), dest, clrDest); }

         template<typename TSrcVec, typename TDestCh> void GetSimpleTitleCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleTitleCase(src, 0, src.Len(), dest, clrDest); }


         template<typename TSrcVec> void ToSimpleCaseConverted(TSrcVec& src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const;

         template<typename TSrcVec> void ToSimpleUpperCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccUpper); }

         template<typename TSrcVec> void ToSimpleLowerCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccLower); }

         template<typename TSrcVec> void ToSimpleTitleCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccTitle); }

         template<typename TSrcVec> void ToSimpleUpperCase(TSrcVec& src) const { ToSimpleUpperCase(src, 0, src.Len()); }

         template<typename TSrcVec> void ToSimpleLowerCase(TSrcVec& src) const { ToSimpleLowerCase(src, 0, src.Len()); }

         template<typename TSrcVec> void ToSimpleTitleCase(TSrcVec& src) const { ToSimpleTitleCase(src, 0, src.Len()); }


 public:

         friend class TUniCaseFolding;


         // Case folding is an alternative to the above functions.  It is intended primarily

         // to produce strings that are suitable for comparisons.  For example,

         // ToLowerCase(sigma) = sigma, ToLowerCase(final-sigma) = final-sigma;

         // but ToCaseFolder(sigma) = sigma, ToCaseFolded(final-sigma) = sigma.

         // - 'turkic' enables special processing for Turkic languages (I-dot and i-dotless).

         // - 'full' enables full case mappings -- i.e. sometimes a character may be mapped

         //   into a string of two or more characters.

         // - Note: For best results, perform NFD(CaseFold(NFD(x)) or NFKD(CaseFold(NFKD(x)) on

         //   each string before comparing them (see sec. 3.13 of the standard).

         template<typename TSrcVec, typename TDestCh>

         void GetCaseFolded(const TSrcVec& src, size_t srcIdx, const size_t srcCount,

                 TVec<TDestCh>& dest, const bool clrDest, const bool full, const bool turkic = false) const { caseFolding.Fold(src, srcIdx, srcCount, dest, clrDest, full, turkic); }

         template<typename TSrcVec, typename TDestCh>

         void GetCaseFolded(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool full = true, const bool turkic = false) const {

                 GetCaseFolded(src, 0, src.Len(), dest, clrDest, full, turkic); }

         // ToCaseFolded folds the string in place.  However, this means that only the simple

         // case foldings can be used (the full ones could increase the length of the string).

         template<typename TSrcVec> void ToCaseFolded(TSrcVec& src, size_t srcIdx, const size_t srcCount, const bool turkic = false) const { caseFolding.FoldInPlace(src, srcIdx, srcCount, turkic); }

         template<typename TSrcVec> void ToCaseFolded(TSrcVec& src, const bool turkic = false) const { ToCaseFolded(src, 0, src.Len(), turkic); }


 protected:

         void TestCaseConversion(const TStr& source, const TStr& trueLc, const TStr& trueTc, const TStr& trueUc, bool turkic, bool lithuanian);

         void TestCaseConversions();


         //-------------------------------------------------------------------------

         // Text file reader for the Unicode character database

         //-------------------------------------------------------------------------


 protected:


         class TUcdFileReader

         {

         protected:

                 TChA buf;

         public:

                 TChA comment; // contains '#' and everything after it

         protected:

                 FILE *f;

                 int putBackCh;

                 int GetCh() {

                         if (putBackCh >= 0) { int c = putBackCh; putBackCh = EOF; return c; }

                         return fgetc(f); }

                 void PutBack(int c) { Assert(putBackCh == EOF); putBackCh = c; }

                 // Returns 'false' iff the EOF was encountered before anything was read.

                 bool ReadNextLine() {

                         buf.Clr(); comment.Clr();

                         bool inComment = false, first = true;

                         while (true) {

                                 int c = GetCh();

                                 if (c == EOF) return ! first;

                                 else if (c == 13) {

                                         c = GetCh(); if (c != 10) PutBack(c);

                                         return true; }

                                 else if (c == 10) return true;

                                 else if (c == '#') inComment = true;

                                 if (! inComment) buf += char(c);

                                 else comment += char(c); }

                                 /*first = false;*/}

         private:

                 TUcdFileReader& operator = (const TUcdFileReader& r) { Fail; return *((TUcdFileReader *) 0); }

                 TUcdFileReader(const TUcdFileReader& r) { Fail; }

         public:

                 TUcdFileReader() : f(0) { }

                 TUcdFileReader(const TStr& fileName) : f(0), putBackCh(EOF) { Open(fileName); }

                 void Open(const TStr& fileName) { Close(); f = fopen(fileName.CStr(), "rt"); IAssertR(f, fileName); putBackCh = EOF; }

                 void Close() { putBackCh = EOF; if (f) { fclose(f); f = 0; }}

                 ~TUcdFileReader() { Close(); }

                 bool GetNextLine(TStrV& dest) {

                         dest.Clr();

                         while (true) {

                                 if (! ReadNextLine()) return false;

                                 TStr line = buf; line.ToTrunc();

                                 if (line.Len() <= 0) continue;

                                 line.SplitOnAllCh(';', dest, false);

                                 for (int i = 0; i < dest.Len(); i++) dest[i].ToTrunc();

                                 return true; }}

                 static int ParseCodePoint(const TStr& s) {

                         int c; bool ok = s.IsHexInt(true, 0, 0x10ffff, c); IAssertR(ok, s); return c; }

                 static void ParseCodePointList(const TStr& s, TIntV& dest, bool ClrDestP = true) { // space-separated list

                         if (ClrDestP) dest.Clr();

                         TStrV parts; s.SplitOnWs(parts);

                         for (int i = 0; i < parts.Len(); i++) {

                                 int c; bool ok = parts[i].IsHexInt(true, 0, 0x10ffff, c); IAssertR(ok, s);

                                 dest.Add(c); } }

                 static void ParseCodePointRange(const TStr& s, int& from, int &to) { // xxxx or xxxx..yyyy

                         int i = s.SearchStr(".."); if (i < 0) { from = ParseCodePoint(s); to = from; return; }

                         from = ParseCodePoint(s.GetSubStr(0, i - 1));

                         to = ParseCodePoint(s.GetSubStr(i + 2, s.Len() - 1)); }

         };


         //-------------------------------------------------------------------------

         // Helper class for processing the text files

         //-------------------------------------------------------------------------

         // Files such as DerivedCoreProps.txt often refer to ranges of codepoints,

         // and not all codepoints from the range have also been listed in

         // UnicodeData.txt.  Thus, new TUniChInfo instances will be created

         // when processing DerivedCoreProps.txt and similar files.

         // To assign the correct (sub)categories to these new codepoints,

         // the following class will extract the subcategory info from the

         // comments in DerivedCoreProps.txt and similar files.


         class TSubcatHelper

         {

         public:

                 bool hasCat; TUniChSubCategory subCat;

                 TStrH invalidCatCodes;

                 TUniChDb &owner;


                 TSubcatHelper(TUniChDb &owner_) : owner(owner_) { }


                 void ProcessComment(TUniChDb::TUcdFileReader &reader)

                 {

                         hasCat = false; subCat = ucOtherNotAssigned;

                         if (reader.comment.Len() > 3)

                         {

                                 IAssert(reader.comment[0] == '#');

                                 IAssert(reader.comment[1] == ' ');

                                 char chCat = reader.comment[2], chSubCat = reader.comment[3];

                                 if (reader.comment.Len() > 4) IAssert(isspace(uchar(reader.comment[4])));

                                 if (TUniChInfo::IsValidSubCat(chCat, chSubCat)) {

                                         hasCat = true; subCat = (TUniChSubCategory) ((int(uchar(chCat)) << 8) | (int(uchar(chSubCat)))); }

                                 else invalidCatCodes.AddKey(TStr(chCat) + TStr(chSubCat));

                         }

                 }


                 void SetCat(const int cp) {

                         int i = owner.h.GetKeyId(cp); IAssert(i >= 0);

                         IAssert(owner.h[i].subCat == ucOtherNotAssigned);

                         IAssert(hasCat);

                         owner.h[i].SetCatAndSubCat(subCat); }

                 void TestCat(const int cp) {

                         if (! hasCat) return;

                         int i = owner.h.GetKeyId(cp); IAssert(i >= 0);

                         IAssert(owner.h[i].subCat == subCat); }


                 ~TSubcatHelper()

                 {

                         if (invalidCatCodes.IsKey("L&")) invalidCatCodes.DelKey("L&");

                         // Output any unexpected ones (there shouldn't be any).

                         if (! invalidCatCodes.Empty()) {

                                 printf("Invalid cat code(s) in the comments: ");

                                 for (int i = invalidCatCodes.FFirstKeyId(); invalidCatCodes.FNextKeyId(i); )

                                         printf(" \"%s\"", invalidCatCodes.GetKey(i).CStr());

                                 printf("\n"); }

                 }

         };

 };


 //-----------------------------------------------------------------------------

 // TUnicode -- a sadly emasculated wrapper around TUniCodec and TUniChDb

 //-----------------------------------------------------------------------------


 class TUnicode

 {

 public:

         TUniCodec codec;

         TUniChDb ucd;


         TUnicode() { Init(); }

         explicit TUnicode(const TStr& fnBinUcd) { ucd.LoadBin(fnBinUcd); Init(); }

         void Init() { InitCodecs(); }


         //-----------------------------------------------------------------------

         // UTF-8

         //-----------------------------------------------------------------------


         // Returns the number of characters that have been successfully decoded.

         // This does not include any replacement characters that may have been inserted into 'dest'.

         int DecodeUtf8(const TIntV& src, TIntV& dest) const { return (int) codec.DecodeUtf8(src, dest); }

         int DecodeUtf8(const TStr& src, TIntV& dest) const { return (int) codec.DecodeUtf8(src, dest); }


         // Returns the number of characters that have been successfully encoded.

         // This does not include any replacement characters that may have been inserted into 'dest'.

         int EncodeUtf8(const TIntV& src, TIntV& dest) const { return (int) codec.EncodeUtf8(src, dest); }


         // The following wrapper around the UTF-8 encoder returns a TStr containing

         // the UTF-8-encoded version of the input string.

         TStr EncodeUtf8Str(const TIntV& src) const { return codec.EncodeUtf8Str(src); }


         // encoding one character to UTF8

         static void EncodeUtf8(const uint& Ch, TChA& Dest);

         static TStr EncodeUtf8(const uint& Ch);


         //-----------------------------------------------------------------------

         // UTF-16 Decoder

         //-----------------------------------------------------------------------


         // Returns the number of characters that have been successfully decoded.

         // This does not include any replacement characters that may have been inserted into 'dest'.

         // Each element of 'src' is assumed to contain one byte of data.

         // srcCount must be even (though srcIdx doesn't need to be).

         int DecodeUtf16FromBytes(const TIntV& src, TIntV& dest,

                 const TUtf16BomHandling bomHandling = bomAllowed,

                 const TUniByteOrder defaultByteOrder = boMachineEndian) const {

                         return (int) codec.DecodeUtf16FromBytes(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }


         // Here, each element of 'src' is treated as a 16-bit word.  The byte-order settings

         // are used to determine if the two bytes of each word should be swapped before further

         // processing.  For example, if a BOM is present, it must have the value 0xfeff; if it

         // actually has the value 0xfffe, this means that the two bytes of each word must be swapped.

         // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the

         // beginning of the source data is used to determine the "original" byte order of the data;

         // if this doesn't match the byte order of the local machine, the two bytes of each word will

         // be swapped during the decoding process.

         int DecodeUtf16FromWords(const TIntV& src, TIntV& dest,

                 const TUtf16BomHandling bomHandling = bomAllowed,

                 const TUniByteOrder defaultByteOrder = boMachineEndian) const {

                         return (int) codec.DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }


         //-----------------------------------------------------------------------

         // UTF-16 Encoder

         //-----------------------------------------------------------------------


         // Returns the number of characters that have been successfully encoded.

         // This does not include any replacement characters that may have been inserted into 'dest'.

         int EncodeUtf16ToWords(const TIntV& src, TIntV& dest, const bool insertBom,

                 const TUniByteOrder destByteOrder = boMachineEndian) const {

                         return (int) codec.EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }


         int EncodeUtf16ToBytes(const TIntV& src, TIntV& dest, const bool insertBom,

                 const TUniByteOrder destByteOrder = boMachineEndian) const {

                         return (int) codec.EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }


         //-----------------------------------------------------------------------

         // 8-bit codecs

         //-----------------------------------------------------------------------


         T8BitCodec<TEncoding_ISO8859_1> iso8859_1;

         T8BitCodec<TEncoding_ISO8859_2> iso8859_2;

         T8BitCodec<TEncoding_ISO8859_3> iso8859_3;

         T8BitCodec<TEncoding_ISO8859_4> iso8859_4;

         T8BitCodec<TEncoding_YuAscii> yuAscii;

         T8BitCodec<TEncoding_CP1250> cp1250;

         T8BitCodec<TEncoding_CP852> cp852;

         T8BitCodec<TEncoding_CP437> cp437;


         //-----------------------------------------------------------------------

         // Codec registry

         //-----------------------------------------------------------------------

         // If you know you'll need ISO-8859-2, just use

         //   TUnicode unicode;

         //   unicode.iso8859_2.Encode(...);

         // If you don't know what you'll need, use:

         //   TUnicode unicode;

         //   PCodecBase myCodec = unicode.GetCodec(myCodecName);

         //   myCodec->Encode(...);

         // Note that the first approach is slightly more efficient because there

         // aren't any virtual method calls involved.


 protected:

         THash<TStr, PCodecBase> codecs;

         static inline TStr NormalizeCodecName(const TStr& name) {

                 TStr s = name.GetLc(); s.ChangeStrAll("_", ""); s.ChangeStrAll("-", ""); return s; }

 public:

         void RegisterCodec(const TStr& nameList, const PCodecBase& codec) {

                 TStrV names; nameList.SplitOnWs(names);

                 for (int i = 0; i < names.Len(); i++)

                         codecs.AddDat(NormalizeCodecName(names[i]), codec); }

         void UnregisterCodec(const TStr& nameList) {

                 TStrV names; nameList.SplitOnWs(names);

                 for (int i = 0; i < names.Len(); i++)

                         codecs.DelKey(NormalizeCodecName(names[i])); }

         void ClrCodecs() { codecs.Clr(); }

         void InitCodecs();

         PCodecBase GetCodec(const TStr& name) const {

                 TStr s = NormalizeCodecName(name);

                 PCodecBase p; if (! codecs.IsKeyGetDat(s, p)) p.Clr();

                 return p; }

         void GetAllCodecs(TCodecBaseV& dest) const {

                 dest.Clr();

                 for (int i = codecs.FFirstKeyId(); codecs.FNextKeyId(i); ) {

                         PCodecBase codec = codecs[i]; bool found = false;

                         for (int j = 0; j < dest.Len(); j++) if (dest[j]() == codec()) { found = true; break; }

                         if (! found) dest.Add(codec); }}


         //-------------------------------------------------------------------------

         // Word boundaries (UAX #29)

         //-------------------------------------------------------------------------


         // Finds the next word boundary strictly after 'position'.

         // Note that there are valid word boundaries at 0 and at 'src.Len()'.

         // If there is no such word boundary, it returns 'false' and sets 'position' to 'src.Len()'.

         bool FindNextWordBoundary(const TIntV& src, int &position) const {

                 if (position < 0) { position = 0; return true; }

                 size_t position_; bool retVal = ucd.FindNextWordBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; }

         // Creates, in 'dest', a vector of 'src.Len() + 1' elements, where 'dest[i]' tells if there is a word

         // boundary between 'src[i - 1]' and 'src[i]'.  Note that 'dest[0]' and 'dest[src.Len()]' are

         // always set to 'true'.

         void FindWordBoundaries(const TIntV& src, TBoolV& dest) const { ucd.FindWordBoundaries(src, 0, src.Len(), dest); }


         //-------------------------------------------------------------------------

         // Sentence boundaries (UAX #29)

         //-------------------------------------------------------------------------


         // Finds the next sentence boundary strictly after 'position'.

         // Note that there are valid sentence boundaries at 0 and at 'src.Len()'.

         // If there is no such sentence boundary, it returns 'false' and sets 'position' to 'src.Len()'.

         bool FindNextSentenceBoundary(const TIntV& src, int &position) const {

                 if (position < 0) { position = 0; return true; }

                 size_t position_; bool retVal = ucd.FindNextSentenceBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; }

         // Creates, in 'dest', a vector of 'src.Len() + 1' elements, where 'dest[i]' tells if there is a sentence

         // boundary between 'src[i - 1]' and 'src[i]'.  Note that 'dest[0]' and 'dest[src.Len()]' are

         // always set to 'true'.

         void FindSentenceBoundaries(const TIntV& src, TBoolV& dest) const { ucd.FindSentenceBoundaries(src, 0, src.Len(), dest); }


         void ClrSentenceBoundaryExceptions() { ucd.SbEx_Clr(); }

         void UseEnglishSentenceBoundaryExceptions() { ucd.SbEx_SetStdEnglish(); }


         //-------------------------------------------------------------------------

         // Normalization, decomposition, etc. (UAX #15)

         //-------------------------------------------------------------------------


         // This sets 'dest' to the decomposed form of the source string.

         // - for normalization form D (NFD), i.e. canonical decomposition: use compatibility == false;

         // - for normalization form KD (NFKD), i.e. compatibility decomposition: use compatibility == true.

         void Decompose(const TIntV& src, TIntV& dest, bool compatibility) const { ucd.Decompose(src, dest, compatibility, true); }

         // This performs canonical composition on the source string, and stores

         // the result in the destination vector.  The source string should be the

         // result of a (canonical or compatibility) decomposition; if this is the

         // case, the composition will lead to a normalization form C (NFC) or

         // normalization form KC (NFKC), depending on whether canonical or compatibility

         // decomposition was used.

         void Compose(const TIntV& src, TIntV& dest) const { return ucd.Compose(src, dest, true); }

         // Calls Decompose, followed by Compose; thus the result is the NFC (if

         // compatibility == false) or NFKC (if compatibility == true) of the source string.

         // A temporary TIntV is used to contain the intermediate NF(K)D form of the

         // source string.

         void DecomposeAndCompose(const TIntV& src, TIntV& dest, bool compatibility) const { return ucd.DecomposeAndCompose(src, dest, compatibility); }

         // Copies the starter characters from 'src' to 'dest'; the other

         // characters are skipped.  'src' should already have been decomposed.

         // Returns the number of characters extracted.  This function can be

         // used to remove diacritical marks from a string (after it has been decomposed!).

         int ExtractStarters(const TIntV& src, TIntV& dest) const { return (int) ucd.ExtractStarters(src, dest); }

         // Extracts the starters into a temporary vector and then copies it into 'src'.

         int ExtractStarters(TIntV& src) const { return (int) ucd.ExtractStarters(src); }


         //-------------------------------------------------------------------------

         // Case conversions

         //-------------------------------------------------------------------------

         // NOTE: if you will be dealing with Turkish, Azeri or Lithuanian text,

         // use the case-conversion methods in TUniChDb, which allow the caller

         // to request language-specific case mappings for these languages.


 public:

         typedef TUniChDb::TCaseConversion TCaseConversion;

         // Sets 'dest' to the case-converted form of 'src'.

         void GetLowerCase(const TIntV& src, TIntV& dest) const { ucd.GetLowerCase(src, dest, true, false, false); }

         void GetUpperCase(const TIntV& src, TIntV& dest) const { ucd.GetUpperCase(src, dest, true, false, false); }

         void GetTitleCase(const TIntV& src, TIntV& dest) const { ucd.GetTitleCase(src, dest, true, false, false); }


         // GetSimpleCaseConverted uses only the simple case mappings (from UnicodeData.txt).

         // This is simpler and faster.  Since each character now maps into exactly one

         // character, case conversion can also be done in place (see ToSimpleCaseConverted, etc.).

         void GetSimpleLowerCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleLowerCase(src, dest, true); }

         void GetSimpleUpperCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleUpperCase(src, dest, true); }

         void GetSimpleTitleCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleTitleCase(src, dest, true); }


         // These functions perform simple case-conversions in-place.

         void ToSimpleUpperCase(TIntV& src) const { ucd.ToSimpleUpperCase(src); }

         void ToSimpleLowerCase(TIntV& src) const { ucd.ToSimpleLowerCase(src); }

         void ToSimpleTitleCase(TIntV& src) const { ucd.ToSimpleTitleCase(src); }


         // Case folding is an alternative to the above functions.  It is intended primarily

         // to produce strings that are suitable for comparisons.  For example,

         // ToLowerCase(sigma) = sigma, ToLowerCase(final-sigma) = final-sigma;

         // but ToCaseFolded(sigma) = sigma, ToCaseFolded(final-sigma) = sigma.

         // - 'full' enables full case mappings -- i.e. sometimes a character may be mapped

         //   into a string of two or more characters.

         // - Note: For best results, perform NFD(CaseFold(NFD(x)) or NFKD(CaseFold(NFKD(x)) on

         //   each string before comparing them (see sec. 3.13 of the standard).

         void GetCaseFolded(const TIntV& src, TIntV& dest, const bool full = true) const { return ucd.GetCaseFolded(src, dest, true, full, false); }

         // ToCaseFolded folds the string in place.  However, this means that only the simple

         // case foldings can be used (the full ones could increase the length of the string).

         void ToCaseFolded(TIntV& src) const { return ucd.ToCaseFolded(src, false); }


         TStr GetUtf8CaseFolded(const TStr& s) const {

                 bool isAscii = true;

                 for (int i = 0, n = s.Len(); i < n; i++) if (uchar(s[i]) >= 128) { isAscii = false; break; }

                 if (isAscii) return s.GetLc();

                 TIntV src; DecodeUtf8(s, src);

                 TIntV dest; GetCaseFolded(src, dest);

                 return EncodeUtf8Str(dest); }


         //-------------------------------------------------------------------------

         // Character properties

         //-------------------------------------------------------------------------

         // These methods simply call the corresponding TUniChDb method

         // (which typically calls the corresponding method of TUniChInfo).

         // See the declaration for DECLARE_FORWARDED_PROPERTY_METHODS for a complete list.

         // They are all of the form        bool IsXxxx(const int cp) const

         // Some of the more notable ones include:

         // - IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsAsciiHexDigit

         //   IsDash, IsDeprecated, IsDiacritic, IsHexDigit, IsHyphen, IsIdeographic

         //   IsNoncharacter, IsQuotationMark, IsSoftDotted, IsTerminalPunctuation, IsWhiteSpace


 #define ___UniFwd1(name) bool name(const int cp) const { return ucd.name(cp); }

         DECLARE_FORWARDED_PROPERTY_METHODS

 #undef DECLARE_FORWARDED_PROPERTY_METHODS

 #undef __UniFwd1

         ___UniFwd2(IsPrivateUse, IsSurrogate)


         TUniChCategory GetCat(const int cp) const { return ucd.GetCat(cp); }

         TUniChSubCategory GetSubCat(const int cp) const { return ucd.GetSubCat(cp); }


         // GetCharName returns 0 if the name is unknown; GetCharNameS returns a string of the form "U+1234".

         const char *GetCharName(const int cp) const { return ucd.GetCharName(cp); }

         TStr GetCharNameS(const int cp) const { return ucd.GetCharNameS(cp); }


 };


 //-----------------------------------------------------------------------------

 // TUniCodec -- UTF-8 Decoder

 //-----------------------------------------------------------------------------


 // Returns the number of characters that have been successfully decoded.

 // This does not include any replacement characters that may have been inserted into 'dest'.

 template<typename TSrcVec, typename TDestCh>

 size_t TUniCodec::DecodeUtf8(

         const TSrcVec& src, size_t srcIdx, const size_t srcCount,

         TVec<TDestCh>& dest, const bool clrDest) const

 {

         size_t nDecoded = 0;

         if (clrDest) dest.Clr();

         const size_t origSrcIdx = srcIdx;

         const size_t srcEnd = srcIdx + srcCount;

         while (srcIdx < srcEnd)

         {

                 const size_t charSrcIdx = srcIdx;

                 uint c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;

                 if ((c & _1000_0000) == 0) {

                         // c is one of the characters 0..0x7f, encoded as a single byte.

                         dest.Add(TDestCh(c)); nDecoded++; continue; }

                 else if ((c & _1100_0000) == _1000_0000) {

                         // No character in a valid UTF-8-encoded string should begin with a byte of the form 10xxxxxx.

                         // We must have been thrown into the middle of a multi-byte character.

                         switch (errorHandling) {

                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 10xxxxxx.");

                         case uehAbort: return nDecoded;

                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;

                         case uehIgnore: continue;

                         default: Fail; } }

                 else

                 {

                         // c introduces a sequence of 2..6 bytes, depending on how many

                         // of the most significant bits of c are set.

                         uint nMoreBytes = 0, nBits = 0, minVal = 0;

                         if ((c & _1110_0000) == _1100_0000) nMoreBytes = 1, nBits = 5, minVal = 0x80;

                         else if ((c & _1111_0000) == _1110_0000) nMoreBytes = 2, nBits = 4, minVal = 0x800;

                         else if ((c & _1111_1000) == _1111_0000) nMoreBytes = 3, nBits = 3, minVal = 0x10000;

                         else if ((c & _1111_1100) == _1111_1000) nMoreBytes = 4, nBits = 2, minVal = 0x200000;

                         else if ((c & _1111_1110) == _1111_1100) nMoreBytes = 5, nBits = 1, minVal = 0x4000000;

                         else {

                                 // c is of the form 1111111x, which is invalid even in the early definitions of UTF-8

                                 // (which allowed the encoding of codepoints up to 2^31 - 1).  However, in principle this

                                 // could be used to encode 32-bit integers with the msb set: 1aaabbbbccccddddeeeeffffgggghhhh

                                 // could be encoded as 1111111a 10aabbbb 10ccccdd 10ddeeee 10ffffgg 10gghhhh.

                                 if (strict)  {

                                         switch (errorHandling) {

                                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 1111111x.");

                                         case uehAbort: return nDecoded;

                                         // In the case of uehReplace and uehIgnore, we'll read the next 5 bytes

                                         // and try to decode the character.  Then, since 'strict' is true and

                                         // the codepoint is clearly >= 2^31, we'll notice this as an error later

                                         // and (in the case of uehReplace) insert a replacement character then.

                                         // This is probably better than inserting a replacement character right

                                         // away and then trying to read the next byte as if a new character

                                         // was beginning there -- if the current byte is really followed by five

                                         // 10xxxxxx bytes, we'll just get six replacement characters in a row.

                                         case uehReplace: break; //dest.Add(TDestCh(replacementChar)); continue;

                                         case uehIgnore: break; // continue;

                                         default: Fail; } }

                                 nMoreBytes = 5; nBits = 2; minVal = 0x80000000u; }

                         // Decode this multi-byte sequence.

                         uint cOut = c & ((1 << nBits) - 1); // First extract the nBits least significant bits from c.

                         bool cancel = false;

                         for (uint i = 0; i < nMoreBytes && ! cancel; i++) {

                                 // See if there are enough bytes left in the source vector.

                                 if (! (srcIdx < srcEnd)) {

                                         switch (errorHandling) {

                                         case uehThrow: throw TUnicodeException(charSrcIdx, c, TInt::GetStr(nMoreBytes) + " more bytes expected, only " + TInt::GetStr(int(srcEnd - charSrcIdx - 1)) + " available.");

                                         case uehAbort: return nDecoded;

                                         case uehReplace: dest.Add(TDestCh(replacementChar)); cancel = true; continue;

                                         case uehIgnore: cancel = true; continue;

                                         default: Fail; } }

                                 // Read the next byte.

                                 c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;

                                 if ((c & _1100_0000) != _1000_0000) { // Each subsequent byte should be of the form 10xxxxxx.

                                         switch (errorHandling) {

                                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "Byte " + TInt::GetStr(i) + " of " + TInt::GetStr(nMoreBytes) + " extra bytes should begin with 10xxxxxx.");

                                         case uehAbort: return nDecoded;

                                         case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx--; cancel = true; continue;

                                         case uehIgnore: srcIdx--; cancel = true; continue;

                                         default: Fail; } }

                                 cOut <<= 6; cOut |= (c & _0011_1111); }

                         if (cancel) continue;

                         if (strict) {

                                 // err1: This codepoint has been represented by more bytes than it should have been.

                                 // For example, cOut in the range 0..127 should be represented by a single byte,

                                 // not by two or more bytes.

                                 // - For example, this may happen in the "modified UTF-8" sometimes used for Java

                                 // serialization, where the codepoint 0 is encoded as 11000000 10000000 to avoid

                                 // the appearance of null bytes in the encoded stream.

                                 bool err1 = (cOut < minVal);

                                 // err2: Early definitions of UTF-8 allowed any 31-bit integer to be encoded, using up to 6 bytes.

                                 // However, later this was restricted to the codepoints 0..0x10ffff only, because only these

                                 // are valid Unicode codepoints.  Thus, no more than 4 bytes are ever necessary.

                                 bool err2 = (nMoreBytes > 3 || (nMoreBytes == 3 && cOut > 0x10ffff));

                                 if (err1 || err2) switch (errorHandling) {

                                         case uehThrow:

                                                 if (err1) throw TUnicodeException(charSrcIdx, c, "The codepoint 0x" + TInt::GetStr(cOut, "%08x") + " has been represented by too many bytes (" + TInt::GetStr(nMoreBytes + 1) + ").");

                                                 else if (err2) throw TUnicodeException(charSrcIdx, c, "Invalid multibyte sequence: it decodes into 0x" + TInt::GetStr(cOut, "%08x") + ", but only codepoints 0..0x10ffff are valid.");

                                                 else { Fail; break; }

                                         case uehAbort: return nDecoded;

                                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;

                                         case uehIgnore: continue;

                                         default: Fail; } }

                         // Add the decoded codepoint to the destination vector.

                         // If this is the first decoded character, and it's one of the byte-order marks

                         // (0xfffe and 0xfeff), we will skip it (unless skipBom is false).

                         if (! (skipBom && (cOut == 0xfffe || cOut == 0xfeff) && charSrcIdx == origSrcIdx)) {

                                 dest.Add(cOut); nDecoded++; }

                 } // else (multi-byte sequence)

         } // while

         return nDecoded;

 }


 //-----------------------------------------------------------------------

 // TUniCodec -- UTF-8 Encoder

 //-----------------------------------------------------------------------


 // Returns the number of characters that have been successfully encoded.

 // This does not include any replacement characters that may have been inserted into 'dest'.

 template<typename TSrcVec, typename TDestCh>

 size_t TUniCodec::EncodeUtf8(

         const TSrcVec& src, size_t srcIdx, const size_t srcCount,

         TVec<TDestCh>& dest, const bool clrDest) const

 {

         size_t nEncoded = 0;

         for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)

         {

                 uint c = uint(src[TVecIdx(srcIdx)]);

                 bool err = false;

                 if (strict && c > 0x10ffff) {

                         err = true;

                         switch (errorHandling) {

                         case uehThrow: throw TUnicodeException(srcIdx, c, "Invalid character (0x" + TInt::GetStr(c, "%x") + "; only characters in the range 0..0x10ffff are allowed).");

                         case uehAbort: return nEncoded;

                         case uehReplace: c = replacementChar; break;

                         case uehIgnore: continue;

                         default: Fail; } }

                 if (c < 0x80u)

                         dest.Add(TDestCh(c & 0xffu));

                 else if (c < 0x800u) {

                         dest.Add(TDestCh(_1100_0000 | ((c >> 6) & _0001_1111)));

                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }

                 else if (c < 0x10000u) {

                         dest.Add(TDestCh(_1110_0000 | ((c >> 12) & _0000_1111)));

                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));

                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }

                 else if (c < 0x200000u) {

                         dest.Add(TDestCh(_1111_0000 | ((c >> 18) & _0000_0111)));

                         dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));

                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));

                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }

                 else if (c < 0x4000000u) {

                         dest.Add(TDestCh(_1111_1000 | ((c >> 24) & _0000_0011)));

                         dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));

                         dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));

                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));

                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }

                 else {

                         dest.Add(TDestCh(_1111_1100 | ((c >> 30) & _0000_0011)));

                         dest.Add(TDestCh(_1000_0000 | ((c >> 24) & _0011_1111)));

                         dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));

                         dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));

                         dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));

                         dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }

                 if (! err) nEncoded++;

         }

         return nEncoded;

 }


 //-----------------------------------------------------------------------

 // TUniCodec -- UTF-16 Encoder

 //-----------------------------------------------------------------------


 // Returns the number of characters that have been successfully decoded.

 // This does not include any replacement characters that may have been inserted into 'dest'.

 // Each element of 'src' is assumed to contain one byte of data.

 // srcCount must be even (though srcIdx doesn't need to be).

 template<typename TSrcVec, typename TDestCh>

 size_t TUniCodec::DecodeUtf16FromBytes(

         const TSrcVec& src, size_t srcIdx, const size_t srcCount,

         TVec<TDestCh>& dest, const bool clrDest,

         const TUtf16BomHandling bomHandling,

         const TUniByteOrder defaultByteOrder) const

 {

         IAssert(srcCount % 2 == 0);

         IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);

         IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);

         if (clrDest) dest.Clr();

         size_t nDecoded = 0;

         if (srcCount <= 0) return nDecoded;

         const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;

         bool littleEndian = false;

         bool leDefault = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && IsMachineLittleEndian()));

         if (bomHandling == bomIgnored) littleEndian = leDefault;

         else if (bomHandling == bomAllowed || bomHandling == bomRequired)

         {

                 int byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff;

                 if (byte1 == 0xfe && byte2 == 0xff) { littleEndian = false; if (skipBom) srcIdx += 2; }

                 else if (byte1 == 0xff && byte2 == 0xfe) { littleEndian = true; if (skipBom) srcIdx += 2; }

                 else if (bomHandling == bomAllowed) littleEndian = leDefault;

                 else { // Report an error.

                         switch (errorHandling) {

                         case uehThrow: throw TUnicodeException(srcIdx, byte1, "BOM expected at the beginning of the input vector (" + TInt::GetStr(byte1, "%02x") + " " + TInt::GetStr(byte2, "%02x") + " found instead).");

                         case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);

                         default: Fail; } }

         }

         else Fail;

         while (srcIdx < srcEnd)

         {

                 const size_t charSrcIdx = srcIdx;

                 uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;

                 uint c = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));

                 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)

                 {

                         // c is the first character in a surrogate pair.  Read the next character.

                         if (! (srcIdx + 2 <= srcEnd)) {

                                 switch (errorHandling) {

                                 case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");

                                 case uehAbort: return nDecoded;

                                 case uehReplace: dest.Add(TDestCh(replacementChar)); continue;

                                 case uehIgnore: continue;

                                 default: Fail; } }

                         uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;

                         uint c2 = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));

                         // c2 should be the second character of the surrogate pair.

                         if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {

                                 switch (errorHandling) {

                                 case uehThrow: throw TUnicodeException(charSrcIdx + 2, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + ".");

                                 case uehAbort: return nDecoded;

                                 // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character

                                 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 2; continue;

                                 case uehIgnore: srcIdx -= 2; continue;

                                 default: Fail; } }

                         // c and c2 each contain 10 bits of information.

                         uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);

                         cc += 0x10000;

                         dest.Add(TDestCh(cc)); nDecoded++; continue;

                 }

                 else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {

                         switch (errorHandling) {

                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");

                         case uehAbort: return nDecoded;

                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;

                         case uehIgnore: continue;

                         default: Fail; } }

                 // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.

                 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;

                 // Otherwise, store 'c' to the destination vector.

                 dest.Add(TDestCh(c)); nDecoded++;

         }

         return nDecoded;

 }


 // Here, each element of 'src' is treated as a 16-bit word.  The byte-order settings

 // are used to determine if the two bytes of each word should be swapped before further

 // processing.  For example, if a BOM is present, it must have the value 0xfeff; if it

 // actually has the value 0xfffe, this means that the two bytes of each word must be swapped.

 // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the

 // beginning of the source data is used to determine the "original" byte order of the data;

 // if this doesn't match the byte order of the local machine, the two bytes of each word will

 // be swapped during the decoding process.

 template<typename TSrcVec, typename TDestCh>

 size_t TUniCodec::DecodeUtf16FromWords(

         const TSrcVec& src, size_t srcIdx, const size_t srcCount,

         TVec<TDestCh>& dest, bool clrDest,

         const TUtf16BomHandling bomHandling,

         const TUniByteOrder defaultByteOrder) const

 {

         IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);

         IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);

         if (clrDest) dest.Clr();

         size_t nDecoded = 0;

         if (srcCount <= 0) return nDecoded;

         const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;

         bool swap = false;

         bool isMachineLe = IsMachineLittleEndian();

         bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe));

         if (bomHandling == bomIgnored) swap = (isDefaultLe != isMachineLe);

         else if (bomHandling == bomAllowed || bomHandling == bomRequired)

         {

                 int c = uint(src[TVecIdx(srcIdx)]) & 0xffff;

                 if (c == 0xfeff) { swap = false; if (skipBom) srcIdx += 1; }

                 else if (c == 0xfffe) { swap = true; if (skipBom) srcIdx += 1; }

                 else if (bomHandling == bomAllowed) swap = (isMachineLe != isDefaultLe);

                 else { // Report an error.

                         switch (errorHandling) {

                         case uehThrow: throw TUnicodeException(srcIdx, c, "BOM expected at the beginning of the input vector (" + TInt::GetStr(c, "%04x") + " found instead).");

                         case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);

                         default: Fail; } }

         }

         else Fail;

         while (srcIdx < srcEnd)

         {

                 const size_t charSrcIdx = srcIdx;

                 uint c = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;

                 if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);

                 if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)

                 {

                         // c is the first character in a surrogate pair.  Read the next character.

                         if (! (srcIdx < srcEnd)) {

                                 switch (errorHandling) {

                                 case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");

                                 case uehAbort: return nDecoded;

                                 case uehReplace: dest.Add(TDestCh(replacementChar)); continue;

                                 case uehIgnore: continue;

                                 default: Fail; } }

                         uint c2 = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;

                         if (swap) c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8);

                         // c2 should be the second character of the surrogate pair.

                         if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {

                                 switch (errorHandling) {

                                 case uehThrow: throw TUnicodeException(charSrcIdx + 1, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + ".");

                                 case uehAbort: return nDecoded;

                                 // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character

                                 case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 1; continue;

                                 case uehIgnore: srcIdx -= 1; continue;

                                 default: Fail; } }

                         // c and c2 each contain 10 bits of information.

                         uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);

                         cc += 0x10000;

                         dest.Add(TDestCh(cc)); nDecoded++; continue;

                 }

                 else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {

                         switch (errorHandling) {

                         case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");

                         case uehAbort: return nDecoded;

                         case uehReplace: dest.Add(TDestCh(replacementChar)); continue;

                         case uehIgnore: continue;

                         default: Fail; } }

                 // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.

                 if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;

                 // Otherwise, store 'c' to the destination vector.

                 dest.Add(TDestCh(c)); nDecoded++;

         }

         return nDecoded;

 }


 //-----------------------------------------------------------------------

 // TUniCodec -- UTF-16 Encoder

 //-----------------------------------------------------------------------


 // Returns the number of characters that have been successfully encoded.

 // This does not include any replacement characters that may have been inserted into 'dest'.

 template<typename TSrcVec, typename TDestCh>

 size_t TUniCodec::EncodeUtf16ToWords(

         const TSrcVec& src, size_t srcIdx, const size_t srcCount,

         TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,

         const TUniByteOrder destByteOrder) const

 {

         bool isMachineLe = IsMachineLittleEndian();

         bool swap = (destByteOrder == boLittleEndian && ! isMachineLe) || (destByteOrder == boBigEndian && isMachineLe);

         size_t nEncoded = 0, srcEnd = srcIdx + srcCount;

         if (insertBom) { dest.Add(TDestCh(swap ? 0xfffeu : 0xfeffu)); nEncoded++; }

         while (srcIdx < srcEnd)

         {

                 uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;

                 if (! (c <= 0x10ffffu)) {

                         switch (errorHandling) {

                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");

                         case uehAbort: return nEncoded;

                         case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;

                         case uehIgnore: continue;

                         default: Fail; } }

                 if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) {

                         switch (errorHandling) {

                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");

                         case uehAbort: return nEncoded;

                         case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;

                         case uehIgnore: continue;

                         default: Fail; } }

                 if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) {

                         switch (errorHandling) {

                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");

                         case uehAbort: return nEncoded;

                         case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;

                         case uehIgnore: continue;

                         default: Fail; } }

                 // If c is <= 0xffff, it can be stored directly.

                 if (c <= 0xffffu) {

                         if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);

                         dest.Add(TDestCh(c)); nEncoded++; continue; }

                 // Otherwise, represent c by a pair of surrogate characters.

                 c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);

                 uint c1 = (c >> 10) & 1023, c2 = c & 1023;

                 c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate;

                 if (swap) {

                         c1 = ((c1 >> 8) & 0xff) | ((c1 & 0xff) << 8);

                         c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); }

                 dest.Add(TDestCh(c1));

                 dest.Add(TDestCh(c2));

                 nEncoded++; continue;

         }

         return nEncoded;

 }


 template<typename TSrcVec, typename TDestCh>

 size_t TUniCodec::EncodeUtf16ToBytes(

         const TSrcVec& src, size_t srcIdx, const size_t srcCount,

         TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,

         const TUniByteOrder destByteOrder) const

 {

         bool isDestLe = (destByteOrder == boLittleEndian || (destByteOrder == boMachineEndian && IsMachineLittleEndian()));

         size_t nEncoded = 0, srcEnd = srcIdx + srcCount;

         if (insertBom) { dest.Add(isDestLe ? 0xff : 0xfe); dest.Add(isDestLe ? 0xfe : 0xff); nEncoded++; }

         while (srcIdx < srcEnd)

         {

                 uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;

                 if (! (c <= 0x10ffffu)) {

                         switch (errorHandling) {

                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");

                         case uehAbort: return nEncoded;

 #define ___OutRepl if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); }

                         case uehReplace: ___OutRepl; continue;

                         case uehIgnore: continue;

                         default: Fail; } }

                 if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) {

                         switch (errorHandling) {

                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");

                         case uehAbort: return nEncoded;

                         case uehReplace: ___OutRepl; continue;

                         case uehIgnore: continue;

                         default: Fail; } }

                 if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) {

                         switch (errorHandling) {

                         case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");

                         case uehAbort: return nEncoded;

                         case uehReplace: ___OutRepl; continue;

                         case uehIgnore: continue;

                         default: Fail; } }

 #undef ___OutRepl

                 // If c is <= 0xffff, it can be stored directly.

                 if (c <= 0xffffu) {

                         if (isDestLe) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); }

                         else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); }

                         nEncoded++; continue; }

                 // Otherwise, represent c by a pair of surrogate characters.

                 c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);

                 uint c1 = (c >> 10) & 1023, c2 = c & 1023;

                 c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate;

                 if (isDestLe) { dest.Add(c1 & 0xff); dest.Add((c1 >> 8) & 0xff); dest.Add(c2 & 0xff); dest.Add((c2 >> 8) & 0xff); }

                 else { dest.Add((c1 >> 8) & 0xff); dest.Add(c1 & 0xff); dest.Add((c2 >> 8) & 0xff); dest.Add(c2 & 0xff); }

                 nEncoded++; continue;

         }

         return nEncoded;

 }


 //-----------------------------------------------------------------------------

 // TUniChDb -- word boundaries

 //-----------------------------------------------------------------------------


 template<typename TSrcVec>

 bool TUniChDb::FindNextWordBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const

 {

         // WB1.  Break at the start of text.

         if (position < srcIdx) { position = srcIdx; return true; }

         // If we are beyond the end of the text, there aren't any word breaks left.

         const size_t srcEnd = srcIdx + srcCount;

         if (position >= srcEnd) return false;

         // If 'position' is currently at an ignored character, move it back to the last nonignored character.

         size_t origPos = position;

         if (IsWbIgnored(src[TVecIdx(position)])) {

                 if (! WbFindPrevNonIgnored(src, srcIdx, position))

                         position = origPos;

         }

         // Determine the previous nonignored character (before 'position').

         size_t posPrev = position;

         if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position;

         // Sec 6.2.  Allow a break between Sep and an ignored character.

         if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; }

         // Determine the next nonignored character (after 'position').

         size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd);

         size_t posNext2;

         int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1);

         int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1);

         int wbfPrev = GetWbFlags(cPrev), wbfCur = GetWbFlags(cCur), wbfNext = GetWbFlags(cNext);

         int cNext2, wbfNext2;

         //

         for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2,

                                                            cPrev = cCur, cCur = cNext, cNext = cNext2,

                                                            wbfPrev = wbfCur, wbfCur = wbfNext, wbfNext = wbfNext2)

         {

                 // Should there be a word boundary between 'position' and 'posNext' (or, more accurately,

                 // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position'

                 // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)?

                 posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd);

                 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1);

                 wbfNext2 = GetWbFlags(cNext2);

 #define TestCurNext(curFlag, nextFlag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue

 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue

 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue

                 // WB3.  Do not break within CRLF.

                 if (cCur == 13 && cNext == 10) continue;

                 // WB5.  Do not break between most letters.

                 TestCurNext(ucfWbALetter, ucfWbALetter);

                 // WB6.  Do not break letters across certain punctuation.

                 TestCurNext2(ucfWbALetter, ucfWbMidLetter, ucfWbALetter);

                 // WB7.  Do not break letters across certain punctuation.

                 TestPrevCurNext(ucfWbALetter, ucfWbMidLetter, ucfWbALetter);

                 // WB8.  Do not break within sequences of digits, or digits adjacent to letters.

                 TestCurNext(ucfWbNumeric, ucfWbNumeric);

                 // WB9.  Do not break within sequences of digits, or digits adjacent to letters.

                 TestCurNext(ucfWbALetter, ucfWbNumeric);

                 // WB10.  Do not break within sequences of digits, or digits adjacent to letters.

                 TestCurNext(ucfWbNumeric, ucfWbALetter);

                 // WB11.  Do not break within sequences, such as "3.2" or "3.456,789".

                 TestPrevCurNext(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric);

                 // WB12.  Do not break within sequences, such as "3.2" or "3.456,789".

                 TestCurNext2(ucfWbNumeric, ucfWbMidNum, ucfWbNumeric);

                 // WB13.  Do not break between Katakana.

                 TestCurNext(ucfWbKatakana, ucfWbKatakana);

                 // WB13a.  Do not break from extenders.

                 if ((wbfCur & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana | ucfWbExtendNumLet)) != 0 &&

                         (wbfNext & ucfWbExtendNumLet) == ucfWbExtendNumLet) continue;

                 // WB13b.  Do not break from extenders.

                 if ((wbfCur & ucfWbExtendNumLet) == ucfWbExtendNumLet &&

                         (wbfNext & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana)) != 0) continue;

                 // WB14.  Otherwise, break everywhere.

                 position = posNext; return true;

 #undef TestCurNext

 #undef TestCurNext2

 #undef TestPrevCurNext

         }

         // WB2.  Break at the end of text.

         IAssert(position == srcEnd);

         return true;

 }


 // ToDo: provide a more efficient implementation of this.

 template<typename TSrcVec>

 void TUniChDb::FindWordBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const

 {

         if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1));

         dest.PutAll(false);

         size_t position = srcIdx;

         dest[TVecIdx(position - srcIdx)] = true;

         while (position < srcIdx + srcCount)

         {

                 size_t oldPos = position;

                 FindNextWordBoundary(src, srcIdx, srcCount, position);

     if (oldPos >= position) {

                   Assert(oldPos < position);

     }

     Assert(position <= srcIdx + srcCount);

                 dest[TVecIdx(position - srcIdx)] = true;

         }

         Assert(dest[TVecIdx(srcCount)]);

 }


 //-----------------------------------------------------------------------------

 // TUniChDb -- sentence boundaries

 //-----------------------------------------------------------------------------


 template<typename TSrcVec>

 bool TUniChDb::CanSentenceEndHere(const TSrcVec& src, const size_t srcIdx, const size_t position) const

 {

         if (sbExTrie.Empty()) return true;

         // We'll move back from the position where a sentence-boundary is being considered.

         size_t pos = position;

         if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;

         int c = (int) src[TVecIdx(pos)]; int sfb = GetSbFlags(c);

         // - Skip the Sep, if there is one.

         if ((c & ucfSbSep) == ucfSbSep) {

                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;

                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }

         // - Skip any Sp characters.

         while ((sfb & ucfSbSp) == ucfSbSp) {

                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;

                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }

         // - Skip any Close characters.

         while ((sfb & ucfSbSp) == ucfSbSp) {

                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;

                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }

         // - Skip any ATerm | STerm characters.

         while ((sfb & (ucfSbATerm | ucfSbSTerm)) != 0) {

                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;

                 c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }

         // Now start moving through the trie.

         int cLast = c, cButLast = -1, cButButLast = -1, len = 1, node = -1;

         while (true)

         {

                 bool atEnd = (! WbFindPrevNonIgnored(src, srcIdx, pos));

                 c = (atEnd ? -1 : (int) src[TVecIdx(pos)]);

                 TUniChCategory cat = GetCat(c);

                 if (atEnd || ! (cat == ucLetter || cat == ucNumber || cat == ucSymbol)) {

                         // Check if the suffix we've read so far is one of those that appear in the trie.

                         if (len == 1) return ! sbExTrie.Has1Gram(cLast);

                         if (len == 2) return ! sbExTrie.Has2Gram(cLast, cButLast);

                         IAssert(len >= 3); IAssert(node >= 0);

                         if (sbExTrie.IsNodeTerminal(node)) return false;

                         if (atEnd) return true; }

                 if (len == 1) { cButLast = c; len++; }

                 else if (len == 2) { cButButLast = c; len++;

                         // Now we have read the last three characters; start descending the suitable subtrie.

                         node = sbExTrie.Get3GramRoot(cLast, cButLast, cButButLast);

                         if (node < 0) return true; }

                 else {

                         // Descend down the trie.

                         node = sbExTrie.GetChild(node, c);

                         if (node < 0) return true; }

         }

         //return true;

 }


 template<typename TSrcVec>

 bool TUniChDb::FindNextSentenceBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const

 {

         // SB1.  Break at the start of text.

         if (position < srcIdx) { position = srcIdx; return true; }

         // If we are beyond the end of the text, there aren't any word breaks left.

         const size_t srcEnd = srcIdx + srcCount;

         if (position >= srcEnd) return false;

         // If 'position' is currently at an ignored character, move it back to the last nonignored character.

         size_t origPos = position;

         if (IsWbIgnored(src[TVecIdx(position)])) {

                 if (! WbFindPrevNonIgnored(src, srcIdx, position))

                         position = origPos;

         }

         // Determine the previous nonignored character (before 'position').

         size_t posPrev = position;

         if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position;

         // Sec 6.2.  Allow a break between Sep and an ignored character.

         if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; }

         // Determine the next nonignored character (after 'position').

         size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd);

         size_t posNext2;

         int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1);

         int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1);

         int sbfPrev = GetSbFlags(cPrev), sbfCur = GetSbFlags(cCur), sbfNext = GetSbFlags(cNext);

         int cNext2, sbfNext2;

         // Initialize the state of the peek-back automaton.

         typedef enum { stInit, stATerm, stATermSp, stATermSep, stSTerm, stSTermSp, stSTermSep } TPeekBackState;

         TPeekBackState backState;

         {

                 size_t pos = position;

                 bool wasSep = false, wasSp = false, wasATerm = false, wasSTerm = false;

                 while (true)

                 {

                         if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break;

                         // Skip at most one Sep.

                         int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp);

                         if ((sbf & ucfSbSep) == ucfSbSep) {

                                 wasSep = true;

                                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break;

                                 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }

                         // Skip zero or more Sp's.

                         bool stop = false;

                         while ((sbf & ucfSbSp) == ucfSbSp) {

                                 wasSp = true;

                                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; }

                                 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }

                         if (stop) break;

                         // Skip zero or more Close's.

                         while ((sbf & ucfSbClose) == ucfSbClose) {

                                 if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; }

                                 cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }

                         if (stop) break;

                         // Process an ATerm or STerm.

                         wasATerm = ((sbf & ucfSbATerm) == ucfSbATerm);

                         wasSTerm = ((sbf & ucfSbSTerm) == ucfSbSTerm);

                         break;

                 }

                 if (wasATerm) backState = (wasSep ? stATermSep : wasSp ? stATermSp : stATerm);

                 else if (wasSTerm) backState = (wasSep ? stSTermSep : wasSp ? stSTermSp : stSTerm);

                 else backState = stInit;

         }

         // Initialize the state of the peek-ahead automaton.  This state tells us what follows

         // after we skip all contiguous characters from the complement of the set {OLetter, Upper, Lower, Sep, STerm, ATerm}.

         // Thus, the next character is either OLetter, Upper, Lower, Sep, STerm, ATerm, or the end of the input string.

         // Our peek-ahead automaton must tell us whether it is Lower or something else.

         typedef enum { stUnknown, stLower, stNotLower } TPeekAheadState;

         TPeekAheadState aheadState = stUnknown;

         //

         for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2,

                                                            cPrev = cCur, cCur = cNext, cNext = cNext2,

                                                            sbfPrev = sbfCur, sbfCur = sbfNext, sbfNext = sbfNext2)

         {

                 // Should there be a word boundary between 'position' and 'posNext' (or, more accurately,

                 // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position'

                 // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)?

                 posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd);

                 cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1);

                 sbfNext2 = GetSbFlags(cNext2);

                 // Update the peek-back automaton.

 #define TestCur(curFlag) ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag)

 #define Trans(curFlag, newState) if (TestCur(curFlag)) { backState = st##newState; break; }

                 switch (backState) {

                         case stInit: Trans(ATerm, ATerm); Trans(STerm, STerm); break;

                         case stATerm: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, ATerm); backState = stInit; break;

                         case stSTerm: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, STerm); backState = stInit; break;

                         case stATermSp: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;

                         case stSTermSp: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;

                         case stATermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;

                         case stSTermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;

                         default: IAssert(false); }

 #undef Trans

 #undef TestCur

                 // Update the peek-ahead automaton.

 #define IsPeekAheadSkippable(sbf) ((sbf & (ucfSbOLetter | ucfSbUpper | ucfSbLower | ucfSbSep | ucfSbSTerm | ucfSbATerm)) == 0)

                 if (! IsPeekAheadSkippable(sbfCur)) {

                         bool isLower = ((sbfCur & ucfSbLower) == ucfSbLower);

                         if (aheadState == stLower) IAssert(isLower);

                         else if (aheadState == stNotLower) IAssert(! isLower);

                         // We haven't peaked ahead farther than this so far -- invalidate the state.

                         aheadState = stUnknown; }

                 if (aheadState == stUnknown)

                 {

                         // Peak ahead to the next non-peekahead-skippable character.

                         size_t pos = posNext;

                         while (pos < srcEnd) {

                                 int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp);

                                 if (! IsPeekAheadSkippable(sbf)) {

                                         if ((sbf & ucfSbLower) == ucfSbLower) aheadState = stLower;

                                         else aheadState = stNotLower;

                                         break; }

                                 WbFindNextNonIgnored(src, pos, srcEnd); }

                         if (! (pos < srcEnd)) aheadState = stNotLower;

                 }

 #undef IsPeekAheadSkippable

                 //

 #define TestCurNext(curFlag, nextFlag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue

 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue

 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue

                 // SB3.  Do not break within CRLF.

                 if (cCur == 13 && cNext == 10) continue;

                 // SB4.  Break ater paragraph separators.

                 if ((sbfCur & ucfSbSep) == ucfSbSep) {

                         if (! CanSentenceEndHere(src, srcIdx, position)) continue;

                         position = posNext; return true; }

                 // Do not break after ambiguous terminators like period, if they are immediately followed by a number

                 // or lowercase letter, if they are between uppercase letters, or if the first following letter

                 // (optionally after certain punctuation) is lowercase.  For example, a period may be an abbreviation

                 // or numeric period, and thus may not mark the end of a sentence.

                 TestCurNext(ucfSbATerm, ucfSbNumeric); // SB6

                 TestPrevCurNext(ucfSbUpper, ucfSbATerm, ucfSbUpper); // SB7

                 // SB8a.  (STerm | ATerm) Close* Sp* [do not break] (STerm | ATerm)

                 if ((backState == stATerm || backState == stATermSp || backState == stSTerm || backState == stSTermSp) &&

                         (sbfNext & (ucfSbSTerm | ucfSbATerm)) != 0) continue;

                 // SB8*.  ATerm Close* Sp* [do not break] ( ! (OLetter | Upper | Lower | Sep | STerm | ATerm) )* Lower

                 if ((backState == stATerm || backState == stATermSp) && aheadState == stLower) continue;

                 // Break after sentence terminators, but include closing punctuation, trailing spaces, and a paragraph separator (if present).

                 // SB9. ( STerm | ATerm ) Close* [do not break] ( Close | Sp | Sep )

                 if ((backState == stATerm || backState == stSTerm) && (sbfNext & (ucfSbClose | ucfSbSp | ucfSbSep)) != 0) continue;

                 // SB10. ( STerm | ATerm ) Close* Sp* [do not break] ( Sp | Sep )

                 // SB11*. ( STerm | ATerm ) Close* Sp* Sep? [do break]

                 if (backState == stATerm || backState == stATermSp || backState == stATermSep || backState == stSTerm || backState == stSTermSp || backState == stSTermSep) {

                         if ((sbfNext & (ucfSbSp | ucfSbSep)) != 0) continue; // SB10

                         if (! CanSentenceEndHere(src, srcIdx, position)) continue;

                         position = posNext; return true; } // SB11

                 // WB12.  Otherwise, do not break.

                 continue;

 #undef TestCurNext

 #undef TestCurNext2

 #undef TestPrevCurNext

         }

         // WB2.  Break at the end of text.

         IAssert(position == srcEnd);

         return true;

 }


 // ToDo: provide a more efficient implementation of this.

 template<typename TSrcVec>

 void TUniChDb::FindSentenceBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const

 {

         if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1));

         dest.PutAll(false);

         size_t position = srcIdx;

         dest[TVecIdx(position - srcIdx)] = true;

         while (position < srcIdx + srcCount)

         {

                 size_t oldPos = position;

                 FindNextSentenceBoundary(src, srcIdx, srcCount, position);

     if (oldPos >= position) {

                   Assert(oldPos < position);

     }

     Assert(position <= srcIdx + srcCount);

                 dest[TVecIdx(position - srcIdx)] = true;

         }

         Assert(dest[TVecIdx(srcCount)]);

 }


 //-----------------------------------------------------------------------------

 // TUniChDb -- case conversions

 //-----------------------------------------------------------------------------


 template<typename TSrcVec, typename TDestCh>

 void TUniChDb::GetCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount,

                                                                 TVec<TDestCh>& dest, const bool clrDest,

                                                                 const TUniChDb::TCaseConversion how,

                                                                 const bool turkic, const bool lithuanian) const

 {

         const TIntIntVH &specials = (how == ccUpper ? specialCasingUpper : how == ccLower ? specialCasingLower : how == ccTitle ? specialCasingTitle : *((TIntIntVH *) 0));

         if (clrDest) dest.Clr();

         enum {

                 GreekCapitalLetterSigma = 0x3a3,

                 GreekSmallLetterSigma = 0x3c3,

                 GreekSmallLetterFinalSigma = 0x3c2,

                 LatinCapitalLetterI = 0x49,

                 LatinCapitalLetterJ = 0x4a,

                 LatinCapitalLetterIWithOgonek = 0x12e,

                 LatinCapitalLetterIWithGrave = 0xcc,

                 LatinCapitalLetterIWithAcute = 0xcd,

                 LatinCapitalLetterIWithTilde = 0x128,

                 LatinCapitalLetterIWithDotAbove = 0x130,

                 LatinSmallLetterI = 0x69,

                 CombiningDotAbove = 0x307

         };

         //

         bool seenCased = false, seenTwoCased = false; int cpFirstCased = -1;

         size_t nextWordBoundary = srcIdx;

         TBoolV wordBoundaries; bool wbsKnown = false;

         for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )

         {

                 int cp = src[TVecIdx(srcIdx)]; srcIdx++;

                 //if (turkic && cp == 0x130 && how == ccLower) printf("!");

                 // For conversion to titlecase, the first cased character of each word

                 // must be converted to titlecase; everything else must be converted

                 // to lowercase.

                 TUniChDb::TCaseConversion howHere;

                 if (how != ccTitle) howHere = how;

                 else {

                         if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here.

                                 seenCased = false; seenTwoCased = false; cpFirstCased = -1;

                                 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);

                                 IAssert(next > nextWordBoundary); nextWordBoundary = next; }

                         bool isCased = IsCased(cp);

                         if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; cpFirstCased = cp; }

                         else { howHere = ccLower;

                                 if (isCased && seenCased) seenTwoCased = true; }

                 }

                 // First, process the conditional mappings from SpecialCasing.txt.

                 // These will be processed in code -- they were ignored while

                 // we were reading SpecialCasing.txt itself.

                 if (cp == GreekCapitalLetterSigma && howHere == ccLower)

                 {

                         // SpecialCasing.txt mentions the 'FinalSigma' condition, but sec. 3.13 of

                         // the standard doesn't define it.  We'll use FinalCased instead.

                         // FinalCased: within the closest word boundaries containing C,

                         // there is a cased letter before C, and there is no cased letter after C.

                         //size_t nextBoundary = srcIdx - 1; FindNextWordBoundary(src, srcIdx, srcCount, nextBoundary);

                         if (! wbsKnown) { FindWordBoundaries(src, origSrcIdx, srcCount, wordBoundaries); wbsKnown = true; }

                         size_t srcIdx2 = srcIdx; bool casedAfter = false;

                         if (how == ccTitle)

                                 printf("!");

                         //while (srcIdx2 < nextBoundary)

                         while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)])

                         {

                                 int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;

                                 if (IsCased(cp2)) { casedAfter = true; break; }

                         }

                         if (! casedAfter)

                         {

                                 //size_t prevBoundary = srcIdx - 1;

                                 //FindPreviousWordBoundary(src, srcIdx, srcCount, prevBoundary);

                                 srcIdx2 = srcIdx - 1; bool casedBefore = false;

                                 //while (prevBoundary < srcIdx2)

                                 while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)])

                                 {

                                         --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];

                                         if (IsCased(cp2)) { casedBefore = true; break; }

                                 }

                                 if (casedBefore) {

                                         // Now we have a FinalCased character.

                                         dest.Add(GreekSmallLetterFinalSigma); Assert(howHere == ccLower); continue; }

                         }

                         // If we got here, add a non-final sigma.

                         dest.Add(GreekSmallLetterSigma); continue;

                 }

                 else if (lithuanian)

                 {

                         if (howHere == ccLower)

                         {

                                 if (cp == LatinCapitalLetterI || cp == LatinCapitalLetterJ || cp == LatinCapitalLetterIWithOgonek)

                                 {

                                         bool moreAbove = false;

                                         for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; )

                                         {

                                                 const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;

                                                 const int cc2 = GetCombiningClass(cp2);

                                                 if (cc2 == TUniChInfo::ccStarter) break;

                                                 if (cc2 == TUniChInfo::ccAbove) { moreAbove = true; break; }

                                         }

                                         if (moreAbove)

                                         {

                                                 if (cp == LatinCapitalLetterI) { dest.Add(0x69); dest.Add(0x307); continue; }

                                                 if (cp == LatinCapitalLetterJ) { dest.Add(0x6a); dest.Add(0x307); continue; }

                                                 if (cp == LatinCapitalLetterIWithOgonek) { dest.Add(0x12f); dest.Add(0x307); continue; }

                                         }

                                 }

                                 else if (cp == LatinCapitalLetterIWithGrave) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x300); continue; }

                                 else if (cp == LatinCapitalLetterIWithAcute) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x301); continue; }

                                 else if (cp == LatinCapitalLetterIWithTilde) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x303); continue; }

                         }

                         if (cp == CombiningDotAbove)

                         {

                                 // Lithuanian, howHere != ccLower.

                                 // AfterSoftDotted := the last preceding character with a combining class

                                 // of zero before C was Soft_Dotted, and there is no intervening combining

                                 // character class 230 (ABOVE).

                                 bool afterSoftDotted = false;

                                 size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp'

                                 while (origSrcIdx < srcIdx2)

                                 {

                                         --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];

                                         int cc2 = GetCombiningClass(cp2);

                                         if (cc2 == TUniChInfo::ccAbove) break;

                                         if (cc2 == TUniChInfo::ccStarter) {

                                                 afterSoftDotted = IsSoftDotted(cp2); break; }

                                 }

                                 if (afterSoftDotted)

                                 {

                                         Assert(lithuanian);

                                         // Remove DOT ABOVE after "i" with upper or titlecase.

                                         // - Note: but this must only be done if that "i" was actually placed into uppercase (if how == ccTitle,

                                         //   the "i" may have been kept lowercase and thus we shouldn't remove the dot).

                                         if (how == ccLower) { dest.Add(0x307); continue; }

                                         if (how == ccUpper) continue;

                                         Assert(how == ccTitle);

                                         Assert(howHere == ccLower); // because CombiningDotAbove is not a cased character

                                         if (seenCased && ! seenTwoCased) continue; // The "i" has been placed into uppercase; thus, remove the dot.

                                         dest.Add(0x307); continue;

                                 }

                         }

                 }

                 else if (turkic) // language code 'tr' (Turkish) and 'az' (Azeri)

                 {

                         // I and i-dotless; I-dot and i are case pairs in Turkish and Azeri

                         // The following rules handle those cases.

                         if (cp == LatinCapitalLetterIWithDotAbove) {

                                 dest.Add(howHere == ccLower ? 0x69 : 0x130); continue; }

                         // When lowercasing, remove dot_above in the sequence I + dot_above,

                         // which will turn into i.  This matches the behavior of the

                         // canonically equivalent I-dot_above.

                         else if (cp == CombiningDotAbove)

                         {

                                 // AfterI: the last preceding base character was an uppercase I,

                                 // and there is no intervening combining character class 230 (ABOVE).

                                 bool afterI = false;

                                 size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp'

                                 while (origSrcIdx < srcIdx2)

                                 {

                                         --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];

                                         if (cp2 == LatinCapitalLetterI) { afterI = true; break; }

                                         int cc2 = GetCombiningClass(cp2);

                                         if (cc2 == TUniChInfo::ccAbove || cc2 == TUniChInfo::ccStarter) break;

                                 }

                                 if (afterI) {

                                         if (how == ccTitle && seenCased && ! seenTwoCased) {

                                                 // Sec. 3.13 defines title-casing in an unusual way: find the first cased character in each word;

                                                 // if found, map it to titlecase; otherwise, map all characters in that word to lowercase.

                                                 // This suggests that if a cased character is found, others in that word should be left alone.

                                                 // This seems unusual; we map all other characters to lowercase instead.

                                                 // But this leads to problems with e.g. I followed by dot-above (U+0307): since the dot-above

                                                 // is not the first cased character (it isn't even cased), we attempt to set it to lowercase;

                                                 // but since afterI is also true here, this would mean deleting it.  Thus our titlecased

                                                 // form of "I followed by dot-above" would be just "I", which is clearly wrong.

                                                 // So we treat this as a special case here.

                                                 IAssert(cpFirstCased == LatinCapitalLetterI);

                                                 dest.Add(0x307); continue; }

                                         if (howHere != ccLower) dest.Add(0x307);

                                         continue; }

                         }

                         // When lowercasing, unless an I is before a dot_above,

                         // it turns into a dotless i.

                         else if (cp == LatinCapitalLetterI)

                         {

                                 // BeforeDot: C is followed by U+0307 (combining dot above).

                                 // Any sequence of characters with a combining class that is

                                 // neither 0 nor 230 may intervene between the current character

                                 // and the combining dot above.

                                 bool beforeDot = false;

                                 for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; )

                                 {

                                         const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;

                                         if (cp2 == 0x307) { beforeDot = true; break; }

                                         const int cc2 = GetCombiningClass(cp2);

                                         if (cc2 == TUniChInfo::ccStarter || cc2 == TUniChInfo::ccAbove) break;

                                 }

                                 if (! beforeDot) {

                                         dest.Add(howHere == ccLower ? 0x131 : 0x49); continue; }

                         }

                         // When uppercasing, i turns into a dotted capital I.

                         else if (cp == LatinSmallLetterI)

                         {

                                 dest.Add(howHere == ccLower ? 0x69 : 0x130); continue;

                         }

                 }

                 // Try to use the unconditional mappings.

                 const TIntIntVH &specHere = (

                         howHere == how ? specials :

                         howHere == ccLower ? specialCasingLower :

                         howHere == ccTitle ? specialCasingTitle :

                         howHere == ccUpper ? specialCasingUpper : *((TIntIntVH *) 0));

                 int i = specHere.GetKeyId(cp);

                 if (i >= 0) { TUniCaseFolding::AppendVector(specHere[i], dest); continue; }

                 // Try to use the simple (one-character) mappings.

                 i = h.GetKeyId(cp);

                 if (i >= 0) {

                         const TUniChInfo &ci = h[i];

                         int cpNew = (

                                 howHere == ccLower ? ci.simpleLowerCaseMapping :

                                 howHere == ccUpper ? ci.simpleUpperCaseMapping :

                                                                          ci.simpleTitleCaseMapping);

                         if (cpNew < 0) cpNew = cp;

                         dest.Add(cpNew); continue; }

                 // As a final resort, leave 'cp' unchanged.

                 dest.Add(cp);

         }

 }


 template<typename TSrcVec, typename TDestCh>

 void TUniChDb::GetSimpleCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount,

         TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how) const

 {

         if (clrDest) dest.Clr();

         bool seenCased = false; size_t nextWordBoundary = srcIdx;

         for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )

         {

                 const int cp = src[TVecIdx(srcIdx)]; srcIdx++;

                 int i = h.GetKeyId(cp); if (i < 0) { dest.Add(cp); continue; }

                 const TUniChInfo &ci = h[i];

                 // With titlecasing, the first cased character of each word must be put into titlecase,

                 // all others into lowercase.  This is what the howHere variable is for.

                 TUniChDb::TCaseConversion howHere;

                 if (how != ccTitle) howHere = how;

                 else {

                         if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here.

                                 seenCased = false;

                                 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);

                                 IAssert(next > nextWordBoundary); nextWordBoundary = next; }

                         bool isCased = IsCased(cp);

                         if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; }

                         else howHere = ccLower;

                 }

                 int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping);

                 if (cpNew < 0) cpNew = cp;

                 dest.Add(cpNew);

         }

 }


 template<typename TSrcVec>

 void TUniChDb::ToSimpleCaseConverted(TSrcVec& src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const

 {

         bool seenCased = false; size_t nextWordBoundary = srcIdx;

         for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)

         {

                 const int cp = src[TVecIdx(srcIdx)];

                 int i = h.GetKeyId(cp); if (i < 0) continue;

                 const TUniChInfo &ci = h[i];

                 // With titlecasing, the first cased character of each word must be put into titlecase,

                 // all others into lowercase.  This is what the howHere variable is for.

                 TUniChDb::TCaseConversion howHere;

                 if (how != ccTitle) howHere = how;

                 else {

                         if (srcIdx == nextWordBoundary) { // A word starts/ends here.

                                 seenCased = false;

                                 size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);

                                 IAssert(next > nextWordBoundary); nextWordBoundary = next; }

                         bool isCased = IsCased(cp);

                         if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; }

                         else howHere = ccLower;

                 }

                 int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping);

                 if (cpNew >= 0) src[TVecIdx(srcIdx)] = cpNew;

         }

 }


 //-----------------------------------------------------------------------------

 // TUniChDb -- composition, decomposition, normal forms

 //-----------------------------------------------------------------------------


 template<typename TDestCh>

 void TUniChDb::AddDecomposition(const int codePoint, TVec<TDestCh>& dest, const bool compatibility) const

 {

         if (HangulSBase <= codePoint && codePoint < HangulSBase + HangulSCount)

         {

                 // UAX #15, sec. 16: Hangul decomposition

                 const int SIndex = codePoint - HangulSBase;

                 const int L = HangulLBase + SIndex / HangulNCount;

                 const int V = HangulVBase + (SIndex % HangulNCount) / HangulTCount;

                 const int T = HangulTBase + (SIndex % HangulTCount);

                 dest.Add(L); dest.Add(V);

                 if (T != HangulTBase) dest.Add(T);

                 return;

         }

         int i = h.GetKeyId(codePoint); if (i < 0) { dest.Add(codePoint); return; }

         const TUniChInfo &ci = h[i];

         int ofs = ci.decompOffset; if (ofs < 0) { dest.Add(codePoint); return; }

         if ((! compatibility) && ci.IsCompatibilityDecomposition()) { dest.Add(codePoint); return; }

         while (true) {

                 int cp = decompositions[ofs++]; if (cp < 0) return;

                 AddDecomposition(cp, dest, compatibility); }

 }


 template<typename TSrcVec, typename TDestCh>

 void TUniChDb::Decompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,

                 TVec<TDestCh>& dest, const bool compatibility, bool clrDest) const

 {

         if (clrDest) dest.Clr();

         const size_t destStart = dest.Len()/*, srcEnd = srcIdx + srcCount*/;

         // Decompose the string.

         while (srcIdx < srcCount) {

                 AddDecomposition(src[TVecIdx(srcIdx)], dest, compatibility); srcIdx++; }

         // Rearrange the decomposed string into canonical order.

         for (size_t destIdx = destStart, destEnd = dest.Len(); destIdx < destEnd; )

         {

                 size_t j = destIdx;

                 int cp = dest[TVecIdx(destIdx)]; destIdx++;

                 int cpCls = GetCombiningClass(cp);

                 if (cpCls == TUniChInfo::ccStarter) continue;

                 while (destStart < j && GetCombiningClass(dest[TVecIdx(j - 1)]) > cpCls) {

                         dest[TVecIdx(j)] = dest[TVecIdx(j - 1)]; j--; }

                 dest[TVecIdx(j)] = cp;

         }

 }


 template<typename TSrcVec, typename TDestCh>

 void TUniChDb::DecomposeAndCompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,

                 TVec<TDestCh>& dest, bool compatibility, bool clrDest) const

 {

         if (clrDest) dest.Clr();

         TIntV temp;

         Decompose(src, srcIdx, srcCount, temp, compatibility);

         Compose(temp, 0, temp.Len(), dest, clrDest);

 }


 template<typename TSrcVec, typename TDestCh>

 void TUniChDb::Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,

                 TVec<TDestCh>& dest, bool clrDest) const

 {

         if (clrDest) dest.Clr();

         bool lastStarterKnown = false; // has a starter been encountered yet?

         size_t lastStarterPos = size_t(-1);  // the index (in 'dest') of the last starter

         int cpLastStarter = -1; // the codepoint of the last starter (i.e. cpLastStarter == dest[lastStarterPos])

         const size_t srcEnd = srcIdx + srcCount;

         int ccMax = -1; // The highest combining class among the characters since the last starter.

         while (srcIdx < srcEnd)

         {

                 const int cp = src[TVecIdx(srcIdx)]; srcIdx++;

                 const int cpClass = GetCombiningClass(cp);

                 //int cpCombined = -1;

                 // If there is a starter with which 'cp' can be combined, and from which it is not blocked

                 // by some intermediate character, we can try to combine them.

                 if (lastStarterKnown && ccMax < cpClass)

                 {

                         int j = inverseDec.GetKeyId(TIntPr(cpLastStarter, cp));

                         int cpCombined = -1;

                         do {

                                 // Try to look up a composition in the inverseDec table.

                                 if (j >= 0) { cpCombined = inverseDec[j]; break; }

                                 // UAX #15, sec. 16: Hangul composition

                                 // - Try to combine L and V.

                                 const int LIndex = cpLastStarter - HangulLBase;

                                 if (0 <= LIndex && LIndex < HangulLCount) {

                                         const int VIndex = cp - HangulVBase;

                                         if (0 <= VIndex && VIndex < HangulVCount) {

                                                 cpCombined = HangulSBase + (LIndex * HangulVCount + VIndex) * HangulTCount;

                                                 break; } }

                                 // - Try to combine LV and T.

                                 const int SIndex = cpLastStarter - HangulSBase;

                                 if (0 <= SIndex && SIndex < HangulSCount && (SIndex % HangulTCount) == 0)

                                 {

                                         const int TIndex = cp - HangulTBase;

                                         if (0 <= TIndex && TIndex < HangulTCount) {

                                                 cpCombined = cpLastStarter + TIndex;

                                                 break; }

                                 }

                         } while (false);

                         // If a combining character has been found, use it to replace the old cpStarter.

                         if (cpCombined >= 0) {

                                 dest[TVecIdx(lastStarterPos)] = cpCombined;

                                 Assert(GetCombiningClass(cpCombined) == TUniChInfo::ccStarter);

                                 // if (cpCombined is not a starter) { starterKnown = false; lastStarterPos = size_t(01); cpLastStarter = -1; } else

                                 cpLastStarter = cpCombined; continue; }

                 }

                 if (cpClass == TUniChInfo::ccStarter) { // 'cp' is a starter, remember it for later.  Set ccMax to -1 so that this starter can be combined with another starter.

                         lastStarterKnown = true; lastStarterPos = dest.Len(); cpLastStarter = cp; ccMax = cpClass - 1; }

                 else if (cpClass > ccMax) // Remember cp's class as the new maximum class since the last starter (for blocking).

                         ccMax = cpClass;

                 dest.Add(cp);

         }

 }


 template<typename TSrcVec, typename TDestCh>

 size_t TUniChDb::ExtractStarters(const TSrcVec& src, size_t srcIdx, const size_t srcCount,

                 TVec<TDestCh>& dest, bool clrDest) const

 {

         if (clrDest) dest.Clr();

         size_t retVal = 0;

         for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {

                 const int cp = src[TVecIdx(srcIdx)];

                 if (GetCombiningClass(cp) == TUniChInfo::ccStarter)

                         { dest.Add(cp); retVal++; } }

         return retVal;

 }


 inline bool AlwaysFalse()

 {

         int sum = 0;

         for (int i = 0; i < 5; i++) sum += i;

         return sum > 100;

 }


 inline bool AlwaysTrue()

 {

         int sum = 0;

         for (int i = 0; i < 5; i++) sum += i;

         return sum < 100;

 }


 /*


 Notes on decomposition:


 - In UnicodeData.txt, there is a field with the decomposition mapping.

   This field may also include a tag, <...>.

   If there is a tag, this is a compatibility mapping.

   Otherwise it is a canonical mapping.

 - Canonical decomposition uses only canonical mappings,

   compatibility decomposition uses both canonical and compatibility mappings.

 - Decomposition:

   1. Apply the decomposition mappings (canonical or canonical+compatibility), recursively.

   2. Put the string into canonical order, which means:

      while there exists a pair of characters, A immediately followed by B,

          such that combiningclass(A) > combiningclass(B) > 0  [an "exchangeable pair"]:

            swap A and B;

   This results in NFD (normalized form D, after canonical decomposition)

   or NFKD (normalized form KD, after compatibility decomposition).

 - Canonical composition:

   1. Before composition, the string should have been decomposed

      (using either canonical or compatibility decomposition).

   2. For each character C (from left to right):

      2.1.  Find the last starter S before C (if not found, continue).

          2.2.  If there is, between S and C, some character with a combining class >= than that of C, then continue.

          2.3.  If there exists a character L for which the canonical decomposition is S+L

                and L is not in the composition exclusion table [i.e. L is a "primary composite"],

                    then replace S by L, and remove C.

   This results in NFC (normalized form C, with canonical decomposition followed by canonical composition)

   or NFKC (normalized form KD, with compatibility decomposition followed by canonical composition).

 - Composition exclusion table:

   - Anything in CompositionExclusions.txt.

   - Singletons: characters whose canonical decomposition is a single character.

   - Non-starter decompositions: characters whose canonical decomposition begins with a non-starter.


 Example:

                  E-grave  (00c8; composition class 0; canonical decomposition: 0045 0300)

                                  E-macron (0112; composition class 0;                          0045 0304)

                                  grave   (0300; composition class 230)

                  macron  (0304; composition class 230)

   source string: 00c8 0304

   after canonical decomposition (or compatibility decomposition, they would be the same here): 0045 0300 0304

   after canonical composition: 00c8 0304


   cc(horn) = 216

   cc(dot below) = 220

   cc(dot above) = 230


 ToDos:

 - case folding - je misljen predvsem za primerjanje tako dobljenih nizov.

   Funkcija f(s) = NFC(toCaseFold(s)) je idempotentna.

   Funkcija g(s) = NFKC(toCaseFold(s)) pa ni -- ce hocemo to, moramo pri foldingu

   upostevati se nekaj dodatnih mappingov (glej 5.18, zadnji odstavek; DerivedNormalizationProps.txt).

 - Zdi se, da je CaseFolding.txt v bistvu cisto navaden folding v lowercase.

   Ker hocemo imeti tudi ostale foldinge, glejmo raje SpecialCasing.txt

   (+ simple case mappinge v UnicodeData.txt).

   Predlagam, da pri branju SpecialCasing.txt conditional mappinge kar ignoriramo

   in jih potem upostevamo posebej kar v source kodi nasih programov [za

   podrobno definicijo pogojev pa glej tabelo 3.13].

   - Pripis: vseeno se mi zdi, da je CaseFolding.txt nekaj malo drugacnega od navadnega lowercase.

     Na primer, za small final sigma 03c2 je tam navedeno, naj se spremeni v navadno small sigma 03c3.

         To ne sledi niti iz UnicodeData.txt niti iz SpecialCasing.txt, pa ceprav v UCD.html pise,

         da je CaseFolding.txt izpeljan iz njiju.  Glavni namen CaseFolding.txt naj bi bil za

         potrebe "locale-independent case folding" (table 4.1 in sec. 5.18).

   - Preden se zacnes ubadati s case conversioni, si oglej razdelek 3.13

     in se posebej str. 90.

   - Glej str. 91 o kombinaciji N[K]FD + caseFold + N[K]FD

   - definicija cased ipd. na str. 89

 - isIdentifierStart(c), isIdentifierEnd(c) -- sec. 5.15

   Glej DerivedCoreProperties.txt, kjer je na podoben nacin definiranih se kup podobnih

   stvari, med drugim isLowerCase in isUpperCase.  Tam je tudi isLetter, isAlphabetic itd. (sec. 4.9).

   To je se najbolje dodati med flagse posameznega characterja.

 - general category: sec. 4.5

 - motivacija za titlecase: 5.18

 - primerjaj nas dosedanji izracun compositionExclusion s tistim, kar je naracunano v DerivedNormalizationProps.txt

   pod Full_Composition_Exclusion

 - script names: Scripts.txt in UAX #24.

 - block names: Blocks.txt

 - space characters: table 6.2 in baje tudi UCD.html

 - dash characters: table 6.3

 */


 //#endif


TUniChDb::ccUpper
Definition: unicode.h:1584

ucfSbClose
Definition: unicode.h:734

TUniChDb::InitAfterLoad
void InitAfterLoad()
Definition: unicode.cpp:1368

TCRef
Definition: bd.h:440

bomAllowed
Definition: unicode.h:48

IAssert
#define IAssert(Cond)
Definition: bd.h:262

TUniChInfo::GetWbFlags
int GetWbFlags() const
Definition: unicode.h:1118

TUniChInfo::IsVariationSelector
bool IsVariationSelector() const
Definition: unicode.h:1103

TUniChDb::ToSimpleTitleCase
void ToSimpleTitleCase(TSrcVec &src, size_t srcIdx, const size_t srcCount) const
Definition: unicode.h:1611

ucfPxOtherMath
Definition: unicode.h:971

TUniChCategory_
TUniChCategory_
Definition: unicode.h:662

TUniCodec::SwapBytes
static int SwapBytes(int x)
Definition: unicode.h:250

TUniCaseFolding::TUniCaseFolding
TUniCaseFolding()
Definition: unicode.h:284

TIntPr
TPair< TInt, TInt > TIntPr
Definition: ds.h:83

TUniCaseFolding::Load
void Load(TSIn &SIn)
Definition: unicode.h:286

TUniChDb::IsSbFlag
bool IsSbFlag(const int cp, const TUniChFlags flag) const
Definition: unicode.h:1358

ucfPrHexDigit
Definition: unicode.h:829

TEncoding_YuAscii::FromUnicode
static int FromUnicode(int c)
Definition: unicode.h:499

TUnicode::InitCodecs
void InitCodecs()
Definition: unicode.cpp:1683

TUniChDb::Clr
void Clr()
Definition: unicode.h:1276

TUnicode::ToSimpleUpperCase
void ToSimpleUpperCase(TIntV &src) const
Definition: unicode.h:1977

TUniChDb::GetUpperCase
void GetUpperCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
Definition: unicode.h:1591

TVecElt< TChA >::Add
static void Add(TVector &vector, const TElement &element)
Definition: unicode.h:432

TUnicode::Init
void Init()
Definition: unicode.h:1779

TUniCodec::TestDecodeUtf16
void TestDecodeUtf16(TRnd &rnd, const TStr &testCaseDesc, const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom)
Definition: unicode.cpp:341

TVecElt< TVec< TDat > >::TElement
TDat TElement
Definition: unicode.h:422

TEncoding_CP437::fromUnicodeTable1
static const int fromUnicodeTable1[6 *16]
Definition: unicode.h:510

TUniChDb::DecomposeAndCompose
void DecomposeAndCompose(const TSrcVec &src, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const
Definition: unicode.h:1542

TUniChDb::GetCharName
const char * GetCharName(const int cp) const
Definition: unicode.h:1331

TUniChDb::Compose
void Compose(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest=true) const
Definition: unicode.h:3158

TEncoding_ISO8859_3
Definition: unicode.h:462

TUniCodec::strict
bool strict
Definition: unicode.h:83

TUniChProperties
enum TUniChProperties_ TUniChProperties

TUnicode::iso8859_4
T8BitCodec< TEncoding_ISO8859_4 > iso8859_4
Definition: unicode.h:1849

TInt::GetStr
TStr GetStr() const
Definition: dt.h:1107

IAssertR
#define IAssertR(Cond, Reason)
Definition: bd.h:265

TUniCodec::TVecIdx
TUniVecIdx TVecIdx
Definition: unicode.h:118

TUniTrie::Has1Gram
bool Has1Gram(const TItem &item) const
Definition: unicode.h:1204

TVecElt< TChA >::TElement
char TElement
Definition: unicode.h:431

TUniChDb::ToSimpleTitleCase
void ToSimpleTitleCase(TSrcVec &src) const
Definition: unicode.h:1614

TUnicodeException::srcIdx
size_t srcIdx
Definition: unicode.h:32

TStr::Len
int Len() const
Definition: dt.h:487

TUniChInfo::SetPropertyX
void SetPropertyX(const TUniChPropertiesX flag)
Definition: unicode.h:1108

TUniChDb::TUcdFileReader::ReadNextLine
bool ReadNextLine()
Definition: unicode.h:1663

TUniCaseFolding::AppendVector
static void AppendVector(const TVec< TSrcDat > &src, TVec< TDestDat > &dest)
Definition: unicode.h:278

TUniCodec::DefaultReplacementChar
Definition: unicode.h:59

TEncoding_CP852::GetName
static TStr GetName()
Definition: unicode.h:531

TUniChDb::GetCat
TUniChCategory GetCat(const int cp) const
Definition: unicode.h:1353

TEncoding_ISO8859_4::ToUnicode
static int ToUnicode(int c)
Definition: unicode.h:481

TUniChDb::GetScriptByName
int GetScriptByName(const TStr &scriptName) const
Definition: unicode.h:1322

TUniChDb::TUcdFileReader::operator=
TUcdFileReader & operator=(const TUcdFileReader &r)
Definition: unicode.h:1678

TUniChDb::ToSimpleUpperCase
void ToSimpleUpperCase(TSrcVec &src) const
Definition: unicode.h:1612

TUniTrie::Clr
void Clr()
Definition: unicode.h:1200

TUniTrie::Empty
bool Empty() const
Definition: unicode.h:1202

TUniChDb::SbEx_Add
void SbEx_Add(const TStr &s)
Definition: unicode.h:1492

ucfPxOtherIdStart
Definition: unicode.h:969

TUniChDb::TUcdFileReader::buf
TChA buf
Definition: unicode.h:1652

ucfPrPatternSyntax
Definition: unicode.h:854

TUniChDb::GetBinFn
static TStr GetBinFn()
Definition: unicode.h:1310

TUniCaseFolding::TVecIdx
TUniVecIdx TVecIdx
Definition: unicode.h:281

TUnicode::EncodeUtf8Str
TStr EncodeUtf8Str(const TIntV &src) const
Definition: unicode.h:1796

ucfWbALetter
Definition: unicode.h:719

TUniChInfo::IsSbFormat
bool IsSbFormat() const
Definition: unicode.h:1129

TUniChFlags
enum TUniChFlags_ TUniChFlags

TUniChInfo::IsCompositionExclusion
bool IsCompositionExclusion() const
Definition: unicode.h:1111

TVecElt< TVec< TDat > >::Add
static void Add(TVector &vector, const TElement &element)
Definition: unicode.h:423

TUniTrie::pairs
THash< TItemPr, TVoid > pairs
Definition: unicode.h:1195

TUniCaseFolding::TUniCaseFolding
TUniCaseFolding(TSIn &SIn)
Definition: unicode.h:285

TUniChDb::SaveBin
void SaveBin(const TStr &fnBinUcd)
Definition: unicode.cpp:1362

TUniTrie::TNode
Definition: unicode.h:1182

___OutRepl
#define ___OutRepl

TRnd
Definition: dt.h:11

TUnicode::TCaseConversion
TUniChDb::TCaseConversion TCaseConversion
Definition: unicode.h:1963

TUniChInfo::IsDcpFlag
bool IsDcpFlag(const TUniChFlags flag) const
Definition: unicode.h:1068

ucfPrAsciiHexDigit
Definition: unicode.h:786

TUniChInfo::LineBreak_Quotation
static const ushort LineBreak_Quotation
Definition: unicode.h:1032

ucfPrSTerm
Definition: unicode.h:901

TUniChInfo::SetProperty
void SetProperty(const TUniChProperties flag)
Definition: unicode.h:1085

TUniTrie::Add
void Add(const TSrcVec &src)
Definition: unicode.h:1249

TTriple
Definition: ds.h:129

TUniChInfo::IsGraphemeExtend
bool IsGraphemeExtend() const
Definition: unicode.h:1077

TUniChInfo::SetSbFlag
void SetSbFlag(const TUniChFlags flag)
Definition: unicode.h:1127

TUniChDb::GetSimpleCaseConverted
void GetSimpleCaseConverted(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how) const
Definition: unicode.h:3042

TEncoding_ISO8859_4::fromUnicodeTable1
static const int fromUnicodeTable1[14 *16]
Definition: unicode.h:480

TUniTrie::TNode::child
int child
Definition: unicode.h:1185

ucfPxOtherUppercase
Definition: unicode.h:972

TUniChDb::GetSpecialCasingFn
static TStr GetSpecialCasingFn()
Definition: unicode.h:1297

TUnicode
Definition: unicode.h:1771

TUniChSubCategory
enum TUniChSubCategory_ TUniChSubCategory

TUniChDb::HangulNCount
Definition: unicode.h:1408

TUniChInfo::subCat
TUniChSubCategory subCat
Definition: unicode.h:1020

TUniTrie::TItemPr
TPair< TItem, TItem > TItemPr
Definition: unicode.h:1191

TUnicode::iso8859_1
T8BitCodec< TEncoding_ISO8859_1 > iso8859_1
Definition: unicode.h:1846

TVecElt< TVec< TDat > >::TVector
TVec< TDat > TVector
Definition: unicode.h:421

TStrPool::Save
void Save(TSOut &SOut) const
Definition: dt.cpp:1694

TUniChDb::GetWbFlags
int GetWbFlags(const int cp) const
Definition: unicode.h:1357

PCodecBase
TPt< TCodecBase > PCodecBase
Definition: unicode.h:328

TUniChInfo::IsSurrogate
bool IsSurrogate() const
Definition: unicode.h:1162

TCodecBase::FromUnicode
virtual size_t FromUnicode(const TIntV &src, size_t srcIdx, const size_t srcCount, TIntV &dest, const bool clrDest=true) const =0

TUniChDb::WbFindNextNonIgnoredS
void WbFindNextNonIgnoredS(const TSrcVec &src, size_t &position, const size_t srcEnd) const
Definition: unicode.h:1429

TUniChInfo::SetDcpFlag
void SetDcpFlag(const TUniChFlags flag)
Definition: unicode.h:1070

TUniChInfo::SetWbFlag
void SetWbFlag(const TUniChFlags flag)
Definition: unicode.h:1117

TCodecBase::Test
virtual void Test() const
Definition: unicode.h:345

TUtf16BomHandling_
TUtf16BomHandling_
Definition: unicode.h:46

TUniChInfo::IsGraphemeBase
bool IsGraphemeBase() const
Definition: unicode.h:1076

TUniCodec::EncodeUtf8Str
TStr EncodeUtf8Str(const TSrcVec &src, size_t srcIdx, const size_t srcCount) const
Definition: unicode.h:149

TUniChDb::ToSimpleUpperCase
void ToSimpleUpperCase(TSrcVec &src, size_t srcIdx, const size_t srcCount) const
Definition: unicode.h:1609

TUniChCategory
enum TUniChCategory_ TUniChCategory

TUnicodeErrorHandling
enum TUnicodeErrorHandling_ TUnicodeErrorHandling

uint
unsigned int uint
Definition: bd.h:11

ucfPrBidiControl
Definition: unicode.h:788

TCodec_ISO8859_3
T8BitCodec< TEncoding_ISO8859_3 > TCodec_ISO8859_3
Definition: unicode.h:651

TEncoding_CP852::ToUnicode
static int ToUnicode(int c)
Definition: unicode.h:533

TUniChDb::GetSubCat
TUniChSubCategory GetSubCat(const int cp) const
Definition: unicode.h:1354

TUniChDb::TSubcatHelper::TSubcatHelper
TSubcatHelper(TUniChDb &owner_)
Definition: unicode.h:1727

T8BitCodec::FromUnicode
size_t FromUnicode(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TDestVec &dest, const bool clrDest=true) const
Definition: unicode.h:618

TUniChInfo::combClass
uchar combClass
Definition: unicode.h:1018

Fail
#define Fail
Definition: bd.h:238

T8BitCodec::ToUnicode
size_t ToUnicode(const TStr &src, TIntV &dest, const bool clrDest=true) const
Definition: unicode.h:613

TUniChDb::GetScriptNameKatakana
static TStr GetScriptNameKatakana()
Definition: unicode.h:1318

TUniChDb::sbExTrie
TUniTrie< TInt > sbExTrie
Definition: unicode.h:1461

TStrPool::GetCStr
const char * GetCStr(const uint &Offset) const
Definition: dt.h:811

TUniChInfo::ccStarter
Definition: unicode.h:989

TUniChInfo::LineBreak_InfixNumeric
static const ushort LineBreak_InfixNumeric
Definition: unicode.h:1032

TUniChInfo::cat
TUniChCategory cat
Definition: unicode.h:1019

TUniCodec::GetRndUint
static uint GetRndUint(TRnd &rnd)
Definition: unicode.cpp:62

TUniChInfo::IsLogicalOrderException
bool IsLogicalOrderException() const
Definition: unicode.h:1097

ucfPxOtherAlphabetic
Definition: unicode.h:965

TEncoding_YuAscii
Definition: unicode.h:490

TChA::Clr
void Clr()
Definition: dt.h:258

bd.h

TUniChDb::TUcdFileReader
Definition: unicode.h:1649

TUniCodec::TUniCodec
TUniCodec()
Definition: unicode.h:91

TEncoding_ISO8859_3::GetName
static TStr GetName()
Definition: unicode.h:465

ucfSbSp
Definition: unicode.h:727

TUnicodeException::message
TStr message
Definition: unicode.h:31

TCodec_ISO8859_4
T8BitCodec< TEncoding_ISO8859_4 > TCodec_ISO8859_4
Definition: unicode.h:652

TVec::Len
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547

TEncoding_ISO8859_1::ToUnicode
static int ToUnicode(int c)
Definition: unicode.h:444

TUniChDb::TSubcatHelper::invalidCatCodes
TStrH invalidCatCodes
Definition: unicode.h:1724

ucfDcpAlphabetic
Definition: unicode.h:741

ucfPrDiacritic
Definition: unicode.h:821

TUniChDb::InitPropList
void InitPropList(const TStr &basePath)
Definition: unicode.cpp:950

TEncoding_CP852::toUnicodeTable
static const int toUnicodeTable[8 *16]
Definition: unicode.h:532

TCodecWrapper
Definition: unicode.h:377

TUniChInfo::GetSubCat
TUniChSubCategory GetSubCat() const
Definition: unicode.h:1146

TUniChDb::ToCaseFolded
void ToCaseFolded(TSrcVec &src, const bool turkic=false) const
Definition: unicode.h:1637

THash::Save
void Save(TSOut &SOut) const
Definition: hash.h:141

TUniChDb::GetSimpleTitleCase
void GetSimpleTitleCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:1603

TUniChInfo::IsDefaultIgnorable
bool IsDefaultIgnorable() const
Definition: unicode.h:1075

THash::Empty
bool Empty() const
Definition: hash.h:185

TUniChDb::ToSimpleLowerCase
void ToSimpleLowerCase(TSrcVec &src) const
Definition: unicode.h:1613

T8BitCodec::T8BitCodec
T8BitCodec(TUnicodeErrorHandling errorHandling_, int replacementChar_=TUniCodec::DefaultReplacementChar)
Definition: unicode.h:571

TUnicode::ClrCodecs
void ClrCodecs()
Definition: unicode.h:1881

TUniChDb::TCaseConversion
enum TUniChDb::TCaseConversion_ TCaseConversion

TUniChInfo::ClrDcpFlags
void ClrDcpFlags()
Definition: unicode.h:1069

TUniChDb::GetSimpleUpperCase
void GetSimpleUpperCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:1602

TUniChInfo::IsAlphabetic
bool IsAlphabetic() const
Definition: unicode.h:1071

TUniChDb::GetLowerCase
void GetLowerCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
Definition: unicode.h:1590

TUniChDb::GetSbFlags
int GetSbFlags(const int cp) const
Definition: unicode.h:1359

TUniChDb::WbFindCurOrNextNonIgnored
void WbFindCurOrNextNonIgnored(const TSrcVec &src, size_t &position, const size_t srcEnd) const
Definition: unicode.h:1422

TUniChInfo::TUniChInfo
TUniChInfo(TSIn &SIn)
Definition: unicode.h:1061

TCodecWrapper::FromUnicode
virtual size_t FromUnicode(const TIntV &src, size_t srcIdx, const size_t srcCount, TStr &dest, const bool clrDest=true) const
Definition: unicode.h:397

TUniChInfo::LineBreak_ComplexContext
static const ushort LineBreak_ComplexContext
Definition: unicode.h:1032

uehReplace
Definition: unicode.h:23

TUnicode::cp852
T8BitCodec< TEncoding_CP852 > cp852
Definition: unicode.h:1852

TUniTrie::TNode::item
TItem item
Definition: unicode.h:1184

TUniCaseFolding::cfFull
TIntIntVH cfFull
Definition: unicode.h:275

TUniChDb::GetScriptName
const TStr & GetScriptName(const int scriptId) const
Definition: unicode.h:1321

ucfPrVariationSelector
Definition: unicode.h:921

TUniTrie::TNodeV
TVec< TNode > TNodeV
Definition: unicode.h:1190

TUniChDb::TUcdFileReader::Close
void Close()
Definition: unicode.h:1684

TChA::Len
int Len() const
Definition: dt.h:259

TUniChInfo::GetCat
TUniChCategory GetCat() const
Definition: unicode.h:1145

TEncoding_ISO8859_1::GetName
static TStr GetName()
Definition: unicode.h:443

TUniChDb::specialCasingUpper
TIntIntVH specialCasingUpper
Definition: unicode.h:1271

TUniChDb::GetScript
int GetScript(const TUniChInfo &ci) const
Definition: unicode.h:1323

TEncoding_YuAscii::yuAsciiChars
static const int yuAsciiChars[10]
Definition: unicode.h:493

TStr::GetSubStr
TStr GetSubStr(const int &BChN, const int &EChN) const
Definition: dt.cpp:811

TUniChInfo::ccBelowIotaSubscript
Definition: unicode.h:1014

TUnicode::GetUpperCase
void GetUpperCase(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1966

ucfPrWhiteSpace
Definition: unicode.h:957

TUnicode::iso8859_3
T8BitCodec< TEncoding_ISO8859_3 > iso8859_3
Definition: unicode.h:1848

uehAbort
Definition: unicode.h:24

TUniTrie::singles
THash< TItem, TVoid > singles
Definition: unicode.h:1194

bomRequired
Definition: unicode.h:49

TEncoding_ISO8859_2
Definition: unicode.h:448

TUnicode::RegisterCodec
void RegisterCodec(const TStr &nameList, const PCodecBase &codec)
Definition: unicode.h:1873

TUnicode::ucd
TUniChDb ucd
Definition: unicode.h:1775

TUniChDb::InitDerivedCoreProperties
void InitDerivedCoreProperties(const TStr &basePath)
Definition: unicode.cpp:1007

TUniChInfo::InitAfterLoad
void InitAfterLoad()
Definition: unicode.h:1035

TCodecWrapper::Test
virtual void Test() const
Definition: unicode.h:386

TEncoding_ISO8859_1::FromUnicode
static int FromUnicode(int c)
Definition: unicode.h:445

TUniChDb::TUcdFileReader::f
FILE * f
Definition: unicode.h:1656

TUniChInfo::IsWhiteSpace
bool IsWhiteSpace() const
Definition: unicode.h:1104

TEncoding_CP1250
Definition: unicode.h:543

TUniChInfo::ccBelowRight
Definition: unicode.h:1006

TEncoding_ISO8859_3::FromUnicode
static int FromUnicode(int c)
Definition: unicode.h:469

TUniChDb::InitLineBreaks
void InitLineBreaks(const TStr &basePath)
Definition: unicode.cpp:1046

TCodecBase
Definition: unicode.h:332

TEncoding_YuAscii::uniChars
static const int uniChars[10]
Definition: unicode.h:493

TUniChDb::Decompose
void Decompose(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const
Definition: unicode.h:3126

TUnicode::GetLowerCase
void GetLowerCase(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1965

TUniChDb::WbFindNextNonIgnored
void WbFindNextNonIgnored(const TSrcVec &src, size_t &position, const size_t srcEnd) const
Definition: unicode.h:1425

TUnicode::ToSimpleLowerCase
void ToSimpleLowerCase(TIntV &src) const
Definition: unicode.h:1978

TUniChDb::WbFindPrevNonIgnored
bool WbFindPrevNonIgnored(const TSrcVec &src, const size_t srcStart, size_t &position) const
Definition: unicode.h:1434

TUniChInfo::chCat
char chCat
Definition: unicode.h:1017

Trans
#define Trans(curFlag, newState)

TUniTrie::Add
void Add(const TSrcVec &src, const size_t srcIdx, const size_t srcCount)
Definition: unicode.h:1220

TUniChDb::GetNormalizationTestFn
static TStr GetNormalizationTestFn()
Definition: unicode.h:1309

TCodec_ISO8859_2
T8BitCodec< TEncoding_ISO8859_2 > TCodec_ISO8859_2
Definition: unicode.h:650

TUniChDb::GetCaseFolded
void GetCaseFolded(const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true, const bool full=true, const bool turkic=false) const
Definition: unicode.h:1632

TUniChDb::HangulVBase
Definition: unicode.h:1406

TPt::Clr
void Clr()
Definition: bd.h:502

TVec::Load
void Load(TSIn &SIn)
Definition: ds.h:895

TUniTrie::TItem
TItem_ TItem
Definition: unicode.h:1180

TUniChPropertiesX
enum TUniChPropertiesX_ TUniChPropertiesX

TUnicode::GetSimpleLowerCase
void GetSimpleLowerCase(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1972

ucfPxIdsTrinaryOperator
Definition: unicode.h:975

TUniChDb::IsWbFlag
bool IsWbFlag(const int cp, const TUniChFlags flag) const
Definition: unicode.h:1356

TUniChDb::TSubcatHelper::~TSubcatHelper
~TSubcatHelper()
Definition: unicode.h:1754

TUniChInfo::SaveUShort
static void SaveUShort(TSOut &SOut, ushort u)
Definition: unicode.h:1047

TUniCodec::errorHandling
TUnicodeErrorHandling errorHandling
Definition: unicode.h:66

TUniChInfo::IsGbExtend
bool IsGbExtend() const
Definition: unicode.h:1139

TIntIntVH
THash< TInt, TIntV > TIntIntVH
Definition: unicode.h:269

TUnicode::ToCaseFolded
void ToCaseFolded(TIntV &src) const
Definition: unicode.h:1992

TUniChDb::TSubcatHelper::owner
TUniChDb & owner
Definition: unicode.h:1725

TUnicode::DecomposeAndCompose
void DecomposeAndCompose(const TIntV &src, TIntV &dest, bool compatibility) const
Definition: unicode.h:1946

TUniChInfo::IsUppercase
bool IsUppercase() const
Definition: unicode.h:1072

TUniChDb::Test
void Test(const TStr &basePath)
Definition: unicode.cpp:1377

TEncoding_CP852::fromUnicodeTable2
static const int fromUnicodeTable2[2 *16]
Definition: unicode.h:532

ucfSbMask
Definition: unicode.h:735

TEncoding_CP437::fromUnicodeTable2
static const int fromUnicodeTable2[4 *16]
Definition: unicode.h:510

TUniChInfo::ccNuktas
Definition: unicode.h:991

TUnicode::cp437
T8BitCodec< TEncoding_CP437 > cp437
Definition: unicode.h:1853

TUniTrie::nodes
TNodeV nodes
Definition: unicode.h:1197

TSIn::LoadCs
void LoadCs()
Definition: fl.cpp:28

ucfPxOtherIdContinue
Definition: unicode.h:968

THash::Load
void Load(TSIn &SIn)
Definition: hash.h:137

TUniChInfo::ccHiraganaKatakanaVoicingMarks
Definition: unicode.h:992

uehThrow
Definition: unicode.h:22

TUniChDb::TUcdFileReader::ParseCodePointRange
static void ParseCodePointRange(const TStr &s, int &from, int &to)
Definition: unicode.h:1703

ucfDcpIdContinue
Definition: unicode.h:765

TUniChDb::specialCasingLower
TIntIntVH specialCasingLower
Definition: unicode.h:1271

TCodecBase::GetName
virtual TStr GetName() const =0

ucfDcpUppercase
Definition: unicode.h:771

TCodecWrapper::GetName
virtual TStr GetName() const
Definition: unicode.h:384

TUniChInfo::simpleUpperCaseMapping
int simpleUpperCaseMapping
Definition: unicode.h:1022

TUniCaseFolding::Test
void Test()
Definition: unicode.cpp:549

ucfPxOtherDefaultIgnorableCodePoint
Definition: unicode.h:966

TUniChDb::ExtractStarters
size_t ExtractStarters(const TSrcVec &src, TVec< TDestCh > &dest, bool clrDest=true) const
Definition: unicode.h:1551

TEncoding_CP437::GetName
static TStr GetName()
Definition: unicode.h:509

TUniChInfo::ccFixedPositionStart
Definition: unicode.h:994

TUnicode::FindNextWordBoundary
bool FindNextWordBoundary(const TIntV &src, int &position) const
Definition: unicode.h:1901

TUniTrie::TNode::sib
int sib
Definition: unicode.h:1185

TCodecBase::ToUnicode
size_t ToUnicode(const TStr &src, TIntV &dest, const bool clrDest=true) const
Definition: unicode.h:353

T8BitCodec
Definition: unicode.h:561

ucfPrHyphen
Definition: unicode.h:843

TUniChInfo::LoadUShort
static void LoadUShort(TSIn &SIn, ushort &u)
Definition: unicode.h:1045

T8BitCodec::UniToStr
size_t UniToStr(const TIntV &src, TStr &dest, const bool clrDest=true) const
Definition: unicode.h:646

TSIn
Definition: fl.h:58

TUniChDb::TestCaseConversion
void TestCaseConversion(const TStr &source, const TStr &trueLc, const TStr &trueTc, const TStr &trueUc, bool turkic, bool lithuanian)
Definition: unicode.cpp:825

TEncoding_ISO8859_4::FromUnicode
static int FromUnicode(int c)
Definition: unicode.h:483

TVec::Save
void Save(TSOut &SOut) const
Definition: ds.h:903

ucfWbNumeric
Definition: unicode.h:722

TUniChDb::GetUnicodeDataFn
static TStr GetUnicodeDataFn()
Definition: unicode.h:1298

TUniChDb::HangulVCount
Definition: unicode.h:1407

TUniChInfo::ccDoubleAbove
Definition: unicode.h:1013

TUniChInfo::ccBelowRightAttached
Definition: unicode.h:998

TUniChDb::HangulSCount
Definition: unicode.h:1409

TCodec_YuAscii
T8BitCodec< TEncoding_YuAscii > TCodec_YuAscii
Definition: unicode.h:656

TUniChDb::inverseDec
THash< TIntPr, TInt > inverseDec
Definition: unicode.h:1267

TUnicode::FindWordBoundaries
void FindWordBoundaries(const TIntV &src, TBoolV &dest) const
Definition: unicode.h:1907

TUniChInfo::IsPropertyX
bool IsPropertyX(const TUniChPropertiesX flag) const
Definition: unicode.h:1107

TUniChDb::FindNextSentenceBoundary
bool FindNextSentenceBoundary(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const
Definition: unicode.h:2636

TUniCodec::EncodeUtf8
size_t EncodeUtf8(const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:145

TUnicode::ClrSentenceBoundaryExceptions
void ClrSentenceBoundaryExceptions()
Definition: unicode.h:1924

TUniChInfo::IsPrivateUse
bool IsPrivateUse() const
Definition: unicode.h:1161

TUniCodec::DecodeUtf16FromWords
size_t DecodeUtf16FromWords(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
Definition: unicode.h:2294

TUniChInfo::GetWbFlagsStr
TStr GetWbFlagsStr() const
Definition: unicode.h:1120

TUniByteOrder_
TUniByteOrder_
Definition: unicode.h:38

ucfSbNumeric
Definition: unicode.h:731

TUniChDb::ccMax
Definition: unicode.h:1584

TUniChDb::GetScriptsFn
static TStr GetScriptsFn()
Definition: unicode.h:1300

TEncoding_CP437::fromUnicodeTable3
static const int fromUnicodeTable3[6 *16]
Definition: unicode.h:510

TUniChDb::ToSimpleCaseConverted
void ToSimpleCaseConverted(TSrcVec &src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const
Definition: unicode.h:3072

TUnicode::FindSentenceBoundaries
void FindSentenceBoundaries(const TIntV &src, TBoolV &dest) const
Definition: unicode.h:1922

ucfPrGraphemeLink
Definition: unicode.h:826

TEncoding_ISO8859_3::ToUnicode
static int ToUnicode(int c)
Definition: unicode.h:467

TUniChInfo::propertiesX
int propertiesX
Definition: unicode.h:1027

TUniChDb::HangulTCount
Definition: unicode.h:1407

TUniChInfo::IsLowercase
bool IsLowercase() const
Definition: unicode.h:1073

TStrPool::Clr
void Clr(bool DoDel=false)
Definition: dt.h:816

TCodecBase::FromUnicode
size_t FromUnicode(const TIntV &src, TChA &dest, const bool clrDest=true) const
Definition: unicode.h:362

TUniChInfo::IsDeprecated
bool IsDeprecated() const
Definition: unicode.h:1089

TUniCaseFolding::Clr
void Clr()
Definition: unicode.h:288

TUniChDb::TestCaseConversions
void TestCaseConversions()
Definition: unicode.cpp:853

TUniChInfo::ccInvalid
Definition: unicode.h:1015

TUniChInfo::simpleTitleCaseMapping
int simpleTitleCaseMapping
Definition: unicode.h:1022

TUniChInfo::IsCurrency
bool IsCurrency() const
Definition: unicode.h:1157

TStrPool
Definition: dt.h:778

TUniChFlags_
TUniChFlags_
Definition: unicode.h:712

TFIn::New
static PSIn New(const TStr &FNm)
Definition: fl.cpp:290

TUniCodec
Definition: unicode.h:54

THash::DelKey
void DelKey(const TKey &Key)
Definition: hash.h:362

ucfPxOtherGraphemeExtend
Definition: unicode.h:967

TUniChSubCategory_
TUniChSubCategory_
Definition: unicode.h:676

ucfDcpGraphemeExtend
Definition: unicode.h:762

T8BitCodec::Test
void Test() const
Definition: unicode.h:575

TUniChInfo::SetCatAndSubCat
void SetCatAndSubCat(const TUniChSubCategory catAndSubCat)
Definition: unicode.h:1038

TUniChInfo::GetWbFlagsStr
static TStr GetWbFlagsStr(const int flags)
Definition: unicode.h:1121

ucfPrJoinControl
Definition: unicode.h:847

T8BitCodec::UniToStr
size_t UniToStr(const TIntV &src, size_t srcIdx, const size_t srcCount, TStr &dest, const bool clrDest=true) const
Definition: unicode.h:642

T8BitCodec::ToUnicode
size_t ToUnicode(const TIntV &src, TIntV &dest, const bool clrDest=true) const
Definition: unicode.h:612

TUnicode::GetCharNameS
TStr GetCharNameS(const int cp) const
Definition: unicode.h:2025

TUnicode::GetCodec
PCodecBase GetCodec(const TStr &name) const
Definition: unicode.h:1883

TUnicode::TUnicode
TUnicode()
Definition: unicode.h:1777

TVec::Clr
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971

TStr::ChangeStrAll
int ChangeStrAll(const TStr &SrcStr, const TStr &DstStr, const bool &FromStartP=false)
Definition: dt.cpp:1141

TSOut::SaveCs
void SaveCs()
Definition: fl.h:171

TEncoding_CP1250::fromUnicodeTable2
static const int fromUnicodeTable2[2 *16]
Definition: unicode.h:547

TUnicode::DecodeUtf16FromWords
int DecodeUtf16FromWords(const TIntV &src, TIntV &dest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
Definition: unicode.h:1823

ucfDcpIdStart
Definition: unicode.h:764

TUniChDb::TUcdFileReader::TUcdFileReader
TUcdFileReader(const TUcdFileReader &r)
Definition: unicode.h:1679

TUniChDb::HangulLBase
Definition: unicode.h:1406

ucfPrSoftDotted
Definition: unicode.h:889

TUniChInfo::IsSbSep
bool IsSbSep() const
Definition: unicode.h:1136

TUniCodec::TestDecodeUtf8
void TestDecodeUtf8(TRnd &rnd, const TStr &testCaseDesc)
Definition: unicode.cpp:133

TUnicode::ExtractStarters
int ExtractStarters(TIntV &src) const
Definition: unicode.h:1953

TUniChInfo::ccAboveAttached
Definition: unicode.h:1002

TUniChInfo::IsHexDigit
bool IsHexDigit() const
Definition: unicode.h:1093

TUnicode::codec
TUniCodec codec
Definition: unicode.h:1774

TUniChDb
Definition: unicode.h:1256

TCodecWrapper::impl
TCodecImpl impl
Definition: unicode.h:381

TChA::CStr
char * CStr()
Definition: dt.h:255

ucfWbMask
Definition: unicode.h:736

TUniChInfo::IsCompatibilityDecomposition
bool IsCompatibilityDecomposition() const
Definition: unicode.h:1112

TUnicode::ToSimpleTitleCase
void ToSimpleTitleCase(TIntV &src) const
Definition: unicode.h:1979

TUnicodeException::srcChar
int srcChar
Definition: unicode.h:33

TUniChDb::PrintCharNames
void PrintCharNames(FILE *f, const TSrcVec &src, const TStr &prefix) const
Definition: unicode.h:1342

TUniCaseFolding
Definition: unicode.h:271

TEncoding_CP1250::GetName
static TStr GetName()
Definition: unicode.h:546

TUniChDb::GetScriptNameUnknown
static TStr GetScriptNameUnknown()
Definition: unicode.h:1317

TStr::IsHexInt
bool IsHexInt(const bool &Check, const int &MnVal, const int &MxVal, int &Val) const
Definition: dt.cpp:1190

TCodecWrapper::FromUnicode
virtual size_t FromUnicode(const TIntV &src, size_t srcIdx, const size_t srcCount, TChA &dest, const bool clrDest=true) const
Definition: unicode.h:395

ucfSbUpper
Definition: unicode.h:729

TVec::PutAll
void PutAll(const TVal &Val)
Sets all elements of the vector to value Val.
Definition: ds.h:1166

TUnicode::GetSimpleUpperCase
void GetSimpleUpperCase(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1973

TEncoding_ISO8859_1
Definition: unicode.h:440

TCodecBaseV
TVec< PCodecBase > TCodecBaseV
Definition: unicode.h:330

TUniTrie::IsNodeTerminal
bool IsNodeTerminal(const int nodeIdx) const
Definition: unicode.h:1215

TUniChInfo::lineBreak
ushort lineBreak
Definition: unicode.h:1028

TUniChDb::GetSentenceBreakTestFn
static TStr GetSentenceBreakTestFn()
Definition: unicode.h:1307

TSIn::Load
void Load(bool &Bool)
Definition: fl.h:84

TestCurNext
#define TestCurNext(curFlag, nextFlag)

TUniChDb::TUniChDb
TUniChDb()
Definition: unicode.h:1274

TUniCodec::EncodeUtf16ToBytes
size_t EncodeUtf16ToBytes(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
Definition: unicode.h:2428

TStr::SearchStr
int SearchStr(const TStr &Str, const int &BChN=0) const
Definition: dt.cpp:1065

TUniChDb::Save
void Save(TSOut &SOut) const
Definition: unicode.h:1280

TUnicode::GetUtf8CaseFolded
TStr GetUtf8CaseFolded(const TStr &s) const
Definition: unicode.h:1994

T8BitCodec::replacementChar
int replacementChar
Definition: unicode.h:568

TUniChInfo::IsAsciiHexDigit
bool IsAsciiHexDigit() const
Definition: unicode.h:1086

TUniChInfo::ccFixedPositionEnd
Definition: unicode.h:995

TUniCodec::TestUtf8
void TestUtf8()
Definition: unicode.cpp:194

DefineUniSubCat
#define DefineUniSubCat(cat, subCat, c)
Definition: unicode.h:678

TUniChDb::TUcdFileReader::PutBack
void PutBack(int c)
Definition: unicode.h:1661

TCodec_ISO8859_1
T8BitCodec< TEncoding_ISO8859_1 > TCodec_ISO8859_1
Definition: unicode.h:649

TUniTrie::TVecIdx
TUniVecIdx TVecIdx
Definition: unicode.h:1193

ucfDcpDefaultIgnorableCodePoint
Definition: unicode.h:749

TUniChDb::IsWbIgnored
bool IsWbIgnored(const int cp) const
Definition: unicode.h:1420

ucfCompositionExclusion
Definition: unicode.h:715

TUInt::GetStr
TStr GetStr() const
Definition: dt.h:1189

boBigEndian
Definition: unicode.h:42

TUniByteOrder
enum TUniByteOrder_ TUniByteOrder

THash::FNextKeyId
bool FNextKeyId(int &KeyId) const
Definition: hash.h:436

TUnicode::GetCaseFolded
void GetCaseFolded(const TIntV &src, TIntV &dest, const bool full=true) const
Definition: unicode.h:1989

TEncoding_CP1250::fromUnicodeTable3
static const int fromUnicodeTable3[3 *16]
Definition: unicode.h:547

TUnicode::ExtractStarters
int ExtractStarters(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1951

Assert
#define Assert(Cond)
Definition: bd.h:251

TUniChInfo::ccAboveRightAttached
Definition: unicode.h:1003

TUniChInfo::ccAboveRight
Definition: unicode.h:1011

TUniChInfo::GetSbFlagsStr
TStr GetSbFlagsStr() const
Definition: unicode.h:1130

TUniChInfo::ccAboveLeftAttached
Definition: unicode.h:1001

bomIgnored
Definition: unicode.h:50

TUniChInfo::ccRight
Definition: unicode.h:1008

TUniChDb::ExtractStarters
size_t ExtractStarters(TSrcVec &src) const
Definition: unicode.h:1555

TUniChDb::LoadTxt_ProcessDecomposition
void LoadTxt_ProcessDecomposition(TUniChInfo &ci, TStr s)
Definition: unicode.cpp:937

TUniChDb::FindNextWordBoundary
bool FindNextWordBoundary(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const
Definition: unicode.h:2483

TCodecBase::FromUnicode
size_t FromUnicode(const TIntV &src, TIntV &dest, const bool clrDest=true) const
Definition: unicode.h:361

TEncoding_ISO8859_2::toUnicodeTable
static const int toUnicodeTable[6 *16]
Definition: unicode.h:452

TUniChDb::HangulTBase
Definition: unicode.h:1406

TUniChDb::TSubcatHelper
Definition: unicode.h:1720

TUniChInfo::IsExtender
bool IsExtender() const
Definition: unicode.h:1091

ucfPxIdsBinaryOperator
Definition: unicode.h:974

TUniChDb::HangulLCount
Definition: unicode.h:1407

TUniChDb::InitSpecialCasing
void InitSpecialCasing(const TStr &basePath)
Definition: unicode.cpp:1225

TUniChDb::AddDecomposition
void AddDecomposition(const int codePoint, TVec< TDestCh > &dest, const bool compatibility) const
Definition: unicode.h:3103

THash::FFirstKeyId
int FFirstKeyId() const
Definition: hash.h:236

TUniCodec::DecodeUtf8
size_t DecodeUtf8(const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:136

TUniChDb::GetDerivedCorePropsFn
static TStr GetDerivedCorePropsFn()
Definition: unicode.h:1301

TUniChDb::IsSurrogate
bool IsSurrogate(const int cp) const
Definition: unicode.h:1392

TUniTrie::roots
THash< TItemTr, TInt > roots
Definition: unicode.h:1196

TUnicodeException::TUnicodeException
TUnicodeException(size_t srcIdx_, int srcChar_, const TStr &message_)
Definition: unicode.h:34

TUniChInfo::IsNoncharacter
bool IsNoncharacter() const
Definition: unicode.h:1098

TUniChInfo::LoadSChar
static void LoadSChar(TSIn &SIn, signed char &u)
Definition: unicode.h:1046

TUniChDb::GetWordBreakPropertyFn
static TStr GetWordBreakPropertyFn()
Definition: unicode.h:1306

TEncoding_CP852::fromUnicodeTable3
static const int fromUnicodeTable3[11 *16]
Definition: unicode.h:532

TUniChDb::h
THash< TInt, TUniChInfo > h
Definition: unicode.h:1263

TUniChDb::TUcdFileReader::GetNextLine
bool GetNextLine(TStrV &dest)
Definition: unicode.h:1686

TUniChDb::SbEx_AddUtf8
void SbEx_AddUtf8(const TStr &s)
Definition: unicode.h:1494

TStr::GetLc
TStr GetLc() const
Definition: dt.h:499

TUniChDb::GetSimpleLowerCase
void GetSimpleLowerCase(const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:1604

TUniChDb::CanSentenceEndHere
bool CanSentenceEndHere(const TSrcVec &src, const size_t srcIdx, const size_t position) const
Definition: unicode.h:2585

TUniChInfo::Save
void Save(TSOut &SOut) const
Definition: unicode.h:1051

TUniChInfo::properties
int properties
Definition: unicode.h:1026

TUniChDb::GetSimpleLowerCase
void GetSimpleLowerCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:1601

TUniChInfo::ccViramas
Definition: unicode.h:993

TUniChDb::GetTitleCase
void GetTitleCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
Definition: unicode.h:1592

TUnicode::DecodeUtf16FromBytes
int DecodeUtf16FromBytes(const TIntV &src, TIntV &dest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
Definition: unicode.h:1810

TUniChDb::TUcdFileReader::Open
void Open(const TStr &fileName)
Definition: unicode.h:1683

TUniChDb::Load
void Load(TSIn &SIn)
Definition: unicode.h:1285

ucfWbFormat
Definition: unicode.h:717

TCodec_CP852
T8BitCodec< TEncoding_CP852 > TCodec_CP852
Definition: unicode.h:653

TUniChInfo::IsDiacritic
bool IsDiacritic() const
Definition: unicode.h:1090

TUnicode::UseEnglishSentenceBoundaryExceptions
void UseEnglishSentenceBoundaryExceptions()
Definition: unicode.h:1925

TUniChInfo::IsValidSubCat
static bool IsValidSubCat(const char chCat, const char chSubCat)
Definition: unicode.h:1164

TUniChDb::TCaseConversion_
TCaseConversion_
Definition: unicode.h:1584

TEncoding_ISO8859_2::fromUnicodeTable1
static const int fromUnicodeTable1[14 *16]
Definition: unicode.h:452

TUniChInfo::ccLeft
Definition: unicode.h:1007

TUniChInfo::IsSbFlag
bool IsSbFlag(const TUniChFlags flag) const
Definition: unicode.h:1126

TEncoding_YuAscii::ToUnicode
static int ToUnicode(int c)
Definition: unicode.h:495

TUniChInfo::ClrWbAndSbFlags
void ClrWbAndSbFlags()
Definition: unicode.h:1116

TUniChInfo::LineBreak_Numeric
static const ushort LineBreak_Numeric
Definition: unicode.h:1032

uchar
unsigned char uchar
Definition: bd.h:10

TUniChDb::TUcdFileReader::GetCh
int GetCh()
Definition: unicode.h:1658

TSOut::SaveBf
void SaveBf(const void *Bf, const TSize &BfL)
Definition: fl.h:172

TUniChDb::LoadBin
void LoadBin(const TStr &fnBin)
Definition: unicode.h:1291

TUniChInfo::ccBelow
Definition: unicode.h:1005

TUniChDb::InitScripts
void InitScripts(const TStr &basePath)
Definition: unicode.cpp:1073

TCodecWrapper::ToUnicode
virtual size_t ToUnicode(const TStr &src, size_t srcIdx, const size_t srcCount, TIntV &dest, const bool clrDest=true) const
Definition: unicode.h:390

TUniChDb::TestComposition
void TestComposition(const TStr &basePath)
Definition: unicode.cpp:745

ucfCompatibilityDecomposition
Definition: unicode.h:714

TUtf16BomHandling
enum TUtf16BomHandling_ TUtf16BomHandling

TUniTrie::TNode::terminal
bool terminal
Definition: unicode.h:1186

TUniChDb::ccTitle
Definition: unicode.h:1584

TUniChDb::GetCaseFolded
void GetCaseFolded(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool full, const bool turkic=false) const
Definition: unicode.h:1629

TSOut
Definition: fl.h:128

TUniCaseFolding::cfTurkic
TIntH cfTurkic
Definition: unicode.h:274

TUniChDb::GetLineBreakFn
static TStr GetLineBreakFn()
Definition: unicode.h:1302

T8BitCodec::TVecIdx
TUniVecIdx TVecIdx
Definition: unicode.h:564

TUniTrie::TUniTrie
TUniTrie()
Definition: unicode.h:1199

TUniCaseFolding::Fold
void Fold(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool full, const bool turkic) const
Definition: unicode.h:293

TEncoding_CP852::fromUnicodeTable1
static const int fromUnicodeTable1[14 *16]
Definition: unicode.h:532

TUniCodec::skipBom
bool skipBom
Definition: unicode.h:89

TUniChDb::TUcdFileReader::ParseCodePointList
static void ParseCodePointList(const TStr &s, TIntV &dest, bool ClrDestP=true)
Definition: unicode.h:1697

TUniChDb::IsGetChInfo
bool IsGetChInfo(const int cp, TUniChInfo &ChInfo)
Definition: unicode.h:1350

TUnicode::FindNextSentenceBoundary
bool FindNextSentenceBoundary(const TIntV &src, int &position) const
Definition: unicode.h:1916

TUniChDb::SbEx_Clr
void SbEx_Clr()
Definition: unicode.h:1489

TUnicode::yuAscii
T8BitCodec< TEncoding_YuAscii > yuAscii
Definition: unicode.h:1850

TUniCaseFolding::cfCommon
TIntH cfCommon
Definition: unicode.h:274

TUniChDb::GetWordBreakTestFn
static TStr GetWordBreakTestFn()
Definition: unicode.h:1305

TUniCodec::TUniCodec
TUniCodec(TUnicodeErrorHandling errorHandling_, bool strict_, int replacementChar_, bool skipBom_)
Definition: unicode.h:95

TUniChDb::ExtractStarters
size_t ExtractStarters(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest=true) const
Definition: unicode.h:3215

TUniChDb::TUcdFileReader::TUcdFileReader
TUcdFileReader(const TStr &fileName)
Definition: unicode.h:1682

TUniChInfo::ccBelowAttached
Definition: unicode.h:997

ucfPrIdeographic
Definition: unicode.h:845

TUniCodec::DecodeUtf16FromBytes
size_t DecodeUtf16FromBytes(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
Definition: unicode.h:2210

TUniChInfo::IsDash
bool IsDash() const
Definition: unicode.h:1088

TUniChDb::IsWbIgnored
static bool IsWbIgnored(const TUniChInfo &ci)
Definition: unicode.h:1419

ucfPrQuotationMark
Definition: unicode.h:886

TSOut::Save
void Save(const bool &Bool)
Definition: fl.h:173

ucfDcpMath
Definition: unicode.h:768

TUniChDb::Decompose
void Decompose(const TSrcVec &src, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const
Definition: unicode.h:1520

TUniChInfo::IsWbFormat
bool IsWbFormat() const
Definition: unicode.h:1119

T8BitCodec::FromUnicode
size_t FromUnicode(const TSrcVec &src, TDestVec &dest, const bool clrDest=true) const
Definition: unicode.h:640

TEncoding_CP1250::ToUnicode
static int ToUnicode(int c)
Definition: unicode.h:548

TUniTrie::Get3GramRoot
int Get3GramRoot(const TItem &last, const TItem &butLast, const TItem &butButLast) const
Definition: unicode.h:1206

TUnicode::NormalizeCodecName
static TStr NormalizeCodecName(const TStr &name)
Definition: unicode.h:1870

TUniChDb::decompositions
TIntV decompositions
Definition: unicode.h:1266

TEncoding_CP1250::toUnicodeTable
static const int toUnicodeTable[8 *16]
Definition: unicode.h:547

TInt
Definition: dt.h:1044

ushort
unsigned short ushort
Definition: bd.h:13

TUniChInfo::GetSbFlagsStr
static TStr GetSbFlagsStr(const int flags)
Definition: unicode.h:1131

TUniCodec::EncodeUtf8Str
TStr EncodeUtf8Str(const TSrcVec &src) const
Definition: unicode.h:150

TUniChDb::GetCharNameS
TStr GetCharNameS(const int cp) const
Definition: unicode.h:1332

TUnicode::UnregisterCodec
void UnregisterCodec(const TStr &nameList)
Definition: unicode.h:1877

TCodec_CP437
T8BitCodec< TEncoding_CP437 > TCodec_CP437
Definition: unicode.h:654

TUniChDb::GetScript
int GetScript(const int cp) const
Definition: unicode.h:1324

TEncoding_ISO8859_2::GetName
static TStr GetName()
Definition: unicode.h:451

ucfPrDash
Definition: unicode.h:815

TUnicode::DecodeUtf8
int DecodeUtf8(const TStr &src, TIntV &dest) const
Definition: unicode.h:1788

TUnicode::GetSubCat
TUniChSubCategory GetSubCat(const int cp) const
Definition: unicode.h:2021

THash::GetKeyId
int GetKeyId(const TKey &Key) const
Definition: hash.h:424

T8BitCodec::GetName
static TStr GetName()
Definition: unicode.h:573

TUnicode::DecodeUtf8
int DecodeUtf8(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1787

TVecElt< TChA >::TVector
TChA TVector
Definition: unicode.h:430

IsPeekAheadSkippable
#define IsPeekAheadSkippable(sbf)

ucfDcpGraphemeBase
Definition: unicode.h:756

TUniChInfo::IsIdStart
bool IsIdStart() const
Definition: unicode.h:1078

TEncoding_CP852::FromUnicode
static int FromUnicode(int c)
Definition: unicode.h:535

TChA
Definition: dt.h:201

TUniChDb::IsPrivateUse
DECLARE_FORWARDED_PROPERTY_METHODS bool IsPrivateUse(const int cp) const
Definition: unicode.h:1383

TUniChDb::HangulSBase
Definition: unicode.h:1406

TUniChDb::TSubcatHelper::SetCat
void SetCat(const int cp)
Definition: unicode.h:1744

ucfWbMidNum
Definition: unicode.h:721

TEncoding_ISO8859_2::FromUnicode
static int FromUnicode(int c)
Definition: unicode.h:455

TUniChDb::ToCaseFolded
void ToCaseFolded(TSrcVec &src, size_t srcIdx, const size_t srcCount, const bool turkic=false) const
Definition: unicode.h:1636

TUniChDb::GetCombiningClass
int GetCombiningClass(const int cp) const
Definition: unicode.h:1399

TUniCodec::replacementChar
int replacementChar
Definition: unicode.h:64

TEncoding_CP437::toUnicodeTable
static const int toUnicodeTable[8 *16]
Definition: unicode.h:510

TUniChDb::GetSentenceBreakPropertyFn
static TStr GetSentenceBreakPropertyFn()
Definition: unicode.h:1308

ucfDcpMask
Definition: unicode.h:775

TUniChDb::SbEx_Add
void SbEx_Add(const TSrcVec &v)
Definition: unicode.h:1490

ucfDcpXidContinue
Definition: unicode.h:774

TUniChDb::GetSimpleTitleCase
void GetSimpleTitleCase(const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:1606

TUniCaseFolding::LoadTxt
void LoadTxt(const TStr &fileName)
Definition: unicode.cpp:505

TUniChDb::PrintCharNames
void PrintCharNames(FILE *f, const TSrcVec &src, size_t srcIdx, const size_t srcCount, const TStr &prefix) const
Definition: unicode.h:1336

TUniCodec::IsMachineLittleEndian
static bool IsMachineLittleEndian()
Definition: unicode.cpp:83

TPair
Definition: ds.h:32

TUniChInfo::IsHyphen
bool IsHyphen() const
Definition: unicode.h:1094

TUniChDb::TUniChDb
TUniChDb(TSIn &SIn)
Definition: unicode.h:1275

THash::AddKey
int AddKey(const TKey &Key)
Definition: hash.h:331

TUniChDb::InitWordAndSentenceBoundaryFlags
void InitWordAndSentenceBoundaryFlags(const TStr &basePath)
Definition: unicode.cpp:1100

TUniChDb::Compose
void Compose(const TSrcVec &src, TVec< TDestCh > &dest, bool clrDest=true) const
Definition: unicode.h:1532

TUniChInfo::IsXidStart
bool IsXidStart() const
Definition: unicode.h:1080

ucfDcpXidStart
Definition: unicode.h:773

ucfDcpLowercase
Definition: unicode.h:752

TUniChInfo::Load
void Load(TSIn &SIn)
Definition: unicode.h:1056

TUniTrie
Definition: unicode.h:1177

DECLARE_FORWARDED_PROPERTY_METHODS
#define DECLARE_FORWARDED_PROPERTY_METHODS
Definition: unicode.h:1367

TUniChDb::TestFindNextWordOrSentenceBoundary
void TestFindNextWordOrSentenceBoundary(const TStr &basePath, bool sentence)
Definition: unicode.cpp:649

TUniChDb::SbEx_AddMulti
int SbEx_AddMulti(const TStr &words, const bool wordsAreUtf8=true)
Definition: unicode.h:1495

TEncoding_ISO8859_3::toUnicodeTable
static const int toUnicodeTable[6 *16]
Definition: unicode.h:466

TUniChInfo::chSubCat
char chSubCat
Definition: unicode.h:1017

ucfSbOLetter
Definition: unicode.h:730

TUniChInfo::simpleLowerCaseMapping
int simpleLowerCaseMapping
Definition: unicode.h:1022

TUniChInfo
Definition: unicode.h:985

TUniChInfo::IsXidContinue
bool IsXidContinue() const
Definition: unicode.h:1081

TUniChInfo::LineBreak_Unknown
static const ushort LineBreak_Unknown
Definition: unicode.h:1032

ucfSbSTerm
Definition: unicode.h:733

TUniCodec::Utf16FirstSurrogate
Definition: unicode.h:158

TUniChDb::GetCompositionExclusionsFn
static TStr GetCompositionExclusionsFn()
Definition: unicode.h:1299

TUniVecIdx
int TUniVecIdx
Definition: unicode.h:11

ucfWbKatakana
Definition: unicode.h:718

T8BitCodec::T8BitCodec
T8BitCodec()
Definition: unicode.h:570

TUnicodeException
Definition: unicode.h:28

TCodecWrapper::ToUnicode
virtual size_t ToUnicode(const TIntV &src, size_t srcIdx, const size_t srcCount, TIntV &dest, const bool clrDest=true) const
Definition: unicode.h:388

TEncoding_ISO8859_2::ToUnicode
static int ToUnicode(int c)
Definition: unicode.h:453

boMachineEndian
Definition: unicode.h:40

TUniChInfo::IsGraphemeLink
bool IsGraphemeLink() const
Definition: unicode.h:1092

TUniChDb::TSubcatHelper::subCat
TUniChSubCategory subCat
Definition: unicode.h:1723

TUniChInfo::IsQuotationMark
bool IsQuotationMark() const
Definition: unicode.h:1099

TUniChDb::LoadTxt
void LoadTxt(const TStr &basePath)
Definition: unicode.cpp:1249

TUniChDb::scripts
TStrIntH scripts
Definition: unicode.h:1265

TStrPool::~TStrPool
~TStrPool()
Definition: dt.h:789

TUniCodec::DefineByte
Definition: unicode.h:103

TStr
Definition: dt.h:412

TCodec_CP1250
T8BitCodec< TEncoding_CP1250 > TCodec_CP1250
Definition: unicode.h:655

TUniCodec::EncodeUtf16ToWords
size_t EncodeUtf16ToWords(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
Definition: unicode.h:2376

TUnicode::___UniFwd2
DECLARE_FORWARDED_PROPERTY_METHODS ___UniFwd2(IsPrivateUse, IsSurrogate) TUniChCategory GetCat(const int cp) const
Definition: unicode.h:2018

TUniChDb::TSubcatHelper::ProcessComment
void ProcessComment(TUniChDb::TUcdFileReader &reader)
Definition: unicode.h:1729

TUniChDb::ccLower
Definition: unicode.h:1584

TUniChPropertiesX_
TUniChPropertiesX_
Definition: unicode.h:961

TUnicode::cp1250
T8BitCodec< TEncoding_CP1250 > cp1250
Definition: unicode.h:1851

TEncoding_CP852
Definition: unicode.h:528

TStr::ToTrunc
TStr & ToTrunc()
Definition: dt.cpp:770

T8BitCodec::ToUnicode
size_t ToUnicode(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:597

TUniChDb::FindWordBoundaries
void FindWordBoundaries(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, TBoolV &dest) const
Definition: unicode.h:2561

TUniChInfo::decompOffset
int decompOffset
Definition: unicode.h:1023

TUniTrie::Has2Gram
bool Has2Gram(const TItem &last, const TItem &butLast) const
Definition: unicode.h:1205

TUniChDb::GetPropListFn
static TStr GetPropListFn()
Definition: unicode.h:1303

TUniChInfo::ccBelowLeftAttached
Definition: unicode.h:996

ucfPrPatternWhiteSpace
Definition: unicode.h:855

TUniChDb::ToSimpleLowerCase
void ToSimpleLowerCase(TSrcVec &src, size_t srcIdx, const size_t srcCount) const
Definition: unicode.h:1610

TUniChDb::TSubcatHelper::TestCat
void TestCat(const int cp)
Definition: unicode.h:1749

TCodecBase::FromUnicode
size_t FromUnicode(const TIntV &src, TStr &dest, const bool clrDest=true) const
Definition: unicode.h:363

TStr::SplitOnAllCh
void SplitOnAllCh(const char &SplitCh, TStrV &StrV, const bool &SkipEmpty=true) const
Definition: dt.cpp:926

THash< TInt, TIntV >

TUnicode::GetCharName
const char * GetCharName(const int cp) const
Definition: unicode.h:2024

TUniChInfo::ccLeftAttached
Definition: unicode.h:999

TSIn::LoadBf
void LoadBf(const void *Bf, const TSize &BfL)
Definition: fl.h:81

TVecElt
Definition: unicode.h:413

TUniChDb::TUcdFileReader::comment
TChA comment
Definition: unicode.h:1654

TUnicode::Compose
void Compose(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1941

TUniChInfo::ccBelowLeft
Definition: unicode.h:1004

TUnicode::GetSimpleTitleCase
void GetSimpleTitleCase(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1974

TUniCodec::DecodeUtf8
size_t DecodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:2036

AlwaysTrue
bool AlwaysTrue()
Definition: unicode.h:3234

TUniChInfo::IsIdContinue
bool IsIdContinue() const
Definition: unicode.h:1079

ucfPrDeprecated
Definition: unicode.h:818

ucfPrExtender
Definition: unicode.h:824

TUniCaseFolding::cfSimple
TIntH cfSimple
Definition: unicode.h:274

TUniChDb::GetUpperCase
void GetUpperCase(const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
Definition: unicode.h:1594

TEncoding_YuAscii::GetName
static TStr GetName()
Definition: unicode.h:494

TUniChDb::GetLowerCase
void GetLowerCase(const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
Definition: unicode.h:1593

TCodecBase::New
static PCodecBase New()
Definition: unicode.h:404

THash::Clr
void Clr(const bool &DoDel=true, const int &NoDelLim=-1, const bool &ResetDat=true)
Definition: hash.h:319

TestCurNext2
#define TestCurNext2(curFlag, nextFlag, next2Flag)

TPt
Definition: bd.h:196

TCodecBase::ToUnicode
virtual size_t ToUnicode(const TIntV &src, size_t srcIdx, const size_t srcCount, TIntV &dest, const bool clrDest=true) const =0

TUniChDb::TUcdFileReader::~TUcdFileReader
~TUcdFileReader()
Definition: unicode.h:1685

TUniTrie::TItemTr
TTriple< TItem, TItem, TItem > TItemTr
Definition: unicode.h:1192

TEncoding_ISO8859_4
Definition: unicode.h:476

TUniCodec::Utf16SecondSurrogate
Definition: unicode.h:159

TEncoding_CP437
Definition: unicode.h:506

TUniChInfo::IsCased
bool IsCased() const
Definition: unicode.h:1142

TUniChDb::charNames
TStrPool charNames
Definition: unicode.h:1264

TUniChInfo::GetSbFlags
int GetSbFlags() const
Definition: unicode.h:1128

TUniChInfo::ccRightAttached
Definition: unicode.h:1000

TVec::Gen
void Gen(const TSizeTy &_Vals)
Constructs a vector (an array) of _Vals elements.
Definition: ds.h:495

TUniChInfo::IsSTerminal
bool IsSTerminal() const
Definition: unicode.h:1101

TUniChDb::GetSimpleUpperCase
void GetSimpleUpperCase(const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:1605

ucfPrNoncharacterCodePoint
Definition: unicode.h:852

TUniCaseFolding::Save
void Save(TSOut &SOut) const
Definition: unicode.h:287

TUniChDb::FindSentenceBoundaries
void FindSentenceBoundaries(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, TBoolV &dest) const
Definition: unicode.h:2793

TUnicode::Decompose
void Decompose(const TIntV &src, TIntV &dest, bool compatibility) const
Definition: unicode.h:1934

TUnicode::EncodeUtf8
int EncodeUtf8(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1792

TEncoding_ISO8859_4::toUnicodeTable
static const int toUnicodeTable[6 *16]
Definition: unicode.h:480

TUniChInfo::IsProperty
bool IsProperty(const TUniChProperties flag) const
Definition: unicode.h:1084

ucfSbSep
Definition: unicode.h:725

ucfPxRadical
Definition: unicode.h:976

TUnicode::GetTitleCase
void GetTitleCase(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1967

TUniCodec::TestUtf16
void TestUtf16()
Definition: unicode.cpp:408

TEncoding_CP437::FromUnicode
static int FromUnicode(int c)
Definition: unicode.h:513

TStr::SplitOnWs
void SplitOnWs(TStrV &StrV) const
Definition: dt.cpp:972

TUniChInfo::IsWbFlag
bool IsWbFlag(const TUniChFlags flag) const
Definition: unicode.h:1115

TUniChDb::TUcdFileReader::TUcdFileReader
TUcdFileReader()
Definition: unicode.h:1681

TUniChInfo::script
signed char script
Definition: unicode.h:1021

TUniChInfo::nameOffset
int nameOffset
Definition: unicode.h:1024

ucfPxUnifiedIdeograph
Definition: unicode.h:977

TUnicode::GetAllCodecs
void GetAllCodecs(TCodecBaseV &dest) const
Definition: unicode.h:1887

TUniChDb::scriptUnknown
int scriptUnknown
Definition: unicode.h:1272

TUnicode::TUnicode
TUnicode(const TStr &fnBinUcd)
Definition: unicode.h:1778

T8BitCodec::errorHandling
TUnicodeErrorHandling errorHandling
Definition: unicode.h:567

TStr::CStr
char * CStr()
Definition: dt.h:476

TUniChInfo::SaveSChar
static void SaveSChar(TSOut &SOut, signed char u)
Definition: unicode.h:1048

TEncoding_CP1250::FromUnicode
static int FromUnicode(int c)
Definition: unicode.h:550

THash::IsKey
bool IsKey(const TKey &Key) const
Definition: hash.h:216

T8BitCodec::ToUnicode
size_t ToUnicode(const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:610

TCodecBase::CRef
TCRef CRef
Definition: unicode.h:335

TUniChInfo::IsTerminalPunctuation
bool IsTerminalPunctuation() const
Definition: unicode.h:1102

TUniChInfo::IsMath
bool IsMath() const
Definition: unicode.h:1074

ucfSbATerm
Definition: unicode.h:732

TVec::Add
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574

TEncoding_ISO8859_4::fromUnicodeTable2
static const int fromUnicodeTable2[2 *16]
Definition: unicode.h:480

TUnicode::EncodeUtf16ToBytes
int EncodeUtf16ToBytes(const TIntV &src, TIntV &dest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
Definition: unicode.h:1838

TUniChDb::caseFolding
TUniCaseFolding caseFolding
Definition: unicode.h:1268

TEncoding_ISO8859_2::fromUnicodeTable2
static const int fromUnicodeTable2[2 *16]
Definition: unicode.h:452

TUniTrie::TNode::TNode
TNode(const TItem &item_, const int child_, const int sib_, const bool terminal_)
Definition: unicode.h:1188

TUniChInfo::IsIdeographic
bool IsIdeographic() const
Definition: unicode.h:1095

TUniChInfo::TUniChInfo
TUniChInfo()
Definition: unicode.h:1062

ucfSbLower
Definition: unicode.h:728

TUniChDb::specialCasingTitle
TIntIntVH specialCasingTitle
Definition: unicode.h:1271

TUniTrie::TNode::TNode
TNode()
Definition: unicode.h:1187

TUniChInfo::IsJoinControl
bool IsJoinControl() const
Definition: unicode.h:1096

TUniCaseFolding::FoldInPlace
void FoldInPlace(TSrcVec &src, size_t srcIdx, const size_t srcCount, const bool turkic) const
Definition: unicode.h:307

boLittleEndian
Definition: unicode.h:41

TUniChInfo::flags
int flags
Definition: unicode.h:1025

TUnicode::EncodeUtf16ToWords
int EncodeUtf16ToWords(const TIntV &src, TIntV &dest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
Definition: unicode.h:1834

TUniChDb::DecomposeAndCompose
void DecomposeAndCompose(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const
Definition: unicode.h:3148

TUnicodeErrorHandling_
TUnicodeErrorHandling_
Definition: unicode.h:18

TEncoding_ISO8859_3::fromUnicodeTable1
static const int fromUnicodeTable1[14 *16]
Definition: unicode.h:466

AlwaysFalse
bool AlwaysFalse()
Definition: unicode.h:3227

TEncoding_CP1250::fromUnicodeTable1
static const int fromUnicodeTable1[14 *16]
Definition: unicode.h:547

TCodecBase::~TCodecBase
virtual ~TCodecBase()
Definition: unicode.h:338

TUniChInfo::ccAboveLeft
Definition: unicode.h:1009

ucfPrTerminalPunctuation
Definition: unicode.h:917

ucfPxOtherLowercase
Definition: unicode.h:970

TUnicode::codecs
THash< TStr, PCodecBase > codecs
Definition: unicode.h:1869

TEncoding_CP437::fromUnicodeTable4
static const int fromUnicodeTable4[11 *16]
Definition: unicode.h:510

TUniChDb::SbEx_Set
void SbEx_Set(const TUniTrie< TInt > &newTrie)
Definition: unicode.h:1498

TUniChInfo::ccOverlaysAndInterior
Definition: unicode.h:990

TUniChInfo::ccAbove
Definition: unicode.h:1010

TUniChInfo::ccDoubleBelow
Definition: unicode.h:1012

THash::GetKey
const TKey & GetKey(const int &KeyId) const
Definition: hash.h:210

TEncoding_ISO8859_3::fromUnicodeTable2
static const int fromUnicodeTable2[2]
Definition: unicode.h:466

TUniChDb::TestWbFindNonIgnored
void TestWbFindNonIgnored() const
Definition: unicode.cpp:619

ucfSbFormat
Definition: unicode.h:726

ucfWbMidLetter
Definition: unicode.h:720

TUniChDb::TUcdFileReader::ParseCodePoint
static int ParseCodePoint(const TStr &s)
Definition: unicode.h:1695

TUniChDb::TSubcatHelper::hasCat
bool hasCat
Definition: unicode.h:1723

TEncoding_ISO8859_4::GetName
static TStr GetName()
Definition: unicode.h:479

TUniChProperties_
TUniChProperties_
Definition: unicode.h:780

TCodecWrapper::FromUnicode
virtual size_t FromUnicode(const TIntV &src, size_t srcIdx, const size_t srcCount, TIntV &dest, const bool clrDest=true) const
Definition: unicode.h:393

TUniChDb::GetCaseFoldingFn
static TStr GetCaseFoldingFn()
Definition: unicode.h:1296

TEncoding_CP437::ToUnicode
static int ToUnicode(int c)
Definition: unicode.h:511

TCodecWrapper::TCodecImpl
TCodecImpl_ TCodecImpl
Definition: unicode.h:380

TUniChInfo::IsSoftDotted
bool IsSoftDotted() const
Definition: unicode.h:1100

TUniChInfo::IsBidiControl
bool IsBidiControl() const
Definition: unicode.h:1087

TUniChDb::TUcdFileReader::putBackCh
int putBackCh
Definition: unicode.h:1657

TUniTrie::GetChild
int GetChild(const int parentIdx, const TItem &item) const
Definition: unicode.h:1209

ucfPrLogicalOrderException
Definition: unicode.h:850

TestPrevCurNext
#define TestPrevCurNext(prevFlag, curFlag, nextFlag)

TUniCodec::EncodeUtf8
size_t EncodeUtf8(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:2152

TUniChDb::GetTitleCase
void GetTitleCase(const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
Definition: unicode.h:1595

TVec
Vector is a sequence TVal objects representing an array that can change in size.
Definition: ds.h:429

TUniChDb::GetAuxiliaryDir
static TStr GetAuxiliaryDir()
Definition: unicode.h:1304

ucfWbExtendNumLet
Definition: unicode.h:723

TUnicode::iso8859_2
T8BitCodec< TEncoding_ISO8859_2 > iso8859_2
Definition: unicode.h:1847

TUniChDb::GetScriptNameHiragana
static TStr GetScriptNameHiragana()
Definition: unicode.h:1319

TUniChDb::TVecIdx
TUniVecIdx TVecIdx
Definition: unicode.h:1260

TUniChInfo::GetLineBreakCode
static ushort GetLineBreakCode(char c1, char c2)
Definition: unicode.h:1031

DefineUniCat
#define DefineUniCat(cat, c)
Definition: unicode.h:664

TUniChDb::SbEx_SetStdEnglish
int SbEx_SetStdEnglish()
Definition: unicode.h:1499

TUniChDb::GetCaseConverted
void GetCaseConverted(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const
Definition: unicode.h:2817

uehIgnore
Definition: unicode.h:21

TCodecBase::ToUnicode
size_t ToUnicode(const TIntV &src, TIntV &dest, const bool clrDest=true) const
Definition: unicode.h:352

T8BitCodec::TEncoding
TEncoding_ TEncoding
Definition: unicode.h:566

TUniCodec::WordsToBytes
void WordsToBytes(const TIntV &src, TIntV &dest)
Definition: unicode.cpp:274