SNAP Library 6.0, Developer Reference  2020-12-09 16:24:20
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
unicode.h
Go to the documentation of this file.
1 #include "bd.h"
2 
3 //#ifndef unicode_h
4 //#define unicode_h
5 
7 // Includes
8 //#include "base.h"
9 #include <new>
10 
11 typedef int TUniVecIdx;
12 
13 //-----------------------------------------------------------------------------
14 // TUniCodec -- a UTF-8 and UTF-16 Encoder/Decoder
15 //-----------------------------------------------------------------------------
16 
17 // Error handling modes for the TUniCodec class.
19 {
20  // What happens when an error occurs:
21  uehIgnore = 0, // - it is silently ignored (nothing is added to the output vector)
22  uehThrow = 1, // - an exception is thrown (TUnicodeException)
23  uehReplace = 2, // - the replacement character is added to the output vector
24  uehAbort = 3 // - the encoding/decoding process stops immediately
25 }
27 
29 {
30 public:
31  TStr message; // error message
32  size_t srcIdx; // the position in the source vector where the error occurred
33  int srcChar; // the source character at the position srcIdx
34  TUnicodeException(size_t srcIdx_, int srcChar_, const TStr& message_) :
35  message(message_), srcIdx(srcIdx_), srcChar(srcChar_) { }
36 };
37 
38 typedef enum TUniByteOrder_
39 {
43 }
45 
46 typedef enum TUtf16BomHandling_
47 {
48  bomAllowed = 0, // if a BOM is present, it is used to determine the byte order; otherwise, the default byte order is used
49  bomRequired = 1, // if a BOM is present, it is used to determine the byte order; otherwise, an error is reported
50  bomIgnored = 2 // the default byte order is used; if a BOM is present, it is treated like any other character
51 }
53 
54 class TUniCodec
55 {
56 public:
57  // 0xfffd is defined as the replacement character by the Unicode standard.
58  // By default, it is rendered as a question mark inside a diamond: "<?>".
59  enum { DefaultReplacementChar = 0xfffd };
60 
61  // The replacement character is inserted into the destination vector
62  // if an error occurs in the source vector. By default, this is set
63  // to DefaultReplacementChar.
65  // The error handling mode.
67  // There are a number of situations where there is strictly speaking an error in
68  // the source data although it can still be decoded in a reasonably meaningful way.
69  // If strict == true, these situations are treated as errors. Examples:
70  // - when decoding UTF-8:
71  // - a codepoint represented by more bytes than necessary (e.g. one of the characters 0..127
72  // encoded as a two-byte sequence)
73  // - a codepoint > 0x10ffff
74  // - when decoding UTF-16:
75  // - a codepoint from the range reserved for the second character of a surrogate pair
76  // is not preceded by a codepoint from the range reserved for the first character of a surrogate pair
77  // - when encoding UTF-8:
78  // - a codepoint > 0x10ffff
79  // - when encoding UTF-16:
80  // - a codepoint from the range reserved for the second character of a surrogate pair
81  // [note that a codepoint > 0x10ffff, or from the range reserved for the first character of a
82  // surrogate pair, is always an error, even with strict == false]
83  bool strict;
84  // skipBom == true means: If a byte-order-mark (0xfffe or 0xfeff) occurs at the beginning
85  // of the source vector, it is skipped (when decoding).
86  // - Note: a BOM is not really useful in UTF-8 encoded data. However, the .NET UTF8Encoding
87  // emits 0xfeff by default as a kind of preamble. It gets encoded as 3 bytes, ef bb bf,
88  // and can be helpful to make the data easier to recognize as UTF-8 encoded data.
89  bool skipBom;
90 
91  TUniCodec() : replacementChar(DefaultReplacementChar), errorHandling(uehIgnore), strict(false), skipBom(true)
92  {
93  }
94 
95  TUniCodec(TUnicodeErrorHandling errorHandling_, bool strict_, int replacementChar_, bool skipBom_) :
96  replacementChar(replacementChar_), errorHandling(errorHandling_), strict(strict_), skipBom(skipBom_)
97  {
98  }
99 
100 protected:
 // Named single-byte bit patterns. DefineByte(b7, ..., b0) declares an enumerator
 // whose name spells the eight bits as two nibbles and whose value is that byte,
 // e.g. DefineByte(1, 1, 0, 0, 0, 0, 0, 0) yields _1100_0000 = 0xc0.
 // NOTE(review): presumably the first seven entries are UTF-8 lead/continuation
 // byte markers and the last five the matching payload masks; the code that uses
 // them is not visible in this excerpt -- confirm against the implementation.
101  enum {
102 #define DefineByte(b7, b6, b5, b4, b3, b2, b1, b0) _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0
103  DefineByte(1, 0, 0, 0, 0, 0, 0, 0),
104  DefineByte(1, 1, 0, 0, 0, 0, 0, 0),
105  DefineByte(1, 1, 1, 0, 0, 0, 0, 0),
106  DefineByte(1, 1, 1, 1, 0, 0, 0, 0),
107  DefineByte(1, 1, 1, 1, 1, 0, 0, 0),
108  DefineByte(1, 1, 1, 1, 1, 1, 0, 0),
109  DefineByte(1, 1, 1, 1, 1, 1, 1, 0),
110  DefineByte(0, 0, 1, 1, 1, 1, 1, 1),
111  DefineByte(0, 0, 0, 1, 1, 1, 1, 1),
112  DefineByte(0, 0, 0, 0, 1, 1, 1, 1),
113  DefineByte(0, 0, 0, 0, 0, 1, 1, 1),
114  DefineByte(0, 0, 0, 0, 0, 0, 1, 1)
 // The helper macro is only needed while the enumerators are being declared.
115 #undef DefineByte
116  };
117 
119  //friend class TUniChDb;
120  friend class TUniCaseFolding;
121  friend class TUnicode;
122 
123 public:
124 
125  //-----------------------------------------------------------------------
126  // UTF-8
127  //-----------------------------------------------------------------------
128 
129  // Returns the number of characters that have been successfully decoded.
130  // This does not include any replacement characters that may have been inserted into 'dest'.
131  template<typename TSrcVec, typename TDestCh>
132  size_t DecodeUtf8(
133  const TSrcVec& src, size_t srcIdx, const size_t srcCount,
134  TVec<TDestCh>& dest, const bool clrDest = true) const;
135  template<typename TSrcVec, typename TDestCh>
136  size_t DecodeUtf8(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return DecodeUtf8(src, 0, src.Len(), dest, clrDest); }
137 
138  // Returns the number of characters that have been successfully encoded.
139  // This does not include any replacement characters that may have been inserted into 'dest'.
140  template<typename TSrcVec, typename TDestCh>
141  size_t EncodeUtf8(
142  const TSrcVec& src, size_t srcIdx, const size_t srcCount,
143  TVec<TDestCh>& dest, const bool clrDest = true) const;
144  template<typename TSrcVec, typename TDestCh>
145  size_t EncodeUtf8(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return EncodeUtf8(src, 0, src.Len(), dest, clrDest); }
146 
147  // The following wrappers around the UTF-8 encoder return a TStr containing
148  // the UTF-8-encoded version of the input string.
149  template<typename TSrcVec> TStr EncodeUtf8Str(const TSrcVec& src, size_t srcIdx, const size_t srcCount) const { TVec<char> temp; EncodeUtf8(src, srcIdx, srcCount, temp); TStr retVal = &(temp[0]); return retVal; }
150  template<typename TSrcVec> TStr EncodeUtf8Str(const TSrcVec& src) const { TVec<char> temp; EncodeUtf8(src, temp); temp.Add(0); TStr retVal = &(temp[0]); return retVal; }
151 
152  //-----------------------------------------------------------------------
153  // UTF-16 Decoder
154  //-----------------------------------------------------------------------
155 
156 protected:
157  enum {
160  };
161 
162  static bool IsMachineLittleEndian();
163 
164 public:
165 
166  // Returns the number of characters that have been successfully decoded.
167  // This does not include any replacement characters that may have been inserted into 'dest'.
168  // Each element of 'src' is assumed to contain one byte of data.
169  // srcCount must be even (though srcIdx doesn't need to be).
170  template<typename TSrcVec, typename TDestCh>
171  size_t DecodeUtf16FromBytes(
172  const TSrcVec& src, size_t srcIdx, const size_t srcCount,
173  TVec<TDestCh>& dest, const bool clrDest,
174  const TUtf16BomHandling bomHandling = bomAllowed,
175  const TUniByteOrder defaultByteOrder = boMachineEndian) const;
176 
177  // Here, each element of 'src' is treated as a 16-bit word. The byte-order settings
178  // are used to determine if the two bytes of each word should be swapped before further
179  // processing. For example, if a BOM is present, it must have the value 0xfeff; if it
180  // actually has the value 0xfffe, this means that the two bytes of each word must be swapped.
181  // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the
182  // beginning of the source data is used to determine the "original" byte order of the data;
183  // if this doesn't match the byte order of the local machine, the two bytes of each word will
184  // be swapped during the decoding process.
185  template<typename TSrcVec, typename TDestCh>
186  size_t DecodeUtf16FromWords(
187  const TSrcVec& src, size_t srcIdx, const size_t srcCount,
188  TVec<TDestCh>& dest, bool clrDest,
189  const TUtf16BomHandling bomHandling = bomAllowed,
190  const TUniByteOrder defaultByteOrder = boMachineEndian) const;
191 
192  //-----------------------------------------------------------------------
193  // UTF-16 Encoder
194  //-----------------------------------------------------------------------
195 
196  // Returns the number of characters that have been successfully encoded.
197  // This does not include any replacement characters that may have been inserted into 'dest'.
198  //
199  // Notes:
200  // - UTF-16 *cannot* encode characters above 0x10ffff, so their presence is always
201  // treated as an error, regardless of the value of 'strict'.
202  // - Characters from the range Utf16FirstSurrogate through Utf16FirstSurrogate + 1023
203  // cannot be encoded by UTF-16 either, as they would be misinterpreted during decoding
204  // as the first character of a surrogate pair.
205  // - Characters from the range Utf16SecondSurrogate through Utf16SecondSurrogate + 1023
206  // can be encoded in principle; however, if strict == true, they are treated as errors.
207  template<typename TSrcVec, typename TDestCh>
208  size_t EncodeUtf16ToWords(
209  const TSrcVec& src, size_t srcIdx, const size_t srcCount,
210  TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
211  const TUniByteOrder destByteOrder = boMachineEndian) const;
212 
213  template<typename TSrcVec, typename TDestCh>
214  size_t EncodeUtf16ToBytes(
215  const TSrcVec& src, size_t srcIdx, const size_t srcCount,
216  TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
217  const TUniByteOrder destByteOrder = boMachineEndian) const;
218 
219  //-----------------------------------------------------------------------
220  // Helper declarations for the test drivers
221  //-----------------------------------------------------------------------
222 
223 protected:
224 
225  static uint GetRndUint(TRnd& rnd);
226  static uint GetRndUint(TRnd& rnd, uint minVal, uint maxVal);
227 
228  //-----------------------------------------------------------------------
229  // UTF-8 Test Driver
230  //-----------------------------------------------------------------------
231 
232 protected:
233  void TestUtf8(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest, FILE *f);
234  // Generates a random UTF-8-encoded stream according to the specifications in 'testCaseDesc',
235  // then calls TestUtf8 to make sure that DecodeUtf8 reacts as expected.
236  void TestDecodeUtf8(TRnd& rnd, const TStr& testCaseDesc);
237 public:
238  void TestUtf8();
239 
240  //-----------------------------------------------------------------------
241  // UTF-16 Test Driver
242  //-----------------------------------------------------------------------
243 
244 protected:
245  void WordsToBytes(const TIntV& src, TIntV& dest);
246  void TestUtf16(bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV& src, const TIntV& expectedDest,
247  // Note: insertBom is only used with the encoder. When encoding, 'defaultByteOrder' is used as the destination byte order.
248  const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom,
249  FILE *f);
250  static inline int SwapBytes(int x) {
251  return ((x >> 8) & 0xff) | ((x & 0xff) << 8); }
252  // Generates a random UTF-16-encoded stream according to the specifications in 'testCaseDesc',
253  // then calls TestUtf16 to make sure that DecodeUtf16 reacts as expected.
254  void TestDecodeUtf16(TRnd& rnd, const TStr& testCaseDesc,
255  const TUtf16BomHandling bomHandling,
256  const TUniByteOrder defaultByteOrder,
257  const bool insertBom);
258 public:
259  void TestUtf16();
260 
261 };
262 
263 //-----------------------------------------------------------------------------
264 // Case folding
265 //-----------------------------------------------------------------------------
266 // Note: there's no need to access this class directly.
267 // Use TUniChDb::GetCaseFolded() instead.
268 
270 
272 {
273 protected:
275  TIntIntVH cfFull;
276 
277  template<typename TSrcDat, typename TDestDat>
278  inline static void AppendVector(const TVec<TSrcDat>& src, TVec<TDestDat>& dest) {
279  for (int i = 0; i < src.Len(); i++) dest.Add(src[i]); }
280  friend class TUniChDb;
282 
283 public:
285  explicit TUniCaseFolding(TSIn& SIn) : cfCommon(SIn), cfSimple(SIn), cfTurkic(SIn), cfFull(SIn) { SIn.LoadCs(); }
286  void Load(TSIn& SIn) { cfCommon.Load(SIn); cfSimple.Load(SIn); cfFull.Load(SIn); cfTurkic.Load(SIn); SIn.LoadCs(); }
287  void Save(TSOut& SOut) const { cfCommon.Save(SOut); cfSimple.Save(SOut); cfFull.Save(SOut); cfTurkic.Save(SOut); SOut.SaveCs(); }
288  void Clr() { cfCommon.Clr(); cfSimple.Clr(); cfFull.Clr(); cfTurkic.Clr(); }
289  void LoadTxt(const TStr& fileName);
290 
291  // Use 'turkic' when processing text in a Turkic language (tr, az). This only affects the uppercase I and I-with-dot-above.
292  template<typename TSrcVec, typename TDestCh>
293  void Fold(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
294  TVec<TDestCh>& dest, const bool clrDest, const bool full, const bool turkic) const
295  {
296  for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
297  {
298  int c = src[TVecIdx(srcIdx)], i; srcIdx++;
299  if (turkic && ((i = cfTurkic.GetKeyId(c)) >= 0)) { dest.Add(cfTurkic[i]); continue; }
300  if (full && ((i = cfFull.GetKeyId(c)) >= 0)) { AppendVector(cfFull[i], dest); continue; }
301  if ((! full) && ((i = cfSimple.GetKeyId(c)) >= 0)) { dest.Add(cfSimple[i]); continue; }
302  i = cfCommon.GetKeyId(c); if (i >= 0) dest.Add(cfCommon[i]); else dest.Add(c);
303  }
304  }
305 
306  template<typename TSrcVec>
307  void FoldInPlace(TSrcVec& src, size_t srcIdx, const size_t srcCount, const bool turkic) const
308  {
309  for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
310  {
311  int c = src[TVecIdx(srcIdx)], i;
312  if (turkic && ((i = cfTurkic.GetKeyId(c)) >= 0)) { src[TVecIdx(srcIdx)] = cfTurkic[i]; continue; }
313  if ((i = cfSimple.GetKeyId(c)) >= 0) { src[TVecIdx(srcIdx)] = cfSimple[i]; continue; }
314  i = cfCommon.GetKeyId(c); if (i >= 0) src[TVecIdx(srcIdx)] = cfCommon[i];
315  }
316  }
317 
318 protected:
319  void Test(const TIntV& src, const TIntV& expectedDest, const bool full, const bool turkic, FILE *f);
320 public:
321  void Test();
322 };
323 
324 //-----------------------------------------------------------------------------
325 // TCodecBase -- an abstract base class for codecs
326 //-----------------------------------------------------------------------------
327 
331 
333 {
334 protected:
336  friend class TPt<TCodecBase>;
337 public:
338  virtual ~TCodecBase() { }
339 
340  template<class TCodecImpl>
341  static PCodecBase New(); /* {
342  return new TCodecWrapper<TCodecImpl>(); } */
343 
344  virtual TStr GetName() const = 0;
345  virtual void Test() const { }
346 
347  // Returns the number of characters that have been successfully decoded.
348  // This does not include any replacement characters that may have been inserted into 'dest'.
349  virtual size_t ToUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
350  virtual size_t ToUnicode(const TStr& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
351 
352  size_t ToUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
353  size_t ToUnicode(const TStr& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
354 
355  // Returns the number of characters that have been successfully encoded.
356  // This does not include any replacement characters that may have been inserted into 'dest'.
357  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const = 0;
358  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TChA& dest, const bool clrDest = true) const = 0;
359  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const = 0;
360 
361  size_t FromUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
362  size_t FromUnicode(const TIntV& src, TChA& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
363  size_t FromUnicode(const TIntV& src, TStr& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
364 };
365 
366 //-----------------------------------------------------------------------------
367 // TCodecWrapper -- a descendant of TCodecBase; relies on a template
368 // parameter class for the actual implementation of the codec.
369 //-----------------------------------------------------------------------------
370 // Thus, if you know in advance that you'll need ISO-8859-2, just use
371 // T8BitCodec<TEncoding_ISO8859_2>. If you don't know the encoding
372 // in advance, use a PCodecBase pointing to a suitable specialization
373 // of TCodecWrapper<...>. You can call TUnicode::GetCodec(TStr& name)
374 // to obtain a suitable pointer.
375 
376 template<class TCodecImpl_>
377 class TCodecWrapper : public TCodecBase
378 {
379 public:
380  typedef TCodecImpl_ TCodecImpl;
381  TCodecImpl impl;
382 public:
383 
384  virtual TStr GetName() const { return impl.GetName(); }
385 
386  virtual void Test() const { impl.Test(); }
387 
388  virtual size_t ToUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
389  return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); }
390  virtual size_t ToUnicode(const TStr& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
391  return impl.ToUnicode(src, srcIdx, srcCount, dest, clrDest); }
392 
393  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TIntV& dest, const bool clrDest = true) const {
394  return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); }
395  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TChA& dest, const bool clrDest = true) const {
396  return impl.FromUnicode(src, srcIdx, srcCount, dest, clrDest); }
397  virtual size_t FromUnicode(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const {
398  TChA buf; size_t retVal = impl.FromUnicode(src, srcIdx, srcCount, buf, false);
399  if (clrDest) dest += buf.CStr(); else dest = buf.CStr();
400  return retVal; }
401 };
402 
403 template<class TCodecImpl>
404 PCodecBase TCodecBase::New() {
405  return new TCodecWrapper<TCodecImpl>();
406 }
407 
408 //-----------------------------------------------------------------------------
409 // TVecElt -- a template for determining the type of a vector's elements
410 //-----------------------------------------------------------------------------
411 
// Primary template: empty. The element type and the Add() helper are provided
// only by the specializations, so code that uses TVecElt<V>::TElement or
// TVecElt<V>::Add with a vector type that has no specialization fails to compile.
412 template<class TVector_>
413 class TVecElt
414 {
415 };
416 
417 template<class TDat>
418 class TVecElt<TVec<TDat> >
419 {
420 public:
422  typedef TDat TElement;
423  static inline void Add(TVector& vector, const TElement& element) { vector.Add(element); }
424 };
425 
426 template<>
427 class TVecElt<TChA>
428 {
429 public:
430  typedef TChA TVector;
431  typedef char TElement;
432  static inline void Add(TVector& vector, const TElement& element) { vector += element; }
433 };
434 
435 
436 //-----------------------------------------------------------------------------
437 // T8BitCodec -- a class for converting between 8-bit encodings and Unicode
438 //-----------------------------------------------------------------------------
439 
441 {
442 public:
443  static inline TStr GetName() { return "ISO-8859-1"; }
444  static int ToUnicode(int c) { Assert(0 <= c && c <= 255); return c; }
445  static int FromUnicode(int c) { if (0 <= c && c <= 255) return c; else return -1; }
446 };
447 
448 class TEncoding_ISO8859_2 // ISO Latin 2
449 {
450 public:
451  static inline TStr GetName() { return "ISO-8859-2"; }
452  static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16];
453  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
454  if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
455  static int FromUnicode(int c) {
456  if (0 <= c && c < 0xa0) return c;
457  else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
458  else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
459  else return -1; }
460 };
461 
463 {
464 public:
465  static inline TStr GetName() { return "ISO-8859-3"; }
466  static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2];
467  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
468  if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
469  static int FromUnicode(int c) {
470  if (0 <= c && c < 0xa0) return c;
471  else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
472  else if (0x2d8 <= c && c < 0x2da) return fromUnicodeTable2[c - 0x2d8];
473  else return -1; }
474 };
475 
477 {
478 public:
479  static inline TStr GetName() { return "ISO-8859-4"; }
480  static const int toUnicodeTable[6 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16];
481  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
482  if (c < 0xa0) return c; else return toUnicodeTable[c - 0xa0]; }
483  static int FromUnicode(int c) {
484  if (0 <= c && c < 0xa0) return c;
485  else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
486  else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
487  else return -1; }
488 };
489 
491 {
492 public:
493  static const int uniChars[10], yuAsciiChars[10];
494  static inline TStr GetName() { return "YU-ASCII"; }
495  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
496  for (int i = 0; i < int(sizeof(yuAsciiChars) / sizeof(yuAsciiChars[0])); i++)
497  if (c == yuAsciiChars[i]) return uniChars[i];
498  return c; }
499  static int FromUnicode(int c) {
500  for (int i = 0; i < int(sizeof(uniChars) / sizeof(uniChars[0])); i++)
501  if (c == uniChars[i]) return yuAsciiChars[i];
502  else if(c == yuAsciiChars[i]) return -1;
503  if (0 <= c && c <= 255) return c; else return -1; }
504 };
505 
506 class TEncoding_CP437 // DOS US
507 {
508 public:
509  static inline TStr GetName() { return "CP437"; }
510  static const int toUnicodeTable[8 * 16], fromUnicodeTable1[6 * 16], fromUnicodeTable2[4 * 16], fromUnicodeTable3[6 * 16], fromUnicodeTable4[11 * 16];
511  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
512  if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
513  static int FromUnicode(int c) {
514  if (0 <= c && c < 0x80) return c;
515  else if (0xa0 <= c && c < 0x100) return fromUnicodeTable1[c - 0xa0];
516  else if (0x390 <= c && c < 0x3d0) return fromUnicodeTable2[c - 0x390];
517  else if (0x2210 <= c && c < 0x2270) return fromUnicodeTable3[c - 0x2210];
518  else if (0x2500 <= c && c < 0x25b0) return fromUnicodeTable4[c - 0x2500];
519  else if (c == 0x192) return 0x9f;
520  else if (c == 0x207f) return 0xfc;
521  else if (c == 0x20a7) return 0x9e;
522  else if (c == 0x2310) return 0xa9;
523  else if (c == 0x2320) return 0xf4;
524  else if (c == 0x2321) return 0xf5;
525  else return -1; }
526 };
527 
528 class TEncoding_CP852 // DOS Latin 2
529 {
530 public:
531  static inline TStr GetName() { return "CP852"; }
532  static const int toUnicodeTable[8 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16], fromUnicodeTable3[11 * 16];
533  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
534  if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
535  static int FromUnicode(int c) {
536  if (0 <= c && c < 0x80) return c;
537  else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
538  else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
539  else if (0x2500 <= c && c < 0x25b0) return fromUnicodeTable3[c - 0x2500];
540  else return -1; }
541 };
542 
543 class TEncoding_CP1250 // Windows-1250, similar to ISO Latin 2
544 {
545 public:
546  static inline TStr GetName() { return "CP1250"; }
547  static const int toUnicodeTable[8 * 16], fromUnicodeTable1[14 * 16], fromUnicodeTable2[2 * 16], fromUnicodeTable3[3 * 16];
548  static int ToUnicode(int c) { Assert(0 <= c && c <= 255);
549  if (c < 0x80) return c; else return toUnicodeTable[c - 0x80]; }
550  static int FromUnicode(int c) {
551  if (0 <= c && c < 0x80) return c;
552  else if (0xa0 <= c && c < 0x180) return fromUnicodeTable1[c - 0xa0];
553  else if (0x2c0 <= c && c < 0x2e0) return fromUnicodeTable2[c - 0x2c0];
554  else if (0x2010 <= c && c < 0x2040) return fromUnicodeTable3[c - 0x2010];
555  else if (c == 0x20ac) return 0x80;
556  else if (c == 0x2122) return 0x99;
557  else return -1; }
558 };
559 
560 template<class TEncoding_>
562 {
563 protected:
565 public:
566  typedef TEncoding_ TEncoding;
569 
570  T8BitCodec() : errorHandling(uehIgnore), replacementChar(TUniCodec::DefaultReplacementChar) { }
571  T8BitCodec(TUnicodeErrorHandling errorHandling_, int replacementChar_ = TUniCodec::DefaultReplacementChar) :
572  errorHandling(errorHandling_), replacementChar(replacementChar_) { }
573  static TStr GetName() { return TEncoding::GetName(); }
574 
575  void Test() const
576  {
577  int nDecoded = 0;
578  for (int c = 0; c <= 255; c++) {
579  int cu = TEncoding::ToUnicode(c); if (cu == -1) continue;
580  nDecoded++;
581  IAssert(0 <= cu && cu < 0x110000);
582  int c2 = TEncoding::FromUnicode(cu);
583  IAssert(c2 == c); }
584  int nEncoded = 0;
585  for (int cu = 0; cu < 0x110000; cu++) {
586  int c = TEncoding::FromUnicode(cu); if (c == -1) continue;
587  nEncoded++;
588  IAssert(0 <= c && c <= 255);
589  int cu2 = TEncoding::ToUnicode(c);
590  IAssert(cu2 == cu); }
591  IAssert(nDecoded == nEncoded);
592  }
593 
594  // Returns the number of characters that have been successfully decoded.
595  // This does not include any replacement characters that may have been inserted into 'dest'.
596  template<typename TSrcVec, typename TDestCh>
597  size_t ToUnicode(
598  const TSrcVec& src, size_t srcIdx, const size_t srcCount,
599  TVec<TDestCh>& dest, const bool clrDest = true) const
600  {
601  if (clrDest) dest.Clr();
602  size_t toDo = srcCount;
603  while (toDo-- > 0) {
604  int chSrc = ((int) src[TVecIdx(srcIdx)]) & 0xff; srcIdx++;
605  int chDest = TEncoding::ToUnicode(chSrc);
606  dest.Add(chDest); }
607  return srcCount;
608  }
609  template<typename TSrcVec, typename TDestCh>
610  size_t ToUnicode(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
611 
612  size_t ToUnicode(const TIntV& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
613  size_t ToUnicode(const TStr& src, TIntV& dest, const bool clrDest = true) const { return ToUnicode(src, 0, src.Len(), dest, clrDest); }
614 
615  // Returns the number of characters that have been successfully encoded.
616  // This does not include any replacement characters that may have been inserted into 'dest'.
617  template<typename TSrcVec, typename TDestVec>
618  size_t FromUnicode(
619  const TSrcVec& src, size_t srcIdx, const size_t srcCount,
620  TDestVec& dest, const bool clrDest = true) const
621  {
622  typedef typename TVecElt<TDestVec>::TElement TDestCh;
623  if (clrDest) dest.Clr();
624  size_t toDo = srcCount, nEncoded = 0;
625  while (toDo-- > 0) {
626  int chSrc = (int) src[TVecIdx(srcIdx)]; srcIdx++;
627  int chDest = TEncoding::FromUnicode(chSrc);
628  if (chDest < 0) {
629  switch (errorHandling) {
630  case uehThrow: throw TUnicodeException(srcIdx - 1, chSrc, "Invalid character for encoding into " + GetName() + ".");
631  case uehAbort: return nEncoded;
632  case uehReplace: TVecElt<TDestVec>::Add(dest, TDestCh(replacementChar)); continue;
633  case uehIgnore: continue;
634  default: Fail; } }
635  TVecElt<TDestVec>::Add(dest, TDestCh(chDest)); nEncoded++; }
636  return nEncoded;
637  }
638 
639  template<typename TSrcVec, typename TDestVec>
640  size_t FromUnicode(const TSrcVec& src, TDestVec& dest, const bool clrDest = true) const { return FromUnicode(src, 0, src.Len(), dest, clrDest); }
641 
642  size_t UniToStr(const TIntV& src, size_t srcIdx, const size_t srcCount, TStr& dest, const bool clrDest = true) const {
643  TChA buf; size_t retVal = FromUnicode(src, srcIdx, srcCount, buf, false);
644  if (clrDest) dest += buf.CStr(); else dest = buf.CStr();
645  return retVal; }
646  size_t UniToStr(const TIntV& src, TStr& dest, const bool clrDest = true) const { return UniToStr(src, 0, src.Len(), dest, clrDest); }
647 };
648 
657 
658 //-----------------------------------------------------------------------------
659 // Various declarations used by the Unicode Character Database
660 //-----------------------------------------------------------------------------
661 
662 typedef enum TUniChCategory_
663 {
664 #define DefineUniCat(cat, c) uc ## cat = (int(uchar(c)) & 0xff)
665  DefineUniCat(Letter, 'L'), // ucLetter
666  DefineUniCat(Mark, 'M'),
667  DefineUniCat(Number, 'N'),
668  DefineUniCat(Punctuation, 'P'),
669  DefineUniCat(Symbol, 'S'),
670  DefineUniCat(Separator, 'Z'),
671  DefineUniCat(Other, 'C')
672 #undef DefineUniCat
673 }
675 
676 typedef enum TUniChSubCategory_
677 {
678 #define DefineUniSubCat(cat, subCat, c) uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff)
679  DefineUniSubCat(Letter, Uppercase, 'u'), // ucLetterUppercase
680  DefineUniSubCat(Letter, Lowercase, 'l'),
681  DefineUniSubCat(Letter, Titlecase, 't'),
682  DefineUniSubCat(Letter, Modifier, 'm'),
683  DefineUniSubCat(Letter, Other, 'o'),
684  DefineUniSubCat(Mark, Nonspacing, 'n'),
685  DefineUniSubCat(Mark, SpacingCombining, 'c'),
686  DefineUniSubCat(Mark, Enclosing, 'e'),
687  DefineUniSubCat(Number, DecimalDigit, 'd'),
688  DefineUniSubCat(Number, Letter, 'l'),
689  DefineUniSubCat(Number, Other, 'o'),
690  DefineUniSubCat(Punctuation, Connector, 'c'),
691  DefineUniSubCat(Punctuation, Dash, 'd'),
692  DefineUniSubCat(Punctuation, Open, 's'),
693  DefineUniSubCat(Punctuation, Close, 'e'),
694  DefineUniSubCat(Punctuation, InitialQuote, 'i'),
695  DefineUniSubCat(Punctuation, FinalQuote, 'f'),
696  DefineUniSubCat(Punctuation, Other, 'o'),
697  DefineUniSubCat(Symbol, Math, 'm'),
698  DefineUniSubCat(Symbol, Currency, 'c'),
699  DefineUniSubCat(Symbol, Modifier, 'k'),
700  DefineUniSubCat(Symbol, Other, 'o'),
701  DefineUniSubCat(Separator, Space, 's'),
702  DefineUniSubCat(Separator, Line, 'l'),
703  DefineUniSubCat(Separator, Paragraph, 'p'),
704  DefineUniSubCat(Other, Control, 'c'),
705  DefineUniSubCat(Other, Format, 'f'),
706  DefineUniSubCat(Other, Surrogate, 's'),
707  DefineUniSubCat(Other, PrivateUse, 'o'),
708  DefineUniSubCat(Other, NotAssigned, 'n')
709 }
711 
712 typedef enum TUniChFlags_
713 {
714  ucfCompatibilityDecomposition = 1, // if this flag is not set, the decomposition is canonical
715  ucfCompositionExclusion = 1 << 1, // from CompositionExclusions.txt
716  // Flags used when searching for word boundaries. See UAX #29.
717  ucfWbFormat = 1 << 2,
718  ucfWbKatakana = 1 << 3,
719  ucfWbALetter = 1 << 4,
720  ucfWbMidLetter = 1 << 5,
721  ucfWbMidNum = 1 << 6,
722  ucfWbNumeric = 1 << 7,
724  // Flags used with sentence boundaries (Sep is also used with word boundaries). See UAX #29.
725  ucfSbSep = 1 << 9,
726  ucfSbFormat = 1 << 10,
727  ucfSbSp = 1 << 11,
728  ucfSbLower = 1 << 12,
729  ucfSbUpper = 1 << 13,
730  ucfSbOLetter = 1 << 14,
731  ucfSbNumeric = 1 << 15,
732  ucfSbATerm = 1 << 16,
733  ucfSbSTerm = 1 << 17,
734  ucfSbClose = 1 << 18,
737  // Flags from DerivedCoreProperties.txt.
738  // [The comments are from UCD.html.]
739  // - Characters with the Alphabetic property. For more information, see Chapter 4 in [Unicode].
740  // Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
741  ucfDcpAlphabetic = 1 << 19,
742  // - For programmatic determination of default-ignorable code points.
743  // New characters that should be ignored in processing (unless explicitly supported)
744  // will be assigned in these ranges, permitting programs to correctly handle the default
745  // behavior of such characters when not otherwise supported. For more information, see
746  // UAX #29: Text Boundaries [Breaks].
747  // Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs + Noncharacters - White_Space - annotation characters
748  // [Examples: soft hyphen, zero-width space, noncharacters (e.g. U+fffe, U+ffff, U+1fffe, U+1ffff, etc.), surrogates, language tags, variation selectors]
750  // - Characters with the Lowercase property. For more information, see Chapter 4 in [Unicode].
751  // Generated from: Other_Lowercase + Ll
752  ucfDcpLowercase = 1 << 21,
753  // - For programmatic determination of grapheme cluster boundaries.
754  // For more information, see UAX #29: Text Boundaries [Breaks].
755  // Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend
757  // - For programmatic determination of grapheme cluster boundaries.
758  // For more information, see UAX #29: Text Boundaries [Breaks].
759  // Generated from: Other_Grapheme_Extend + Me + Mn
760  // Note: depending on an application's interpretation of Co (private use), they may be either
761  // in Grapheme_Base, or in Grapheme_Extend, or in neither.
763  // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.
764  ucfDcpIdStart = 1 << 24,
765  ucfDcpIdContinue = 1 << 25,
766  // - Characters with the Math property. For more information, see Chapter 4 in [Unicode].
767  // Generated from: Sm + Other_Math
768  ucfDcpMath = 1 << 26,
769  // - Characters with the Uppercase property. For more information, see Chapter 4 in [Unicode].
770  // Generated from: Lu + Other_Uppercase
771  ucfDcpUppercase = 1 << 27,
772  // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.
773  ucfDcpXidStart = 1 << 28,
774  ucfDcpXidContinue = 1 << 29,
777 }
779 
780 typedef enum TUniChProperties_
781 {
782  // The flags from PropList.txt.
783  // [The comments are from UCD.html.]
784  // - ASCII characters commonly used for the representation of hexadecimal numbers.
785  // [= 0123456789abcdefABCDEF]
787  // - Those format control characters which have specific functions in the Bidirectional Algorithm.
789  // - Those punctuation characters explicitly called out as dashes in the Unicode Standard,
790  // plus compatibility equivalents to those. Most of these have the Pd General Category,
791  // but some have the Sm General Category because of their use in mathematics.
792  // U+0002d HYPHEN-MINUS
793  // U+0058a ARMENIAN HYPHEN
794  // U+005be HEBREW PUNCTUATION MAQAF
795  // U+01806 MONGOLIAN TODO SOFT HYPHEN
796  // U+02010 HYPHEN
797  // U+02011 NON-BREAKING HYPHEN
798  // U+02012 FIGURE DASH
799  // U+02013 EN DASH
800  // U+02014 EM DASH
801  // U+02015 HORIZONTAL BAR
802  // U+02053 SWUNG DASH
803  // U+0207b SUPERSCRIPT MINUS
804  // U+0208b SUBSCRIPT MINUS
805  // U+02212 MINUS SIGN
806  // U+02e17 DOUBLE OBLIQUE HYPHEN
807  // U+0301c WAVE DASH
808  // U+03030 WAVY DASH
809  // U+030a0 KATAKANA-HIRAGANA DOUBLE HYPHEN
810  // U+0fe31 PRESENTATION FORM FOR VERTICAL EM DASH
811  // U+0fe32 PRESENTATION FORM FOR VERTICAL EN DASH
812  // U+0fe58 SMALL EM DASH
813  // U+0fe63 SMALL HYPHEN-MINUS
814  // U+0ff0d FULLWIDTH HYPHEN-MINUS
816  // - For a machine-readable list of deprecated characters. No characters will ever be removed
817  // from the standard, but the usage of deprecated characters is strongly discouraged.
819  // - Characters that linguistically modify the meaning of another character to which they apply.
820  // Some diacritics are not combining characters, and some combining characters are not diacritics.
822  // - Characters whose principal function is to extend the value or shape of a preceding alphabetic
823  // character. Typical of these are length and iteration marks.
825  // - Used in determining default grapheme cluster boundaries. For more information, see UAX #29: Text Boundaries.
827  // - Characters commonly used for the representation of hexadecimal numbers, plus their compatibility equivalents.
828  // [= AsciiHexDigit + fullwidth digit {0..9} + fullwidth latin {small|capital} letter {a..f}]
830  // - Those dashes used to mark connections between pieces of words, plus the Katakana middle dot.
831  // The Katakana middle dot functions like a hyphen, but is shaped like a dot rather than a dash.
832  // U+0002d HYPHEN-MINUS
833  // U+000ad SOFT HYPHEN
834  // U+0058a ARMENIAN HYPHEN
835  // U+01806 MONGOLIAN TODO SOFT HYPHEN
836  // U+02010 HYPHEN
837  // U+02011 NON-BREAKING HYPHEN
838  // U+02e17 DOUBLE OBLIQUE HYPHEN
839  // U+030fb KATAKANA MIDDLE DOT
840  // U+0fe63 SMALL HYPHEN-MINUS
841  // U+0ff0d FULLWIDTH HYPHEN-MINUS
842  // U+0ff65 HALFWIDTH KATAKANA MIDDLE DOT
843  ucfPrHyphen = 0x100,
844  // - Characters considered to be CJKV (Chinese, Japanese, Korean, and Vietnamese) ideographs.
846  // - Those format control characters which have specific functions for control of cursive joining and ligation.
848  // - There are a small number of characters that do not use logical order.
849  // These characters require special handling in most processing.
851  // - Code points that are permanently reserved for internal use.
853  // - Used for pattern syntax as described in UAX #31: Identifier and Pattern Syntax.
856  // - Those punctuation characters that function as quotation marks.
857  // U+00022 QUOTATION MARK
858  // U+00027 APOSTROPHE
859  // U+000ab LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
860  // U+000bb RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
861  // U+02018 LEFT SINGLE QUOTATION MARK
862  // U+02019 RIGHT SINGLE QUOTATION MARK
863  // U+0201a SINGLE LOW-9 QUOTATION MARK
864  // U+0201b SINGLE HIGH-REVERSED-9 QUOTATION MARK
865  // U+0201c LEFT DOUBLE QUOTATION MARK
866  // U+0201d RIGHT DOUBLE QUOTATION MARK
867  // U+0201e DOUBLE LOW-9 QUOTATION MARK
868  // U+0201f DOUBLE HIGH-REVERSED-9 QUOTATION MARK
869  // U+02039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
870  // U+0203a SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
871  // U+0300c LEFT CORNER BRACKET
872  // U+0300d RIGHT CORNER BRACKET
873  // U+0300e LEFT WHITE CORNER BRACKET
874  // U+0300f RIGHT WHITE CORNER BRACKET
875  // U+0301d REVERSED DOUBLE PRIME QUOTATION MARK
876  // U+0301e DOUBLE PRIME QUOTATION MARK
877  // U+0301f LOW DOUBLE PRIME QUOTATION MARK
878  // U+0fe41 PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
879  // U+0fe42 PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
880  // U+0fe43 PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
881  // U+0fe44 PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
882  // U+0ff02 FULLWIDTH QUOTATION MARK
883  // U+0ff07 FULLWIDTH APOSTROPHE
884  // U+0ff62 HALFWIDTH LEFT CORNER BRACKET
885  // U+0ff63 HALFWIDTH RIGHT CORNER BRACKET
887  // - Characters with a "soft dot", like i or j. An accent placed on these characters causes the dot to disappear.
888  // An explicit _dot above_ can be added where required, such as in Lithuanian.
889  ucfPrSoftDotted = 0x10000,
890  // - Sentence Terminal. Used in UAX #29: Text Boundaries.
891  // U+00021 EXCLAMATION MARK
892  // U+0002e FULL STOP
893  // U+0003f QUESTION MARK
894  // U+0203c DOUBLE EXCLAMATION MARK
895  // U+0203d INTERROBANG
896  // U+02047 DOUBLE QUESTION MARK
897  // U+02048 QUESTION EXCLAMATION MARK
898  // U+02049 EXCLAMATION QUESTION MARK
899  // U+03002 IDEOGRAPHIC FULL STOP
900  // [plus many characters from other writing systems]
901  ucfPrSTerm = 0x20000,
902  // - Those punctuation characters that generally mark the end of textual units.
903  // [JB note: this set contains more characters than STerm. For example, it contains
904  // the comma, colon and semicolon, whereas STerm doesn't.]
905  // U+00021 EXCLAMATION MARK
906  // U+0002c COMMA
907  // U+0002e FULL STOP
908  // U+0003a COLON
909  // U+0003b SEMICOLON
910  // U+0003f QUESTION MARK
911  // U+0203c DOUBLE EXCLAMATION MARK
912  // U+0203d INTERROBANG
913  // U+02047 DOUBLE QUESTION MARK
914  // U+02048 QUESTION EXCLAMATION MARK
915  // U+02049 EXCLAMATION QUESTION MARK
916  // [plus *lots* of characters from other writing systems]
918  // - Indicates all those characters that qualify as Variation Selectors.
919  // For details on the behavior of these characters, see StandardizedVariants.html and
920  // Section 16.4, Variation Selectors in [Unicode].
922  // - Those separator characters and control characters which should be treated by
923  // programming languages as "white space" for the purpose of parsing elements.
924  // Note: ZERO WIDTH SPACE and ZERO WIDTH NO-BREAK SPACE are not included,
925  // since their functions are restricted to line-break control.
926  // Their names are unfortunately misleading in this respect.
927  // Note: There are other senses of "whitespace" that encompass a different set of characters.
928  // [JB note: e.g. there's a BIDI class for whitespace ('WS') in UnicodeData.txt.
929  // There's also a "Sp" class in the sentence boundary algorithm, see UAX #29, sec. 5.1.]
930  // This includes the following characters:
931  // U+0009 <control>
932  // U+000a <control>
933  // U+000b <control>
934  // U+000c <control>
935  // U+000d <control>
936  // U+0020 SPACE
937  // U+0085 <control>
938  // U+00a0 NO-BREAK SPACE
939  // U+1680 OGHAM SPACE MARK
940  // U+180e MONGOLIAN VOWEL SEPARATOR
941  // U+2000 EN QUAD
942  // U+2001 EM QUAD
943  // U+2002 EN SPACE
944  // U+2003 EM SPACE
945  // U+2004 THREE-PER-EM SPACE
946  // U+2005 FOUR-PER-EM SPACE
947  // U+2006 SIX-PER-EM SPACE
948  // U+2007 FIGURE SPACE
949  // U+2008 PUNCTUATION SPACE
950  // U+2009 THIN SPACE
951  // U+200a HAIR SPACE
952  // U+2028 LINE SEPARATOR
953  // U+2029 PARAGRAPH SEPARATOR
954  // U+202f NARROW NO-BREAK SPACE
955  // U+205f MEDIUM MATHEMATICAL SPACE
956  // U+3000 IDEOGRAPHIC SPACE
957  ucfPrWhiteSpace = 0x100000
958 }
960 
961 typedef enum TUniChPropertiesX_
962 {
963  // More properties from PropList.txt.
964  // - Used to derive the properties in DerivedCoreProperties.txt.
973  // - Used in ideographic description sequences.
976  ucfPxRadical = 0x400,
978 }
980 
981 //-----------------------------------------------------------------------------
982 // TUniChInfo -- contains information about a single Unicode codepoint
983 //-----------------------------------------------------------------------------
984 
986 {
987 public:
988  enum { // combining classes (for 'combClass'); from UnicodeData.txt
989  ccStarter = 0, // 0: Spacing, split, enclosing, reordrant, and Tibetan subjoined
991  ccNuktas = 7,
994  ccFixedPositionStart = 10, // Start of fixed position classes
995  ccFixedPositionEnd = 199, // End of fixed position classes
999  ccLeftAttached = 208, // Left attached (reordrant around single base character)
1005  ccBelow = 220,
1007  ccLeft = 224, // Left (reordrant around single base character)
1008  ccRight = 226,
1010  ccAbove = 230,
1014  ccBelowIotaSubscript = 240, // Below (iota subscript)
1015  ccInvalid = 255 // not defined by Unicode
1016  };
1017  char chCat, chSubCat; // chCat + chSubCat together comprise the general category (from UnicodeData.txt)
1018  uchar combClass; // canonical combining class
1019  TUniChCategory cat; // = TUniChCategory(chCat)
1020  TUniChSubCategory subCat; // = TUniChSubCategory(cat << 8 | subCat)
1021  signed char script; // keyId into 'TUniChDb.scriptNames'; -1 if unknown
1023  int decompOffset; // offset into 'TUniChDb.decompositions'; or -1 if the character doesn't change during decomposition
1024  int nameOffset; // offset into 'TUniChDb.charNames'
1025  int flags; // a combination of TUniChFlags
1026  int properties; // a combination of TUniChProperties
1027  int propertiesX; // a combination of TUniChPropertiesX
1028  ushort lineBreak; // from LineBreak.txt
1029 
1030  // Converts a 2-letter linebreak code into a 16-bit integer.
1031  static inline ushort GetLineBreakCode(char c1, char c2) { return ((static_cast<ushort>(static_cast<uchar>(c1)) & 0xff) << 8) | ((static_cast<ushort>(static_cast<uchar>(c2)) & 0xff)); }
1033 
1034 public:
1035  void InitAfterLoad() {
1036  cat = (TUniChCategory) chCat;
1037  subCat = (TUniChSubCategory) (((static_cast<int>(static_cast<uchar>(chCat)) & 0xff) << 8) | (static_cast<int>(static_cast<uchar>(chSubCat)) & 0xff)); }
1038  void SetCatAndSubCat(const TUniChSubCategory catAndSubCat) {
1039  cat = (TUniChCategory) ((int(catAndSubCat) >> 8) & 0xff);
1040  subCat = catAndSubCat;
1041  chCat = (char) cat; chSubCat = (char) (int(subCat) & 0xff); }
1042  friend class TUniChDb;
1043 
1044  // Inexplicably missing from TSIn/TSOut...
1045  static inline void LoadUShort(TSIn& SIn, ushort& u) { SIn.LoadBf(&u, sizeof(u)); }
1046  static inline void LoadSChar(TSIn& SIn, signed char& u) { SIn.LoadBf(&u, sizeof(u)); }
1047  static inline void SaveUShort(TSOut& SOut, ushort u) { SOut.SaveBf(&u, sizeof(u)); }
1048  static inline void SaveSChar(TSOut& SOut, signed char u) { SOut.SaveBf(&u, sizeof(u)); }
1049 
1050 public:
1051  void Save(TSOut& SOut) const {
1052  SOut.Save(chCat); SOut.Save(chSubCat); SOut.Save(combClass); SaveSChar(SOut, script);
1053  SOut.Save(simpleUpperCaseMapping); SOut.Save(simpleLowerCaseMapping); SOut.Save(simpleTitleCaseMapping);
1054  SOut.Save(decompOffset); SOut.Save(nameOffset);
1055  SOut.Save(flags); SOut.Save(properties); SOut.Save(propertiesX); SaveUShort(SOut, lineBreak); }
1056  void Load(TSIn& SIn) {
1057  SIn.Load(chCat); SIn.Load(chSubCat); SIn.Load(combClass); LoadSChar(SIn, script);
1058  SIn.Load(simpleUpperCaseMapping); SIn.Load(simpleLowerCaseMapping); SIn.Load(simpleTitleCaseMapping);
1059  SIn.Load(decompOffset); SIn.Load(nameOffset);
1060  SIn.Load(flags); SIn.Load(properties); SIn.Load(propertiesX); LoadUShort(SIn, lineBreak); InitAfterLoad(); }
1061  explicit TUniChInfo(TSIn& SIn) { Load(SIn); }
1062  TUniChInfo() : chCat(char(ucOther)), chSubCat(char(ucOtherNotAssigned & 0xff)), combClass(ccInvalid),
1063  script(-1),simpleUpperCaseMapping(-1), simpleLowerCaseMapping(-1), simpleTitleCaseMapping(-1),
1064  decompOffset(-1), nameOffset(-1), flags(0), properties(0), propertiesX(0), lineBreak(LineBreak_Unknown) {
1065  InitAfterLoad(); }
1066 
1067  // DerivedCoreProperties flags.
1068  bool IsDcpFlag(const TUniChFlags flag) const { Assert((flag & ucfDcpMask) == flag); return (flags & flag) == flag; }
1069  void ClrDcpFlags() { flags = flags & ~ucfDcpMask; }
1070  void SetDcpFlag(const TUniChFlags flag) { Assert((flag & ucfDcpMask) == flag); flags |= flag; }
1071  bool IsAlphabetic() const { return IsDcpFlag(ucfDcpAlphabetic); }
1072  bool IsUppercase() const { return IsDcpFlag(ucfDcpUppercase); }
1073  bool IsLowercase() const { return IsDcpFlag(ucfDcpLowercase); }
1074  bool IsMath() const { return IsDcpFlag(ucfDcpMath); }
1076  bool IsGraphemeBase() const { return IsDcpFlag(ucfDcpGraphemeBase); }
1078  bool IsIdStart() const { return IsDcpFlag(ucfDcpIdStart); }
1079  bool IsIdContinue() const { return IsDcpFlag(ucfDcpIdContinue); }
1080  bool IsXidStart() const { return IsDcpFlag(ucfDcpXidStart); }
1081  bool IsXidContinue() const { return IsDcpFlag(ucfDcpXidContinue); }
1082 
1083  // PropList.txt flags.
1084  bool IsProperty(const TUniChProperties flag) const { return (properties & flag) == flag; }
1085  void SetProperty(const TUniChProperties flag) { properties |= flag; }
1087  bool IsBidiControl() const { return IsProperty(ucfPrBidiControl); }
1088  bool IsDash() const { return IsProperty(ucfPrDash); }
1089  bool IsDeprecated() const { return IsProperty(ucfPrDeprecated); }
1090  bool IsDiacritic() const { return IsProperty(ucfPrDiacritic); }
1091  bool IsExtender() const { return IsProperty(ucfPrExtender); }
1092  bool IsGraphemeLink() const { return IsProperty(ucfPrGraphemeLink); }
1093  bool IsHexDigit() const { return IsProperty(ucfPrHexDigit); }
1094  bool IsHyphen() const { return IsProperty(ucfPrHyphen); }
1095  bool IsIdeographic() const { return IsProperty(ucfPrIdeographic); }
1096  bool IsJoinControl() const { return IsProperty(ucfPrJoinControl); }
1100  bool IsSoftDotted() const { return IsProperty(ucfPrSoftDotted); }
1101  bool IsSTerminal() const { return IsProperty(ucfPrSTerm); }
1104  bool IsWhiteSpace() const { return IsProperty(ucfPrWhiteSpace); }
1105 
1106  // Additional PropList.txt flags.
1107  bool IsPropertyX(const TUniChPropertiesX flag) const { return (propertiesX & flag) == flag; }
1108  void SetPropertyX(const TUniChPropertiesX flag) { propertiesX |= flag; }
1109 
1110  // Miscellaneous flags.
1113 
1114  // Word-boundary flags.
1115  bool IsWbFlag(const TUniChFlags flag) const { Assert((flag & ucfWbMask) == flag); return (flags & flag) == flag; }
1116  void ClrWbAndSbFlags() { flags = flags & ~(ucfWbMask | ucfSbMask); }
1117  void SetWbFlag(const TUniChFlags flag) { Assert((flag & ucfWbMask) == flag); flags |= flag; }
1118  int GetWbFlags() const { return flags & ucfWbMask; }
1119  bool IsWbFormat() const { return IsWbFlag(ucfWbFormat); }
1121  static TStr GetWbFlagsStr(const int flags) { return TStr("") + (flags & ucfWbALetter ? "A" : "") +
1122  (flags & ucfWbFormat ? "F" : "") + (flags & ucfWbKatakana ? "K" : "") + (flags & ucfWbMidLetter ? "M" : "") +
1123  (flags & ucfWbMidNum ? "m" : "") + (flags & ucfWbNumeric ? "N" : "") + (flags & ucfWbExtendNumLet ? "E" : ""); }
1124 
1125  // Sentence-boundary flags.
1126  bool IsSbFlag(const TUniChFlags flag) const { Assert((flag & ucfSbMask) == flag); return (flags & flag) == flag; }
1127  void SetSbFlag(const TUniChFlags flag) { Assert((flag & ucfSbMask) == flag); flags |= flag; }
1128  int GetSbFlags() const { return flags & ucfSbMask; }
1129  bool IsSbFormat() const { return IsSbFlag(ucfSbFormat); }
1131  static TStr GetSbFlagsStr(const int flags) { return TStr("") + (flags & ucfSbSep ? "S" : "") +
1132  (flags & ucfSbFormat ? "F" : "") + (flags & ucfSbSp ? "_" : "") + (flags & ucfSbLower ? "L" : "") +
1133  (flags & ucfSbUpper ? "U" : "") + (flags & ucfSbOLetter ? "O" : "") + (flags & ucfSbNumeric ? "N" : "") +
1134  (flags & ucfSbATerm ? "A" : "") + (flags & ucfSbSTerm ? "T" : "") + (flags & ucfSbClose ? "C" : ""); }
1135 
1136  bool IsSbSep() const { return (flags & ucfSbSep) == ucfSbSep; }
1137 
1138  // Grapheme-boundary flags.
1139  bool IsGbExtend() const { return IsGraphemeExtend(); }
1140 
1141  // Sec. 3.13, D47: C is cased iff it is uppercase, lowercase, or general_category == titlecase_letter.
1142  bool IsCased() const { return IsUppercase() || IsLowercase() || (subCat == ucLetterTitlecase); }
1143 
1144  // Character categories.
1145  TUniChCategory GetCat() const { return (TUniChCategory) cat; }
1147  // The following characters belong to the 'symbol/currency' subcategory:
1148  // U+00024 DOLLAR SIGN
1149  // U+000a2 CENT SIGN
1150  // U+000a3 POUND SIGN
1151  // U+000a4 CURRENCY SIGN
1152  // U+000a5 YEN SIGN
1153  // U+020a3 FRENCH FRANC SIGN
1154  // U+020a4 LIRA SIGN
1155  // U+020ac EURO SIGN
1156  // [and plenty of others]
1157  bool IsCurrency() const { return subCat == ucSymbolCurrency; }
1158  // Note: most private-use and surrogate characters aren't listed explicitly in UnicodeData.txt.
1159  // Thus, it's better to call TUniChDb's versions of these methods, which are aware of
1160  // the full ranges of private-use and surrogate characters.
1161  bool IsPrivateUse() const { return subCat == ucOtherPrivateUse; }
1162  bool IsSurrogate() const { return subCat == ucOtherSurrogate; }
1163 
// Returns true iff (chCat, chSubCat) is one of the two-letter general-category
// codes listed in the table below (e.g. "Lu", "Nd", "Cn").
inline static bool IsValidSubCat(const char chCat, const char chSubCat) {
  // All valid codes, concatenated; entries start at even offsets only.
  static const char validCodes[] = "LuLlLtLmLoMnMcMeNdNlNoPcPdPsPePiPfPoSmScSkSoZsZlZpCcCfCsCoCn";
  int ofs = 0;
  while (validCodes[ofs] != '\0') {
    const bool catMatches = (validCodes[ofs] == chCat);
    if (catMatches && validCodes[ofs + 1] == chSubCat) {
      return true;
    }
    ofs += 2;
  }
  return false;
}
1169 };
1170 
1171 //-----------------------------------------------------------------------------
1172 // TUniTrie -- a trie for suffixes that should not appear at the end
1173 // of a sentence
1174 //-----------------------------------------------------------------------------
1175 
1176 template<typename TItem_>
1178 {
1179 public:
1180  typedef TItem_ TItem;
1181 protected:
1182  class TNode {
1183  public:
1184  TItem item;
1185  int child, sib;
1186  bool terminal;
1187  TNode() : child(-1), sib(-1), terminal(false) { }
1188  TNode(const TItem& item_, const int child_, const int sib_, const bool terminal_) : item(item_), child(child_), sib(sib_), terminal(terminal_) { }
1189  };
1197  TNodeV nodes;
1198 public:
1199  TUniTrie() { }
1200  void Clr() { singles.Clr(); pairs.Clr(); roots.Clr(); nodes.Clr(); }
1201 
1202  bool Empty() const { return singles.Empty() && pairs.Empty() && roots.Empty(); }
1203 
1204  bool Has1Gram(const TItem& item) const { return singles.IsKey(item); }
1205  bool Has2Gram(const TItem& last, const TItem& butLast) const { return pairs.IsKey(TItemPr(last, butLast)); }
1206  int Get3GramRoot(const TItem& last, const TItem& butLast, const TItem& butButLast) const {
1207  int keyId = roots.GetKeyId(TItemTr(last, butLast, butButLast));
1208  if (keyId < 0) return 0; else return roots[keyId]; }
1209  int GetChild(const int parentIdx, const TItem& item) const {
1210  for (int childIdx = nodes[parentIdx].child; childIdx >= 0; ) {
1211  const TNode &node = nodes[childIdx];
1212  if (node.item == item) return childIdx;
1213  childIdx = node.sib; }
1214  return -1; }
1215  bool IsNodeTerminal(const int nodeIdx) const { return nodes[nodeIdx].terminal; }
1216 
1217  // Adds a new string to the trie. Note that the last characters appear
1218  // closer to the root of the trie.
1219  template<typename TSrcVec>
1220  void Add(const TSrcVec& src, const size_t srcIdx, const size_t srcCount)
1221  {
1222  IAssert(srcCount > 0);
1223  if (srcCount == 1) { singles.AddKey(TItem(src[TVecIdx(srcIdx)])); return; }
1224  if (srcCount == 2) { pairs.AddKey(TItemPr(TItem(src[TVecIdx(srcIdx + 1)]), TItem(src[TVecIdx(srcIdx)]))); return; }
1225  size_t srcLast = srcIdx + (srcCount - 1);
1226  TItemTr tr = TItemTr(TItem(src[TVecIdx(srcLast)]), TItem(src[TVecIdx(srcLast - 1)]), TItem(src[TVecIdx(srcLast - 2)]));
1227  int keyId = roots.GetKeyId(tr), curNodeIdx = -1;
1228  if (keyId >= 0) curNodeIdx = roots[keyId];
1229  else { curNodeIdx = nodes.Add(TNode(TItem(0), -1, -1, false)); roots.AddDat(tr, curNodeIdx); }
1230  //
1231  if (srcCount > 3) for (size_t srcPos = srcLast - 3; ; )
1232  {
1233  const TItem curItem = src[TVecIdx(srcPos)];
1234  int childNodeIdx = nodes[curNodeIdx].child;
1235  while (childNodeIdx >= 0) {
1236  TNode &childNode = nodes[childNodeIdx];
1237  if (childNode.item == curItem) break;
1238  childNodeIdx = childNode.sib; }
1239  if (childNodeIdx < 0) {
1240  childNodeIdx = nodes.Add(TNode(curItem, -1, nodes[curNodeIdx].child, false));
1241  nodes[curNodeIdx].child = childNodeIdx; }
1242  curNodeIdx = childNodeIdx;
1243  if (srcPos == srcIdx) break; else srcPos--;
1244  }
1245  nodes[curNodeIdx].terminal = true;
1246  }
1247 
1248  template<typename TSrcVec>
1249  void Add(const TSrcVec& src) { Add(src, 0, (size_t) src.Len()); }
1250 };
1251 
1252 //-----------------------------------------------------------------------------
1253 // TUniChDb -- provides access to the Unicode Character Database
1254 //-----------------------------------------------------------------------------
1255 
1257 {
1258 protected:
1259  void InitAfterLoad();
1261 
1262 public:
1263  THash<TInt, TUniChInfo> h; // key: codepoint
1265  TStrIntH scripts; // keyID = used in TUniChInfo.script; key = script name; dat = number of characters (informative only)
1269  // These hash tables contain only the unconditional mappings from SpecialCasing.txt.
1270  // The conditional mappings are hardcoded into GetCaseConverted().
1272  int scriptUnknown; // = scripts.GetKey("Unknown")
1273 
1274  TUniChDb() : scriptUnknown(-1) { }
1275  explicit TUniChDb(TSIn& SIn) { Load(SIn); }
1276  void Clr() {
1277  h.Clr(); charNames.Clr(); decompositions.Clr(); inverseDec.Clr(); caseFolding.Clr();
1278  specialCasingLower.Clr(); specialCasingUpper.Clr(); specialCasingTitle.Clr();
1279  scripts.Clr(); }
1280  void Save(TSOut& SOut) const {
1281  h.Save(SOut); charNames.Save(SOut); decompositions.Save(SOut);
1282  inverseDec.Save(SOut); caseFolding.Save(SOut); scripts.Save(SOut);
1283  specialCasingLower.Save(SOut); specialCasingUpper.Save(SOut); specialCasingTitle.Save(SOut);
1284  SOut.SaveCs(); }
1285  void Load(TSIn& SIn) {
1286  h.Load(SIn); charNames.~TStrPool(); new (&charNames) TStrPool(SIn);
1287  decompositions.Load(SIn);
1288  inverseDec.Load(SIn); caseFolding.Load(SIn); scripts.Load(SIn);
1289  specialCasingLower.Load(SIn); specialCasingUpper.Load(SIn); specialCasingTitle.Load(SIn);
1290  SIn.LoadCs(); InitAfterLoad(); }
1291  void LoadBin(const TStr& fnBin) {
1292  PSIn SIn = TFIn::New(fnBin); Load(*SIn); }
1293  void Test(const TStr& basePath);
1294 
1295  // File names used by LoadTxt() and its subroutines.
1296  static TStr GetCaseFoldingFn() { return "CaseFolding.txt"; }
1297  static TStr GetSpecialCasingFn() { return "SpecialCasing.txt"; }
1298  static TStr GetUnicodeDataFn() { return "UnicodeData.txt"; }
1299  static TStr GetCompositionExclusionsFn() { return "CompositionExclusions.txt"; }
1300  static TStr GetScriptsFn() { return "Scripts.txt"; }
1301  static TStr GetDerivedCorePropsFn() { return "DerivedCoreProperties.txt"; }
1302  static TStr GetLineBreakFn() { return "LineBreak.txt"; }
1303  static TStr GetPropListFn() { return "PropList.txt"; }
1304  static TStr GetAuxiliaryDir() { return "auxiliary"; }
1305  static TStr GetWordBreakTestFn() { return "WordBreakTest.txt"; }
1306  static TStr GetWordBreakPropertyFn() { return "WordBreakProperty.txt"; }
1307  static TStr GetSentenceBreakTestFn() { return "SentenceBreakTest.txt"; }
1308  static TStr GetSentenceBreakPropertyFn() { return "SentenceBreakProperty.txt"; }
1309  static TStr GetNormalizationTestFn() { return "NormalizationTest.txt"; }
1310  static TStr GetBinFn() { return "UniChDb.bin"; } // used only by Test()
1311 
1312  //-------------------------------------------------------------------------
1313  // Script names
1314  //-------------------------------------------------------------------------
1315 
1316  // These constants are used when initializing from the text files.
1317  static TStr GetScriptNameUnknown() { return "Unknown"; }
1318  static TStr GetScriptNameKatakana() { return "Katakana"; }
1319  static TStr GetScriptNameHiragana() { return "Hiragana"; }
1320  //
1321  const TStr& GetScriptName(const int scriptId) const { return scripts.GetKey(scriptId); }
1322  int GetScriptByName(const TStr& scriptName) const { return scripts.GetKeyId(scriptName); }
1323  int GetScript(const TUniChInfo& ci) const { int s = ci.script; if (s < 0) s = scriptUnknown; return s; }
1324  int GetScript(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return scriptUnknown; else return GetScript(h[i]); }
1325 
1326  //-------------------------------------------------------------------------
1327  // Character names
1328  //-------------------------------------------------------------------------
1329 
1330  // GetCharName returns 0 if the name is unknown; GetCharNameS returns a string of the form "U+1234".
1331  const char *GetCharName(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; int ofs = h[i].nameOffset; return ofs < 0 ? 0 : charNames.GetCStr(ofs); }
1332  TStr GetCharNameS(const int cp) const {
1333  // ToDo: Add special processing for precomposed Hangul syllables (UAX #15, sec. 16).
1334  const char *p = GetCharName(cp); if (p) return p;
1335  char buf[20]; sprintf(buf, "U+%04x", cp); return TStr(buf); }
1336  template<class TSrcVec> void PrintCharNames(FILE *f, const TSrcVec& src, size_t srcIdx, const size_t srcCount, const TStr& prefix) const {
1337  if (! f) f = stdout;
1338  for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
1339  fprintf(f, "%s", prefix.CStr());
1340  int cp = src[TVecIdx(srcIdx)]; fprintf(f, (cp >= 0x10000 ? "U+%05x" : "U+%04x "), cp);
1341  fprintf(f, " %s\n", GetCharNameS(cp).CStr()); }}
1342  template<class TSrcVec> void PrintCharNames(FILE *f, const TSrcVec& src, const TStr& prefix) const { PrintCharNames(f, src, 0, src.Len(), prefix); }
1343 
1344  //-------------------------------------------------------------------------
1345  // Character information
1346  //-------------------------------------------------------------------------
1347  // These methods provide access to a subset of the functionality
1348  // available in TUniChInfo.
1349 
1350  bool IsGetChInfo(const int cp, TUniChInfo& ChInfo) {
1351  int i = h.GetKeyId(cp);
1352  if (i < 0) return false; else { ChInfo=h[i]; return true; }}
1353  TUniChCategory GetCat(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return ucOther; else return h[i].cat; }
1354  TUniChSubCategory GetSubCat(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return ucOtherNotAssigned; else return h[i].subCat; }
1355 
1356  bool IsWbFlag(const int cp, const TUniChFlags flag) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].IsWbFlag(flag); }
1357  int GetWbFlags(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetWbFlags(); }
1358  bool IsSbFlag(const int cp, const TUniChFlags flag) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].IsSbFlag(flag); }
1359  int GetSbFlags(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return 0; else return h[i].GetSbFlags(); }
1360 
// ___UniFwd1(name) expands to a const method 'bool name(const int cp)' that
// looks 'cp' up in 'h' and forwards to the method of the same name on the
// stored record; it returns false when 'cp' is not in the table.
// ___UniFwd2 .. ___UniFwd5 expand to that many ___UniFwd1 definitions.
1361 #define ___UniFwd1(name) bool name(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].name(); }
1362 #define ___UniFwd2(name1, name2) ___UniFwd1(name1) ___UniFwd1(name2)
1363 #define ___UniFwd3(name1, name2, name3) ___UniFwd2(name1, name2) ___UniFwd1(name3)
1364 #define ___UniFwd4(name1, name2, name3, name4) ___UniFwd3(name1, name2, name3) ___UniFwd1(name4)
1365 #define ___UniFwd5(name1, name2, name3, name4, name5) ___UniFwd4(name1, name2, name3, name4) ___UniFwd1(name5)
1366 
// The complete list of forwarding predicates, written as a single macro.
// NOTE(review): the line that expands this macro is not visible in this
// listing -- confirm where it is used.
1367 #define DECLARE_FORWARDED_PROPERTY_METHODS \
1368  ___UniFwd5(IsAsciiHexDigit, IsBidiControl, IsDash, IsDeprecated, IsDiacritic) \
1369  ___UniFwd5(IsExtender, IsGraphemeLink, IsHexDigit, IsHyphen, IsIdeographic) \
1370  ___UniFwd5(IsJoinControl, IsLogicalOrderException, IsNoncharacter, IsQuotationMark, IsSoftDotted) \
1371  ___UniFwd4(IsSTerminal, IsTerminalPunctuation, IsVariationSelector, IsWhiteSpace) \
1372  ___UniFwd5(IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsDefaultIgnorable) \
1373  ___UniFwd4(IsGraphemeBase, IsGraphemeExtend, IsIdStart, IsIdContinue) \
1374  ___UniFwd2(IsXidStart, IsXidContinue) \
1375  ___UniFwd3(IsCompositionExclusion, IsCompatibilityDecomposition, IsSbSep) \
1376  ___UniFwd1(IsGbExtend) \
1377  ___UniFwd2(IsCased, IsCurrency)
1378 
1380 
1381 #undef ___UniFwd1
1382 
1383  bool IsPrivateUse(const int cp) const {
1384  int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsPrivateUse();
1385  return (0xe000 <= cp && cp <= 0xf8ff) || // plane 0 private-use area
1386  // Planes 15 and 16 are entirely for private use.
1387  (0xf0000 <= cp && cp <= 0xffffd) || (0x100000 <= cp && cp <= 0x10fffd); }
1388  // Note: d800..dbff are high surrogates, dc00..dfff are low surrogates.
1389  // For db80..dbff it is clear that the surrogate pair containing this high surrogate
1390  // will refer to a private-use codepoint, but IsPrivateUse nevertheless returns false
1391  // for db80..dbff. This is consistent with the category codes assigned in UnicodeData.txt.
1392  bool IsSurrogate(const int cp) const {
1393  int i = h.GetKeyId(cp); if (i >= 0) return h[i].IsSurrogate();
1394  return 0xd800 <= cp && cp <= 0xdcff; }
1395 
1396  // Note: in particular, all Hangul characters (HangulLBase..HangulLBase + HangulLCount - 1
1397  // and HangulSBase..HangulSBase + HangulSCount - 1) should be treated as starters
1398  // for composition to work correctly.
1399  int GetCombiningClass(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return TUniChInfo::ccStarter; else return h[i].combClass; }
1400 
1401  //-------------------------------------------------------------------------
1402  // Hangul constants
1403  //-------------------------------------------------------------------------
1404 
// Base code points used by the Hangul handling (S, L, V and T bases).
// NOTE(review): the listing this excerpt comes from skips several lines here; the
// count constants referred to elsewhere (e.g. HangulLCount, HangulSCount) are
// presumably declared in the missing part of this enum -- confirm against the full file.
1405  enum {
1406  HangulSBase = 0xAC00, HangulLBase = 0x1100, HangulVBase = 0x1161, HangulTBase = 0x11A7,
1410  };
1411 
1412  //-------------------------------------------------------------------------
1413  // Word boundaries (UAX #29)
1414  //-------------------------------------------------------------------------
1415 
1416 protected:
1417  // UAX #29, rule WB3: ignore Format and Extend characters.
1418  // [Note: rule SB5 for sentence boundaries is identical, and thus these methods will also be used for sentence-boundary detection.]
1419  static bool IsWbIgnored(const TUniChInfo& ci) { return ci.IsGbExtend() || ci.IsWbFormat(); }
1420  bool IsWbIgnored(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return IsWbIgnored(h[i]); }
1421  // Sets 'position' to the smallest index from 'position..srcEnd-1' that contains a non-ignored character.
1422  template<typename TSrcVec> void WbFindCurOrNextNonIgnored(const TSrcVec& src, size_t& position, const size_t srcEnd) const {
1423  while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }
1424  // Sets 'position' to the smallest index from 'position+1..srcEnd-1' that contains a non-ignored character.
1425  template<typename TSrcVec> void WbFindNextNonIgnored(const TSrcVec& src, size_t& position, const size_t srcEnd) const {
1426  if (position >= srcEnd) return;
1427  position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }
1428  // Sets 'position' to the smallest index from 'position+1..srcEnd-1' that contains a non-ignored character.
1429  template<typename TSrcVec> void WbFindNextNonIgnoredS(const TSrcVec& src, size_t& position, const size_t srcEnd) const {
1430  if (position >= srcEnd) return;
1431  if (IsSbSep(src[TVecIdx(position)])) { position++; return; }
1432  position++; while (position < srcEnd && IsWbIgnored(src[TVecIdx(position)])) position++; }
1433  // Sets 'position' to the largest index from 'srcStart..position-1' that contains a non-ignored character.
1434  template<typename TSrcVec> bool WbFindPrevNonIgnored(const TSrcVec& src, const size_t srcStart, size_t& position) const {
1435  if (position <= srcStart) return false;
1436  while (position > srcStart) {
1437  position--; if (! IsWbIgnored(src[TVecIdx(position)])) return true; }
1438  return false; }
1439  // Test driver for WbFind*NonIgnored.
// Both overloads are only declared here; their definitions are outside this excerpt.
// NOTE(review): the overload taking 'src' presumably exercises the helpers on that
// vector and the parameterless one on built-in inputs -- confirm against the definitions.
1440  void TestWbFindNonIgnored(const TIntV& src) const;
1441  void TestWbFindNonIgnored() const;
1442 public:
1443  // Finds the next word boundary strictly after 'position'.
1444  // Note that there is a valid word boundary at 'srcIdx + srcCount'.
1445  // If there is no such word boundary, it returns 'false' and sets 'position' to 'srcIdx + srcCount'.
// 'position' is an in/out parameter: on entry it holds the index to search after,
// on return the index that was found. Only 'src[srcIdx .. srcIdx + srcCount - 1]' is considered.
1446  template<typename TSrcVec>
1447  bool FindNextWordBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const;
1448  // Creates, in 'dest', a vector of 'srcCount + 1' elements, where 'dest[i]' tells if there is a word
1449  // boundary between 'src[srcIdx + i - 1]' and 'src[srcIdx + i]'. Note that 'dest[0]' and 'dest[srcCount]' are
1450  // always set to 'true'.
1451  template<typename TSrcVec>
1452  void FindWordBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const;
1453 protected:
// Test driver shared by the word- and sentence-boundary code; declared only (definition not in this excerpt).
// NOTE(review): presumably 'sentence' selects sentence-boundary (true) vs. word-boundary (false)
// testing and 'basePath' locates the test data -- confirm against the definition.
1454  void TestFindNextWordOrSentenceBoundary(const TStr& basePath, bool sentence);
1455 
1456  //-------------------------------------------------------------------------
1457  // Sentence boundaries (UAX #29)
1458  //-------------------------------------------------------------------------
1459 
1460 protected:
1462 
1463  // Checks whether a sentence that ended at src[position - 1]
1464  // would end in one of the suffixes from sbExTrie.
// NOTE(review): presumably 'srcIdx' is the first index of 'src' that may be inspected
// when looking backwards from 'position' -- confirm against the definition.
1465  template<typename TSrcVec>
1466  bool CanSentenceEndHere(const TSrcVec& src, const size_t srcIdx, const size_t position) const;
1467 
1468 public:
1469  // Finds the next sentence boundary strictly after 'position'.
1470  // Note that there is a valid sentence boundary at 'srcIdx + srcCount'.
1471  // If there is no such sentence boundary, it returns 'false' and sets 'position' to 'srcIdx + srcCount'.
// 'position' is an in/out parameter: on entry it holds the index to search after,
// on return the index that was found. Only 'src[srcIdx .. srcIdx + srcCount - 1]' is considered.
1472  template<typename TSrcVec>
1473  bool FindNextSentenceBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const;
1474  // Creates, in 'dest', a vector of 'srcCount + 1' elements, where 'dest[i]' tells if there is a sentence
1475  // boundary between 'src[srcIdx + i - 1]' and 'src[srcIdx + i]'. Note that 'dest[0]' and 'dest[srcCount]' are
1476  // always set to 'true'.
1477  template<typename TSrcVec>
1478  void FindSentenceBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const;
1479 
1480  // These methods allow the user to define a set of sentence boundary exceptions.
1481  // This is a set of strings, stored in 'sbExTrie'. If the Unicode rules require
1482  // a sentence boundary in a position that would cause the sentence to end with
1483  // 'x (STerm | ATerm) Close* Sp* Sep?', where 'x' is a word from 'sbExTrie',
1484  // we will *not* place a sentence boundary there.
1485  //
1486  // NOTE: sbExTrie is not saved or loaded by the Save() and Load() methods.
1487  // By default, it is empty. Use SbEx_Clr() to clear it, and SbEx_SetStdEnglish() to obtain
1488  // a standard set of English-language exceptions.
1489  void SbEx_Clr() { sbExTrie.Clr(); }
1490  template<class TSrcVec> void SbEx_Add(const TSrcVec& v) { sbExTrie.Add(v); }
1491  // template<> void SbEx_Add(const TStr& s) {
1492  void SbEx_Add(const TStr& s) {
1493  TIntV v; int n = s.Len(); v.Gen(n); for (int i = 0; i < n; i++) v[i] = int(uchar(s[i])); SbEx_Add(v); }
1494  void SbEx_AddUtf8(const TStr& s) { TUniCodec codec; TIntV v; codec.DecodeUtf8(s, v); SbEx_Add(v); }
1495  int SbEx_AddMulti(const TStr& words, const bool wordsAreUtf8 = true) { TStrV vec; words.SplitOnAllCh('|', vec);
1496  for (int i = 0; i < vec.Len(); i++) if (wordsAreUtf8) SbEx_AddUtf8(vec[i]); else SbEx_Add(vec[i]);
1497  return vec.Len(); }
1498  void SbEx_Set(const TUniTrie<TInt>& newTrie) { sbExTrie = newTrie; }
// Body of SbEx_SetStdEnglish (its signature line is not visible in this excerpt).
// Clears the exception set and then adds the '|'-separated abbreviations below.
// They are added with wordsAreUtf8 == false, i.e. char by char through SbEx_Add(const TStr&);
// the value returned is the one SbEx_AddMulti returns (the number of entries).
1500  static const TStr data = "Ms|Mrs|Mr|Rev|Dr|Prof|Gov|Sen|Rep|Gen|Brig|Col|Capt|Lieut|Lt|Sgt|Pvt|Cmdr|Adm|Corp|St|Mt|Ft|e.g|e. g.|i.e|i. e|ib|ibid|s.v|s. v|s.vv|s. vv";
1501  SbEx_Clr(); return SbEx_AddMulti(data, false); }
1502 
1503  //-------------------------------------------------------------------------
1504  // Normalization, decomposition, etc. (UAX #15)
1505  //-------------------------------------------------------------------------
1506 
1507 protected:
1508  // Adds, to 'dest', the decomposition of 'codePoint' (calling itself recursively if necessary).
1509  // If 'compatibility == false', only canonical decompositions are used.
// Declared only; the definition is outside this excerpt.
1510  template<typename TDestCh>
1511  void AddDecomposition(const int codePoint, TVec<TDestCh>& dest, const bool compatibility) const;
1512 public:
1513  // This appends, to 'dest', the decomposed form of the source string.
1514  // - for normalization form D (NFD), i.e. canonical decomposition: use compatibility == false;
1515  // - for normalization form KD (NFKD), i.e. compatibility decomposition: use compatibility == true.
1516  template<typename TSrcVec, typename TDestCh>
1517  void Decompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
1518  TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const;
1519  template<typename TSrcVec, typename TDestCh>
1520  void Decompose(const TSrcVec& src, TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const {
1521  Decompose(src, 0, src.Len(), dest, compatibility, clrDest); }
1522  // This performs canonical composition on the source string, and appends
1523  // the result to the destination string. The source string should be the
1524  // result of a (canonical or compatibility) decomposition; if this is the
1525  // case, the composition will lead to a normalization form C (NFC) or
1526  // normalization form KC (NFKC), depending on whether canonical or compatibility
1527  // decomposition was used.
1528  template<typename TSrcVec, typename TDestCh>
1529  void Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
1530  TVec<TDestCh>& dest, bool clrDest = true) const;
1531  template<typename TSrcVec, typename TDestCh>
1532  void Compose(const TSrcVec& src, TVec<TDestCh>& dest, bool clrDest = true) const {
1533  Compose(src, 0, src.Len(), dest, clrDest); }
1534  // Calls Decompose, followed by Compose; thus the result is the NFC (if
1535  // compatibility == false) or NFKC (if compatibility == true) of the source string.
1536  // A temporary TIntV is used to contain the intermediate NF(K)D form of the
1537  // source string.
1538  template<typename TSrcVec, typename TDestCh>
1539  void DecomposeAndCompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
1540  TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const;
1541  template<typename TSrcVec, typename TDestCh>
1542  void DecomposeAndCompose(const TSrcVec& src, TVec<TDestCh>& dest, bool compatibility, bool clrDest = true) const {
1543  DecomposeAndCompose(src, 0, src.Len(), dest, compatibility, clrDest); }
1544  // Copies the starter characters from 'src' to 'dest'; the other
1545  // characters are skipped. 'src' should already have been decomposed.
1546  // Returns the number of characters extracted.
1547  template<typename TSrcVec, typename TDestCh>
1548  size_t ExtractStarters(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
1549  TVec<TDestCh>& dest, bool clrDest = true) const;
1550  template<typename TSrcVec, typename TDestCh>
1551  size_t ExtractStarters(const TSrcVec& src, TVec<TDestCh>& dest, bool clrDest = true) const {
1552  return ExtractStarters(src, 0, src.Len(), dest, clrDest); }
1553  // Extracts the starters into a temporary vector and then copies it into 'src'.
1554  template<typename TSrcVec>
1555  size_t ExtractStarters(TSrcVec& src) const {
1556  TIntV temp; size_t retVal = ExtractStarters(src, temp);
1557  src.Clr(); for (int i = 0; i < temp.Len(); i++) src.Add(temp[i]);
1558  return retVal; }
1559 
1560 protected:
// Test driver for the normalization code; declared only (definition not in this excerpt).
// NOTE(review): presumably 'basePath' is the directory holding the test data -- confirm.
1561  void TestComposition(const TStr& basePath);
1562 
1563  //-------------------------------------------------------------------------
1564  // Initialization from the text files
1565  //-------------------------------------------------------------------------
1566 
1567 protected:
// Per-file initialization steps; all are declared only (definitions not in this excerpt).
// NOTE(review): judging by the names, each one presumably reads one of the Unicode
// Character Database text files found under 'basePath' -- confirm against the definitions.
1568  void InitWordAndSentenceBoundaryFlags(const TStr& basePath);
1569  void InitScripts(const TStr& basePath);
1570  void InitLineBreaks(const TStr& basePath);
1571  void InitDerivedCoreProperties(const TStr& basePath);
1572  void InitPropList(const TStr& basePath);
1573  void InitSpecialCasing(const TStr& basePath);
1575 public:
// Public entry points; declared only.
// NOTE(review): presumably LoadTxt builds the database from the text files under 'basePath'
// and SaveBin writes it to the binary file 'fnBinUcd' -- confirm against the definitions.
1576  void LoadTxt(const TStr& basePath);
1577  void SaveBin(const TStr& fnBinUcd);
1578 
1579  //-------------------------------------------------------------------------
1580  // Case conversions
1581  //-------------------------------------------------------------------------
1582 
1583 public:
// Kinds of case conversion understood by the Get*CaseConverted family.
// ccMax is one past the last real kind (presumably used as a count -- confirm).
typedef enum TCaseConversion_ {
  ccLower = 0,
  ccUpper = 1,
  ccTitle = 2,
  ccMax = 3
} TCaseConversion;
1585  // Appends the case-converted form of 'src' to 'dest'.
1586  // 'how' defines what kind of case conversion is required.
1587  // 'turkic' should be set to true iff the text is in Turkic ('tr') or Azeri ('ar').
1588  // 'lithuanian' should be set to true iff the text is in Lithuanian ('lt').
1589  template<typename TSrcVec, typename TDestCh> void GetCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how, const bool turkic, const bool lithuanian) const;
1590  template<typename TSrcVec, typename TDestCh> void GetLowerCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower, turkic, lithuanian); }
1591  template<typename TSrcVec, typename TDestCh> void GetUpperCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper, turkic, lithuanian); }
1592  template<typename TSrcVec, typename TDestCh> void GetTitleCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle, turkic, lithuanian); }
1593  template<typename TSrcVec, typename TDestCh> void GetLowerCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetLowerCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
1594  template<typename TSrcVec, typename TDestCh> void GetUpperCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetUpperCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
1595  template<typename TSrcVec, typename TDestCh> void GetTitleCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool turkic = false, const bool lithuanian = false) const { GetTitleCase(src, 0, src.Len(), dest, clrDest, turkic, lithuanian); }
1596 
1597  // GetSimpleCaseConverted uses only the simple case mappings (from UnicodeData.txt).
1598  // This is simpler and faster. Since each character now maps into exactly one
1599  // character, case conversion can also be done in place (see ToSimpleCaseConverted, etc.).
1600  template<typename TSrcVec, typename TDestCh> void GetSimpleCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how) const;
1601  template<typename TSrcVec, typename TDestCh> void GetSimpleLowerCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccLower); }
1602  template<typename TSrcVec, typename TDestCh> void GetSimpleUpperCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccUpper); }
1603  template<typename TSrcVec, typename TDestCh> void GetSimpleTitleCase(const TSrcVec& src, size_t srcIdx, const size_t srcCount, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleCaseConverted(src, srcIdx, srcCount, dest, clrDest, ccTitle); }
1604  template<typename TSrcVec, typename TDestCh> void GetSimpleLowerCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleLowerCase(src, 0, src.Len(), dest, clrDest); }
1605  template<typename TSrcVec, typename TDestCh> void GetSimpleUpperCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleUpperCase(src, 0, src.Len(), dest, clrDest); }
1606  template<typename TSrcVec, typename TDestCh> void GetSimpleTitleCase(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true) const { GetSimpleTitleCase(src, 0, src.Len(), dest, clrDest); }
1607 
1608  template<typename TSrcVec> void ToSimpleCaseConverted(TSrcVec& src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const;
1609  template<typename TSrcVec> void ToSimpleUpperCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccUpper); }
1610  template<typename TSrcVec> void ToSimpleLowerCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccLower); }
1611  template<typename TSrcVec> void ToSimpleTitleCase(TSrcVec& src, size_t srcIdx, const size_t srcCount) const { ToSimpleCaseConverted(src, srcIdx, srcCount, ccTitle); }
1612  template<typename TSrcVec> void ToSimpleUpperCase(TSrcVec& src) const { ToSimpleUpperCase(src, 0, src.Len()); }
1613  template<typename TSrcVec> void ToSimpleLowerCase(TSrcVec& src) const { ToSimpleLowerCase(src, 0, src.Len()); }
1614  template<typename TSrcVec> void ToSimpleTitleCase(TSrcVec& src) const { ToSimpleTitleCase(src, 0, src.Len()); }
1615 
1616 public:
1617  friend class TUniCaseFolding;
1618 
1619  // Case folding is an alternative to the above functions. It is intended primarily
1620  // to produce strings that are suitable for comparisons. For example,
1621  // ToLowerCase(sigma) = sigma, ToLowerCase(final-sigma) = final-sigma;
1622  // but ToCaseFolder(sigma) = sigma, ToCaseFolded(final-sigma) = sigma.
1623  // - 'turkic' enables special processing for Turkic languages (I-dot and i-dotless).
1624  // - 'full' enables full case mappings -- i.e. sometimes a character may be mapped
1625  // into a string of two or more characters.
1626  // - Note: For best results, perform NFD(CaseFold(NFD(x)) or NFKD(CaseFold(NFKD(x)) on
1627  // each string before comparing them (see sec. 3.13 of the standard).
1628  template<typename TSrcVec, typename TDestCh>
1629  void GetCaseFolded(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
1630  TVec<TDestCh>& dest, const bool clrDest, const bool full, const bool turkic = false) const { caseFolding.Fold(src, srcIdx, srcCount, dest, clrDest, full, turkic); }
1631  template<typename TSrcVec, typename TDestCh>
1632  void GetCaseFolded(const TSrcVec& src, TVec<TDestCh>& dest, const bool clrDest = true, const bool full = true, const bool turkic = false) const {
1633  GetCaseFolded(src, 0, src.Len(), dest, clrDest, full, turkic); }
1634  // ToCaseFolded folds the string in place. However, this means that only the simple
1635  // case foldings can be used (the full ones could increase the length of the string).
1636  template<typename TSrcVec> void ToCaseFolded(TSrcVec& src, size_t srcIdx, const size_t srcCount, const bool turkic = false) const { caseFolding.FoldInPlace(src, srcIdx, srcCount, turkic); }
1637  template<typename TSrcVec> void ToCaseFolded(TSrcVec& src, const bool turkic = false) const { ToCaseFolded(src, 0, src.Len(), turkic); }
1638 
1639 protected:
// Test drivers for the case-conversion code; declared only (definitions not in this excerpt).
// NOTE(review): presumably 'trueLc' / 'trueTc' / 'trueUc' are the expected lower-, title-
// and upper-case forms of 'source' -- confirm against the definition.
1640  void TestCaseConversion(const TStr& source, const TStr& trueLc, const TStr& trueTc, const TStr& trueUc, bool turkic, bool lithuanian);
1641  void TestCaseConversions();
1642 
1643  //-------------------------------------------------------------------------
1644  // Text file reader for the Unicode character database
1645  //-------------------------------------------------------------------------
1646 
1647 protected:
1648 
1650  {
1651  protected:
1653  public:
1654  TChA comment; // contains '#' and everything after it
1655  protected:
1656  FILE *f;
1658  int GetCh() {
1659  if (putBackCh >= 0) { int c = putBackCh; putBackCh = EOF; return c; }
1660  return fgetc(f); }
1661  void PutBack(int c) { Assert(putBackCh == EOF); putBackCh = c; }
1662  // Returns 'false' iff the EOF was encountered before anything was read.
1663  bool ReadNextLine() {
1664  buf.Clr(); comment.Clr();
1665  bool inComment = false, first = true;
1666  while (true) {
1667  int c = GetCh();
1668  if (c == EOF) return ! first;
1669  else if (c == 13) {
1670  c = GetCh(); if (c != 10) PutBack(c);
1671  return true; }
1672  else if (c == 10) return true;
1673  else if (c == '#') inComment = true;
1674  if (! inComment) buf += char(c);
1675  else comment += char(c); }
1676  /*first = false;*/}
1677  private:
1680  public:
1681  TUcdFileReader() : f(0) { }
1682  TUcdFileReader(const TStr& fileName) : f(0), putBackCh(EOF) { Open(fileName); }
1683  void Open(const TStr& fileName) { Close(); f = fopen(fileName.CStr(), "rt"); IAssertR(f, fileName); putBackCh = EOF; }
1684  void Close() { putBackCh = EOF; if (f) { fclose(f); f = 0; }}
1686  bool GetNextLine(TStrV& dest) {
1687  dest.Clr();
1688  while (true) {
1689  if (! ReadNextLine()) return false;
1690  TStr line = buf; line.ToTrunc();
1691  if (line.Len() <= 0) continue;
1692  line.SplitOnAllCh(';', dest, false);
1693  for (int i = 0; i < dest.Len(); i++) dest[i].ToTrunc();
1694  return true; }}
1695  static int ParseCodePoint(const TStr& s) {
1696  int c; bool ok = s.IsHexInt(true, 0, 0x10ffff, c); IAssertR(ok, s); return c; }
1697  static void ParseCodePointList(const TStr& s, TIntV& dest, bool ClrDestP = true) { // space-separated list
1698  if (ClrDestP) dest.Clr();
1699  TStrV parts; s.SplitOnWs(parts);
1700  for (int i = 0; i < parts.Len(); i++) {
1701  int c; bool ok = parts[i].IsHexInt(true, 0, 0x10ffff, c); IAssertR(ok, s);
1702  dest.Add(c); } }
1703  static void ParseCodePointRange(const TStr& s, int& from, int &to) { // xxxx or xxxx..yyyy
1704  int i = s.SearchStr(".."); if (i < 0) { from = ParseCodePoint(s); to = from; return; }
1705  from = ParseCodePoint(s.GetSubStr(0, i - 1));
1706  to = ParseCodePoint(s.GetSubStr(i + 2, s.Len() - 1)); }
1707  };
1708 
1709  //-------------------------------------------------------------------------
1710  // Helper class for processing the text files
1711  //-------------------------------------------------------------------------
1712  // Files such as DerivedCoreProps.txt often refer to ranges of codepoints,
1713  // and not all codepoints from the range have also been listed in
1714  // UnicodeData.txt. Thus, new TUniChInfo instances will be created
1715  // when processing DerivedCoreProps.txt and similar files.
1716  // To assign the correct (sub)categories to these new codepoints,
1717  // the following class will extract the subcategory info from the
1718  // comments in DerivedCoreProps.txt and similar files.
1719 
1721  {
1722  public:
1726 
1727  TSubcatHelper(TUniChDb &owner_) : owner(owner_) { }
1728 
// Body of the comment-processing method (its signature line is not visible in this excerpt).
// Resets hasCat/subCat, then inspects reader.comment. A comment longer than three
// characters must start with "# " (asserted); the two characters that follow are taken as
// a category code, which must be followed by whitespace if the comment goes on (asserted).
1730  {
1731  hasCat = false; subCat = ucOtherNotAssigned;
1732  if (reader.comment.Len() > 3)
1733  {
1734  IAssert(reader.comment[0] == '#');
1735  IAssert(reader.comment[1] == ' ');
1736  char chCat = reader.comment[2], chSubCat = reader.comment[3];
1737  if (reader.comment.Len() > 4) IAssert(isspace(uchar(reader.comment[4])));
// A valid code is packed as (category char << 8) | subcategory char and remembered;
// an invalid one is only recorded in invalidCatCodes.
1738  if (TUniChInfo::IsValidSubCat(chCat, chSubCat)) {
1739  hasCat = true; subCat = (TUniChSubCategory) ((int(uchar(chCat)) << 8) | (int(uchar(chSubCat)))); }
1740  else invalidCatCodes.AddKey(TStr(chCat) + TStr(chSubCat));
1741  }
1742  }
1743 
1744  void SetCat(const int cp) {
1745  int i = owner.h.GetKeyId(cp); IAssert(i >= 0);
1746  IAssert(owner.h[i].subCat == ucOtherNotAssigned);
1747  IAssert(hasCat);
1748  owner.h[i].SetCatAndSubCat(subCat); }
1749  void TestCat(const int cp) {
1750  if (! hasCat) return;
1751  int i = owner.h.GetKeyId(cp); IAssert(i >= 0);
1752  IAssert(owner.h[i].subCat == subCat); }
1753 
// Body of the clean-up method (its signature line is not visible in this excerpt).
// The code "L&" is removed from the set of invalid category codes, i.e. it is treated
// as expected; any codes that remain are printed to stdout.
1755  {
1756  if (invalidCatCodes.IsKey("L&")) invalidCatCodes.DelKey("L&");
1757  // Output any unexpected ones (there shouldn't be any).
1758  if (! invalidCatCodes.Empty()) {
1759  printf("Invalid cat code(s) in the comments: ");
1760  for (int i = invalidCatCodes.FFirstKeyId(); invalidCatCodes.FNextKeyId(i); )
1761  printf(" \"%s\"", invalidCatCodes.GetKey(i).CStr());
1762  printf("\n"); }
1763  }
1764  };
1765 };
1766 
1767 //-----------------------------------------------------------------------------
1768 // TUnicode -- a sadly emasculated wrapper around TUniCodec and TUniChDb
1769 //-----------------------------------------------------------------------------
1770 
1772 {
1773 public:
1776 
1777  TUnicode() { Init(); }
1778  explicit TUnicode(const TStr& fnBinUcd) { ucd.LoadBin(fnBinUcd); Init(); }
1779  void Init() { InitCodecs(); }
1780 
1781  //-----------------------------------------------------------------------
1782  // UTF-8
1783  //-----------------------------------------------------------------------
1784 
1785  // Returns the number of characters that have been successfully decoded.
1786  // This does not include any replacement characters that may have been inserted into 'dest'.
1787  int DecodeUtf8(const TIntV& src, TIntV& dest) const { return (int) codec.DecodeUtf8(src, dest); }
1788  int DecodeUtf8(const TStr& src, TIntV& dest) const { return (int) codec.DecodeUtf8(src, dest); }
1789 
1790  // Returns the number of characters that have been successfully encoded.
1791  // This does not include any replacement characters that may have been inserted into 'dest'.
1792  int EncodeUtf8(const TIntV& src, TIntV& dest) const { return (int) codec.EncodeUtf8(src, dest); }
1793 
1794  // The following wrapper around the UTF-8 encoder returns a TStr containing
1795  // the UTF-8-encoded version of the input string.
1796  TStr EncodeUtf8Str(const TIntV& src) const { return codec.EncodeUtf8Str(src); }
1797 
1798  // encoding one character to UTF8
// Both are static and only declared here (definitions not in this excerpt).
// NOTE(review): presumably the first overload writes the encoding of 'Ch' into 'Dest'
// and the second returns it as a string -- confirm whether 'Dest' is appended to or replaced.
1799  static void EncodeUtf8(const uint& Ch, TChA& Dest);
1800  static TStr EncodeUtf8(const uint& Ch);
1801 
1802  //-----------------------------------------------------------------------
1803  // UTF-16 Decoder
1804  //-----------------------------------------------------------------------
1805 
1806  // Returns the number of characters that have been successfully decoded.
1807  // This does not include any replacement characters that may have been inserted into 'dest'.
1808  // Each element of 'src' is assumed to contain one byte of data.
1809  // srcCount must be even (though srcIdx doesn't need to be).
1810  int DecodeUtf16FromBytes(const TIntV& src, TIntV& dest,
1811  const TUtf16BomHandling bomHandling = bomAllowed,
1812  const TUniByteOrder defaultByteOrder = boMachineEndian) const {
1813  return (int) codec.DecodeUtf16FromBytes(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }
1814 
1815  // Here, each element of 'src' is treated as a 16-bit word. The byte-order settings
1816  // are used to determine if the two bytes of each word should be swapped before further
1817  // processing. For example, if a BOM is present, it must have the value 0xfeff; if it
1818  // actually has the value 0xfffe, this means that the two bytes of each word must be swapped.
1819  // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the
1820  // beginning of the source data is used to determine the "original" byte order of the data;
1821  // if this doesn't match the byte order of the local machine, the two bytes of each word will
1822  // be swapped during the decoding process.
1823  int DecodeUtf16FromWords(const TIntV& src, TIntV& dest,
1824  const TUtf16BomHandling bomHandling = bomAllowed,
1825  const TUniByteOrder defaultByteOrder = boMachineEndian) const {
1826  return (int) codec.DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }
1827 
1828  //-----------------------------------------------------------------------
1829  // UTF-16 Encoder
1830  //-----------------------------------------------------------------------
1831 
1832  // Returns the number of characters that have been successfully encoded.
1833  // This does not include any replacement characters that may have been inserted into 'dest'.
1834  int EncodeUtf16ToWords(const TIntV& src, TIntV& dest, const bool insertBom,
1835  const TUniByteOrder destByteOrder = boMachineEndian) const {
1836  return (int) codec.EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }
1837 
1838  int EncodeUtf16ToBytes(const TIntV& src, TIntV& dest, const bool insertBom,
1839  const TUniByteOrder destByteOrder = boMachineEndian) const {
1840  return (int) codec.EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }
1841 
1842  //-----------------------------------------------------------------------
1843  // 8-bit codecs
1844  //-----------------------------------------------------------------------
1845 
1854 
1855  //-----------------------------------------------------------------------
1856  // Codec registry
1857  //-----------------------------------------------------------------------
1858  // If you know you'll need ISO-8859-2, just use
1859  // TUnicode unicode;
1860  // unicode.iso8859_2.Encode(...);
1861  // If you don't know what you'll need, use:
1862  // TUnicode unicode;
1863  // PCodecBase myCodec = unicode.GetCodec(myCodecName);
1864  // myCodec->Encode(...);
1865  // Note that the first approach is slightly more efficient because there
1866  // aren't any virtual method calls involved.
1867 
1868 protected:
1870  static inline TStr NormalizeCodecName(const TStr& name) {
1871  TStr s = name.GetLc(); s.ChangeStrAll("_", ""); s.ChangeStrAll("-", ""); return s; }
1872 public:
1873  void RegisterCodec(const TStr& nameList, const PCodecBase& codec) {
1874  TStrV names; nameList.SplitOnWs(names);
1875  for (int i = 0; i < names.Len(); i++)
1876  codecs.AddDat(NormalizeCodecName(names[i]), codec); }
1877  void UnregisterCodec(const TStr& nameList) {
1878  TStrV names; nameList.SplitOnWs(names);
1879  for (int i = 0; i < names.Len(); i++)
1880  codecs.DelKey(NormalizeCodecName(names[i])); }
1881  void ClrCodecs() { codecs.Clr(); }
1882  void InitCodecs();
1883  PCodecBase GetCodec(const TStr& name) const {
1884  TStr s = NormalizeCodecName(name);
1885  PCodecBase p; if (! codecs.IsKeyGetDat(s, p)) p.Clr();
1886  return p; }
1887  void GetAllCodecs(TCodecBaseV& dest) const {
1888  dest.Clr();
1889  for (int i = codecs.FFirstKeyId(); codecs.FNextKeyId(i); ) {
1890  PCodecBase codec = codecs[i]; bool found = false;
1891  for (int j = 0; j < dest.Len(); j++) if (dest[j]() == codec()) { found = true; break; }
1892  if (! found) dest.Add(codec); }}
1893 
1894  //-------------------------------------------------------------------------
1895  // Word boundaries (UAX #29)
1896  //-------------------------------------------------------------------------
1897 
1898  // Finds the next word boundary strictly after 'position'.
1899  // Note that there are valid word boundaries at 0 and at 'src.Len()'.
1900  // If there is no such word boundary, it returns 'false' and sets 'position' to 'src.Len()'.
1901  bool FindNextWordBoundary(const TIntV& src, int &position) const {
1902  if (position < 0) { position = 0; return true; }
1903  size_t position_; bool retVal = ucd.FindNextWordBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; }
1904  // Creates, in 'dest', a vector of 'src.Len() + 1' elements, where 'dest[i]' tells if there is a word
1905  // boundary between 'src[i - 1]' and 'src[i]'. Note that 'dest[0]' and 'dest[src.Len()]' are
1906  // always set to 'true'.
1907  void FindWordBoundaries(const TIntV& src, TBoolV& dest) const { ucd.FindWordBoundaries(src, 0, src.Len(), dest); }
1908 
1909  //-------------------------------------------------------------------------
1910  // Sentence boundaries (UAX #29)
1911  //-------------------------------------------------------------------------
1912 
1913  // Finds the next sentence boundary strictly after 'position'.
1914  // Note that there are valid sentence boundaries at 0 and at 'src.Len()'.
1915  // If there is no such sentence boundary, it returns 'false' and sets 'position' to 'src.Len()'.
1916  bool FindNextSentenceBoundary(const TIntV& src, int &position) const {
1917  if (position < 0) { position = 0; return true; }
1918  size_t position_; bool retVal = ucd.FindNextSentenceBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; }
1919  // Creates, in 'dest', a vector of 'src.Len() + 1' elements, where 'dest[i]' tells if there is a sentence
1920  // boundary between 'src[i - 1]' and 'src[i]'. Note that 'dest[0]' and 'dest[src.Len()]' are
1921  // always set to 'true'.
1922  void FindSentenceBoundaries(const TIntV& src, TBoolV& dest) const { ucd.FindSentenceBoundaries(src, 0, src.Len(), dest); }
1923 
1926 
1927  //-------------------------------------------------------------------------
1928  // Normalization, decomposition, etc. (UAX #15)
1929  //-------------------------------------------------------------------------
1930 
1931  // This sets 'dest' to the decomposed form of the source string.
1932  // - for normalization form D (NFD), i.e. canonical decomposition: use compatibility == false;
1933  // - for normalization form KD (NFKD), i.e. compatibility decomposition: use compatibility == true.
1934  void Decompose(const TIntV& src, TIntV& dest, bool compatibility) const { ucd.Decompose(src, dest, compatibility, true); }
1935  // This performs canonical composition on the source string, and stores
1936  // the result in the destination vector. The source string should be the
1937  // result of a (canonical or compatibility) decomposition; if this is the
1938  // case, the composition will lead to a normalization form C (NFC) or
1939  // normalization form KC (NFKC), depending on whether canonical or compatibility
1940  // decomposition was used.
1941  void Compose(const TIntV& src, TIntV& dest) const { return ucd.Compose(src, dest, true); }
1942  // Calls Decompose, followed by Compose; thus the result is the NFC (if
1943  // compatibility == false) or NFKC (if compatibility == true) of the source string.
1944  // A temporary TIntV is used to contain the intermediate NF(K)D form of the
1945  // source string.
1946  void DecomposeAndCompose(const TIntV& src, TIntV& dest, bool compatibility) const { return ucd.DecomposeAndCompose(src, dest, compatibility); }
1947  // Copies the starter characters from 'src' to 'dest'; the other
1948  // characters are skipped. 'src' should already have been decomposed.
1949  // Returns the number of characters extracted. This function can be
1950  // used to remove diacritical marks from a string (after it has been decomposed!).
1951  int ExtractStarters(const TIntV& src, TIntV& dest) const { return (int) ucd.ExtractStarters(src, dest); }
1952  // Extracts the starters into a temporary vector and then copies it into 'src'.
1953  int ExtractStarters(TIntV& src) const { return (int) ucd.ExtractStarters(src); }
1954 
1955  //-------------------------------------------------------------------------
1956  // Case conversions
1957  //-------------------------------------------------------------------------
1958  // NOTE: if you will be dealing with Turkish, Azeri or Lithuanian text,
1959  // use the case-conversion methods in TUniChDb, which allow the caller
1960  // to request language-specific case mappings for these languages.
1961 
1962 public:
1964  // Sets 'dest' to the case-converted form of 'src'.
1965  void GetLowerCase(const TIntV& src, TIntV& dest) const { ucd.GetLowerCase(src, dest, true, false, false); }
1966  void GetUpperCase(const TIntV& src, TIntV& dest) const { ucd.GetUpperCase(src, dest, true, false, false); }
1967  void GetTitleCase(const TIntV& src, TIntV& dest) const { ucd.GetTitleCase(src, dest, true, false, false); }
1968 
1969  // GetSimpleCaseConverted uses only the simple case mappings (from UnicodeData.txt).
1970  // This is simpler and faster. Since each character now maps into exactly one
1971  // character, case conversion can also be done in place (see ToSimpleCaseConverted, etc.).
1972  void GetSimpleLowerCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleLowerCase(src, dest, true); }
1973  void GetSimpleUpperCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleUpperCase(src, dest, true); }
1974  void GetSimpleTitleCase(const TIntV& src, TIntV& dest) const { ucd.GetSimpleTitleCase(src, dest, true); }
1975 
1976  // These functions perform simple case-conversions in-place.
1977  void ToSimpleUpperCase(TIntV& src) const { ucd.ToSimpleUpperCase(src); }
1978  void ToSimpleLowerCase(TIntV& src) const { ucd.ToSimpleLowerCase(src); }
1979  void ToSimpleTitleCase(TIntV& src) const { ucd.ToSimpleTitleCase(src); }
1980 
1981  // Case folding is an alternative to the above functions. It is intended primarily
1982  // to produce strings that are suitable for comparisons. For example,
1983  // ToLowerCase(sigma) = sigma, ToLowerCase(final-sigma) = final-sigma;
1984  // but ToCaseFolded(sigma) = sigma, ToCaseFolded(final-sigma) = sigma.
1985  // - 'full' enables full case mappings -- i.e. sometimes a character may be mapped
1986  // into a string of two or more characters.
1987  // - Note: For best results, perform NFD(CaseFold(NFD(x)) or NFKD(CaseFold(NFKD(x)) on
1988  // each string before comparing them (see sec. 3.13 of the standard).
1989  void GetCaseFolded(const TIntV& src, TIntV& dest, const bool full = true) const { return ucd.GetCaseFolded(src, dest, true, full, false); }
1990  // ToCaseFolded folds the string in place. However, this means that only the simple
1991  // case foldings can be used (the full ones could increase the length of the string).
1992  void ToCaseFolded(TIntV& src) const { return ucd.ToCaseFolded(src, false); }
1993 
1994  TStr GetUtf8CaseFolded(const TStr& s) const {
1995  bool isAscii = true;
1996  for (int i = 0, n = s.Len(); i < n; i++) if (uchar(s[i]) >= 128) { isAscii = false; break; }
1997  if (isAscii) return s.GetLc();
1998  TIntV src; DecodeUtf8(s, src);
1999  TIntV dest; GetCaseFolded(src, dest);
2000  return EncodeUtf8Str(dest); }
2001 
2002  //-------------------------------------------------------------------------
2003  // Character properties
2004  //-------------------------------------------------------------------------
2005  // These methods simply call the corresponding TUniChDb method
2006  // (which typically calls the corresponding method of TUniChInfo).
2007  // See the declaration for DECLARE_FORWARDED_PROPERTY_METHODS for a complete list.
2008  // They are all of the form bool IsXxxx(const int cp) const
2009  // Some of the more notable ones include:
2010  // - IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsAsciiHexDigit
2011  // IsDash, IsDeprecated, IsDiacritic, IsHexDigit, IsHyphen, IsIdeographic
2012  // IsNoncharacter, IsQuotationMark, IsSoftDotted, IsTerminalPunctuation, IsWhiteSpace
2013 
// ___UniFwd1(name) declares a member 'bool name(const int cp) const' that forwards to 'ucd.name(cp)'.
// NOTE(review): the '#undef __UniFwd1' below (two leading underscores) does not match the
// '___UniFwd1' macro defined here (three underscores), so '___UniFwd1' appears to stay defined.
// '___UniFwd2' is defined outside this view; if it expands in terms of '___UniFwd1', this
// mismatch is what keeps the '___UniFwd2(...)' line below working -- confirm before changing it.
2014 #define ___UniFwd1(name) bool name(const int cp) const { return ucd.name(cp); }
2016 #undef DECLARE_FORWARDED_PROPERTY_METHODS
2017 #undef __UniFwd1
2018  ___UniFwd2(IsPrivateUse, IsSurrogate)
2019 
2020  TUniChCategory GetCat(const int cp) const { return ucd.GetCat(cp); }
2021  TUniChSubCategory GetSubCat(const int cp) const { return ucd.GetSubCat(cp); }
2022 
2023  // GetCharName returns 0 if the name is unknown; GetCharNameS returns a string of the form "U+1234".
2024  const char *GetCharName(const int cp) const { return ucd.GetCharName(cp); }
2025  TStr GetCharNameS(const int cp) const { return ucd.GetCharNameS(cp); }
2026 
2027 };
2028 
2029 //-----------------------------------------------------------------------------
2030 // TUniCodec -- UTF-8 Decoder
2031 //-----------------------------------------------------------------------------
2032 
2033 // Returns the number of characters that have been successfully decoded.
2034 // This does not include any replacement characters that may have been inserted into 'dest'.
2035 template<typename TSrcVec, typename TDestCh>
2037  const TSrcVec& src, size_t srcIdx, const size_t srcCount,
2038  TVec<TDestCh>& dest, const bool clrDest) const
2039 {
2040  size_t nDecoded = 0;
2041  if (clrDest) dest.Clr();
2042  const size_t origSrcIdx = srcIdx;
2043  const size_t srcEnd = srcIdx + srcCount;
2044  while (srcIdx < srcEnd)
2045  {
2046  const size_t charSrcIdx = srcIdx;
2047  uint c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;
2048  if ((c & _1000_0000) == 0) {
2049  // c is one of the characters 0..0x7f, encoded as a single byte.
2050  dest.Add(TDestCh(c)); nDecoded++; continue; }
2051  else if ((c & _1100_0000) == _1000_0000) {
2052  // No character in a valid UTF-8-encoded string should begin with a byte of the form 10xxxxxx.
2053  // We must have been thrown into the middle of a multi-byte character.
2054  switch (errorHandling) {
2055  case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 10xxxxxx.");
2056  case uehAbort: return nDecoded;
2057  case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
2058  case uehIgnore: continue;
2059  default: Fail; } }
2060  else
2061  {
2062  // c introduces a sequence of 2..6 bytes, depending on how many
2063  // of the most significant bits of c are set.
2064  uint nMoreBytes = 0, nBits = 0, minVal = 0;
2065  if ((c & _1110_0000) == _1100_0000) nMoreBytes = 1, nBits = 5, minVal = 0x80;
2066  else if ((c & _1111_0000) == _1110_0000) nMoreBytes = 2, nBits = 4, minVal = 0x800;
2067  else if ((c & _1111_1000) == _1111_0000) nMoreBytes = 3, nBits = 3, minVal = 0x10000;
2068  else if ((c & _1111_1100) == _1111_1000) nMoreBytes = 4, nBits = 2, minVal = 0x200000;
2069  else if ((c & _1111_1110) == _1111_1100) nMoreBytes = 5, nBits = 1, minVal = 0x4000000;
2070  else {
2071  // c is of the form 1111111x, which is invalid even in the early definitions of UTF-8
2072  // (which allowed the encoding of codepoints up to 2^31 - 1). However, in principle this
2073  // could be used to encode 32-bit integers with the msb set: 1aaabbbbccccddddeeeeffffgggghhhh
2074  // could be encoded as 1111111a 10aabbbb 10ccccdd 10ddeeee 10ffffgg 10gghhhh.
2075  if (strict) {
2076  switch (errorHandling) {
2077  case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 1111111x.");
2078  case uehAbort: return nDecoded;
2079  // In the case of uehReplace and uehIgnore, we'll read the next 5 bytes
2080  // and try to decode the character. Then, since 'strict' is true and
2081  // the codepoint is clearly >= 2^31, we'll notice this as an error later
2082  // and (in the case of uehReplace) insert a replacement character then.
2083  // This is probably better than inserting a replacement character right
2084  // away and then trying to read the next byte as if a new character
2085  // was beginning there -- if the current byte is really followed by five
2086  // 10xxxxxx bytes, we'll just get six replacement characters in a row.
2087  case uehReplace: break; //dest.Add(TDestCh(replacementChar)); continue;
2088  case uehIgnore: break; // continue;
2089  default: Fail; } }
2090  nMoreBytes = 5; nBits = 2; minVal = 0x80000000u; }
2091  // Decode this multi-byte sequence.
2092  uint cOut = c & ((1 << nBits) - 1); // First extract the nBits least significant bits from c.
2093  bool cancel = false;
2094  for (uint i = 0; i < nMoreBytes && ! cancel; i++) {
2095  // See if there are enough bytes left in the source vector.
2096  if (! (srcIdx < srcEnd)) {
2097  switch (errorHandling) {
2098  case uehThrow: throw TUnicodeException(charSrcIdx, c, TInt::GetStr(nMoreBytes) + " more bytes expected, only " + TInt::GetStr(int(srcEnd - charSrcIdx - 1)) + " available.");
2099  case uehAbort: return nDecoded;
2100  case uehReplace: dest.Add(TDestCh(replacementChar)); cancel = true; continue;
2101  case uehIgnore: cancel = true; continue;
2102  default: Fail; } }
2103  // Read the next byte.
2104  c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;
2105  if ((c & _1100_0000) != _1000_0000) { // Each subsequent byte should be of the form 10xxxxxx.
2106  switch (errorHandling) {
2107  case uehThrow: throw TUnicodeException(charSrcIdx, c, "Byte " + TInt::GetStr(i) + " of " + TInt::GetStr(nMoreBytes) + " extra bytes should begin with 10xxxxxx.");
2108  case uehAbort: return nDecoded;
2109  case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx--; cancel = true; continue;
2110  case uehIgnore: srcIdx--; cancel = true; continue;
2111  default: Fail; } }
2112  cOut <<= 6; cOut |= (c & _0011_1111); }
2113  if (cancel) continue;
2114  if (strict) {
2115  // err1: This codepoint has been represented by more bytes than it should have been.
2116  // For example, cOut in the range 0..127 should be represented by a single byte,
2117  // not by two or more bytes.
2118  // - For example, this may happen in the "modified UTF-8" sometimes used for Java
2119  // serialization, where the codepoint 0 is encoded as 11000000 10000000 to avoid
2120  // the appearance of null bytes in the encoded stream.
2121  bool err1 = (cOut < minVal);
2122  // err2: Early definitions of UTF-8 allowed any 31-bit integer to be encoded, using up to 6 bytes.
2123  // However, later this was restricted to the codepoints 0..0x10ffff only, because only these
2124  // are valid Unicode codepoints. Thus, no more than 4 bytes are ever necessary.
2125  bool err2 = (nMoreBytes > 3 || (nMoreBytes == 3 && cOut > 0x10ffff));
2126  if (err1 || err2) switch (errorHandling) {
2127  case uehThrow:
2128  if (err1) throw TUnicodeException(charSrcIdx, c, "The codepoint 0x" + TInt::GetStr(cOut, "%08x") + " has been represented by too many bytes (" + TInt::GetStr(nMoreBytes + 1) + ").");
2129  else if (err2) throw TUnicodeException(charSrcIdx, c, "Invalid multibyte sequence: it decodes into 0x" + TInt::GetStr(cOut, "%08x") + ", but only codepoints 0..0x10ffff are valid.");
2130  else { Fail; break; }
2131  case uehAbort: return nDecoded;
2132  case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
2133  case uehIgnore: continue;
2134  default: Fail; } }
2135  // Add the decoded codepoint to the destination vector.
2136  // If this is the first decoded character, and it's one of the byte-order marks
2137  // (0xfffe and 0xfeff), we will skip it (unless skipBom is false).
2138  if (! (skipBom && (cOut == 0xfffe || cOut == 0xfeff) && charSrcIdx == origSrcIdx)) {
2139  dest.Add(cOut); nDecoded++; }
2140  } // else (multi-byte sequence)
2141  } // while
2142  return nDecoded;
2143 }
2144 
2145 //-----------------------------------------------------------------------
2146 // TUniCodec -- UTF-8 Encoder
2147 //-----------------------------------------------------------------------
2148 
2149 // Returns the number of characters that have been successfully encoded.
2150 // This does not include any replacement characters that may have been inserted into 'dest'.
2151 template<typename TSrcVec, typename TDestCh>
2153  const TSrcVec& src, size_t srcIdx, const size_t srcCount,
2154  TVec<TDestCh>& dest, const bool clrDest) const
2155 {
2156  size_t nEncoded = 0;
2157  for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
2158  {
2159  uint c = uint(src[TVecIdx(srcIdx)]);
2160  bool err = false;
2161  if (strict && c > 0x10ffff) {
2162  err = true;
2163  switch (errorHandling) {
2164  case uehThrow: throw TUnicodeException(srcIdx, c, "Invalid character (0x" + TInt::GetStr(c, "%x") + "; only characters in the range 0..0x10ffff are allowed).");
2165  case uehAbort: return nEncoded;
2166  case uehReplace: c = replacementChar; break;
2167  case uehIgnore: continue;
2168  default: Fail; } }
2169  if (c < 0x80u)
2170  dest.Add(TDestCh(c & 0xffu));
2171  else if (c < 0x800u) {
2172  dest.Add(TDestCh(_1100_0000 | ((c >> 6) & _0001_1111)));
2173  dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
2174  else if (c < 0x10000u) {
2175  dest.Add(TDestCh(_1110_0000 | ((c >> 12) & _0000_1111)));
2176  dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
2177  dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
2178  else if (c < 0x200000u) {
2179  dest.Add(TDestCh(_1111_0000 | ((c >> 18) & _0000_0111)));
2180  dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
2181  dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
2182  dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
2183  else if (c < 0x4000000u) {
2184  dest.Add(TDestCh(_1111_1000 | ((c >> 24) & _0000_0011)));
2185  dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
2186  dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
2187  dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
2188  dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
2189  else {
2190  dest.Add(TDestCh(_1111_1100 | ((c >> 30) & _0000_0011)));
2191  dest.Add(TDestCh(_1000_0000 | ((c >> 24) & _0011_1111)));
2192  dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
2193  dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
2194  dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
2195  dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
2196  if (! err) nEncoded++;
2197  }
2198  return nEncoded;
2199 }
2200 
2201 //-----------------------------------------------------------------------
2202 // TUniCodec -- UTF-16 Decoder
2203 //-----------------------------------------------------------------------
2204 
2205 // Returns the number of characters that have been successfully decoded.
2206 // This does not include any replacement characters that may have been inserted into 'dest'.
2207 // Each element of 'src' is assumed to contain one byte of data.
2208 // srcCount must be even (though srcIdx doesn't need to be).
2209 template<typename TSrcVec, typename TDestCh>
2211  const TSrcVec& src, size_t srcIdx, const size_t srcCount,
2212  TVec<TDestCh>& dest, const bool clrDest,
2213  const TUtf16BomHandling bomHandling,
2214  const TUniByteOrder defaultByteOrder) const
2215 {
2216  IAssert(srcCount % 2 == 0);
2217  IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);
2218  IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);
2219  if (clrDest) dest.Clr();
2220  size_t nDecoded = 0;
2221  if (srcCount <= 0) return nDecoded;
2222  const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
2223  bool littleEndian = false;
2224  bool leDefault = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && IsMachineLittleEndian()));
2225  if (bomHandling == bomIgnored) littleEndian = leDefault;
2226  else if (bomHandling == bomAllowed || bomHandling == bomRequired)
2227  {
2228  int byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff;
2229  if (byte1 == 0xfe && byte2 == 0xff) { littleEndian = false; if (skipBom) srcIdx += 2; }
2230  else if (byte1 == 0xff && byte2 == 0xfe) { littleEndian = true; if (skipBom) srcIdx += 2; }
2231  else if (bomHandling == bomAllowed) littleEndian = leDefault;
2232  else { // Report an error.
2233  switch (errorHandling) {
2234  case uehThrow: throw TUnicodeException(srcIdx, byte1, "BOM expected at the beginning of the input vector (" + TInt::GetStr(byte1, "%02x") + " " + TInt::GetStr(byte2, "%02x") + " found instead).");
2235  case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);
2236  default: Fail; } }
2237  }
2238  else Fail;
2239  while (srcIdx < srcEnd)
2240  {
2241  const size_t charSrcIdx = srcIdx;
2242  uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;
2243  uint c = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
2244  if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)
2245  {
2246  // c is the first character in a surrogate pair. Read the next character.
2247  if (! (srcIdx + 2 <= srcEnd)) {
2248  switch (errorHandling) {
2249  case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");
2250  case uehAbort: return nDecoded;
2251  case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
2252  case uehIgnore: continue;
2253  default: Fail; } }
2254  uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;
2255  uint c2 = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
2256  // c2 should be the second character of the surrogate pair.
2257  if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {
2258  switch (errorHandling) {
2259  case uehThrow: throw TUnicodeException(charSrcIdx + 2, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + ".");
2260  case uehAbort: return nDecoded;
2261  // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character
2262  case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 2; continue;
2263  case uehIgnore: srcIdx -= 2; continue;
2264  default: Fail; } }
2265  // c and c2 each contain 10 bits of information.
2266  uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);
2267  cc += 0x10000;
2268  dest.Add(TDestCh(cc)); nDecoded++; continue;
2269  }
2270  else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
2271  switch (errorHandling) {
2272  case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");
2273  case uehAbort: return nDecoded;
2274  case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
2275  case uehIgnore: continue;
2276  default: Fail; } }
2277  // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.
2278  if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;
2279  // Otherwise, store 'c' to the destination vector.
2280  dest.Add(TDestCh(c)); nDecoded++;
2281  }
2282  return nDecoded;
2283 }
2284 
2285 // Here, each element of 'src' is treated as a 16-bit word. The byte-order settings
2286 // are used to determine if the two bytes of each word should be swapped before further
2287 // processing. For example, if a BOM is present, it must have the value 0xfeff; if it
2288 // actually has the value 0xfffe, this means that the two bytes of each word must be swapped.
2289 // Basically, the combination of the byteOrder parameter and the byte order mark (if present) at the
2290 // beginning of the source data is used to determine the "original" byte order of the data;
2291 // if this doesn't match the byte order of the local machine, the two bytes of each word will
2292 // be swapped during the decoding process.
2293 template<typename TSrcVec, typename TDestCh>
2295  const TSrcVec& src, size_t srcIdx, const size_t srcCount,
2296  TVec<TDestCh>& dest, bool clrDest,
2297  const TUtf16BomHandling bomHandling,
2298  const TUniByteOrder defaultByteOrder) const
2299 {
2300  IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);
2301  IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);
2302  if (clrDest) dest.Clr();
2303  size_t nDecoded = 0;
2304  if (srcCount <= 0) return nDecoded;
2305  const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
2306  bool swap = false;
2307  bool isMachineLe = IsMachineLittleEndian();
2308  bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe));
2309  if (bomHandling == bomIgnored) swap = (isDefaultLe != isMachineLe);
2310  else if (bomHandling == bomAllowed || bomHandling == bomRequired)
2311  {
2312  int c = uint(src[TVecIdx(srcIdx)]) & 0xffff;
2313  if (c == 0xfeff) { swap = false; if (skipBom) srcIdx += 1; }
2314  else if (c == 0xfffe) { swap = true; if (skipBom) srcIdx += 1; }
2315  else if (bomHandling == bomAllowed) swap = (isMachineLe != isDefaultLe);
2316  else { // Report an error.
2317  switch (errorHandling) {
2318  case uehThrow: throw TUnicodeException(srcIdx, c, "BOM expected at the beginning of the input vector (" + TInt::GetStr(c, "%04x") + " found instead).");
2319  case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);
2320  default: Fail; } }
2321  }
2322  else Fail;
2323  while (srcIdx < srcEnd)
2324  {
2325  const size_t charSrcIdx = srcIdx;
2326  uint c = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;
2327  if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
2328  if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)
2329  {
2330  // c is the first character in a surrogate pair. Read the next character.
2331  if (! (srcIdx < srcEnd)) {
2332  switch (errorHandling) {
2333  case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");
2334  case uehAbort: return nDecoded;
2335  case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
2336  case uehIgnore: continue;
2337  default: Fail; } }
2338  uint c2 = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;
2339  if (swap) c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8);
2340  // c2 should be the second character of the surrogate pair.
2341  if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {
2342  switch (errorHandling) {
2343  case uehThrow: throw TUnicodeException(charSrcIdx + 1, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + ".");
2344  case uehAbort: return nDecoded;
2345  // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character
2346  case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 1; continue;
2347  case uehIgnore: srcIdx -= 1; continue;
2348  default: Fail; } }
2349  // c and c2 each contain 10 bits of information.
2350  uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);
2351  cc += 0x10000;
2352  dest.Add(TDestCh(cc)); nDecoded++; continue;
2353  }
2354  else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
2355  switch (errorHandling) {
2356  case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");
2357  case uehAbort: return nDecoded;
2358  case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
2359  case uehIgnore: continue;
2360  default: Fail; } }
2361  // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.
2362  if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;
2363  // Otherwise, store 'c' to the destination vector.
2364  dest.Add(TDestCh(c)); nDecoded++;
2365  }
2366  return nDecoded;
2367 }
2368 
2369 //-----------------------------------------------------------------------
2370 // TUniCodec -- UTF-16 Encoder
2371 //-----------------------------------------------------------------------
2372 
2373 // Returns the number of characters that have been successfully encoded.
2374 // This does not include any replacement characters that may have been inserted into 'dest'.
2375 template<typename TSrcVec, typename TDestCh>
2377  const TSrcVec& src, size_t srcIdx, const size_t srcCount,
2378  TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
2379  const TUniByteOrder destByteOrder) const
2380 {
2381  bool isMachineLe = IsMachineLittleEndian();
2382  bool swap = (destByteOrder == boLittleEndian && ! isMachineLe) || (destByteOrder == boBigEndian && isMachineLe);
2383  size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
2384  if (insertBom) { dest.Add(TDestCh(swap ? 0xfffeu : 0xfeffu)); nEncoded++; }
2385  while (srcIdx < srcEnd)
2386  {
2387  uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;
2388  if (! (c <= 0x10ffffu)) {
2389  switch (errorHandling) {
2390  case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");
2391  case uehAbort: return nEncoded;
2392  case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
2393  case uehIgnore: continue;
2394  default: Fail; } }
2395  if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) {
2396  switch (errorHandling) {
2397  case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");
2398  case uehAbort: return nEncoded;
2399  case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
2400  case uehIgnore: continue;
2401  default: Fail; } }
2402  if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) {
2403  switch (errorHandling) {
2404  case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");
2405  case uehAbort: return nEncoded;
2406  case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
2407  case uehIgnore: continue;
2408  default: Fail; } }
2409  // If c is <= 0xffff, it can be stored directly.
2410  if (c <= 0xffffu) {
2411  if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
2412  dest.Add(TDestCh(c)); nEncoded++; continue; }
2413  // Otherwise, represent c by a pair of surrogate characters.
2414  c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);
2415  uint c1 = (c >> 10) & 1023, c2 = c & 1023;
2417  if (swap) {
2418  c1 = ((c1 >> 8) & 0xff) | ((c1 & 0xff) << 8);
2419  c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); }
2420  dest.Add(TDestCh(c1));
2421  dest.Add(TDestCh(c2));
2422  nEncoded++; continue;
2423  }
2424  return nEncoded;
2425 }
2426 
2427 template<typename TSrcVec, typename TDestCh>
2429  const TSrcVec& src, size_t srcIdx, const size_t srcCount,
2430  TVec<TDestCh>& dest, const bool clrDest, const bool insertBom,
2431  const TUniByteOrder destByteOrder) const
2432 {
2433  bool isDestLe = (destByteOrder == boLittleEndian || (destByteOrder == boMachineEndian && IsMachineLittleEndian()));
2434  size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
2435  if (insertBom) { dest.Add(isDestLe ? 0xff : 0xfe); dest.Add(isDestLe ? 0xfe : 0xff); nEncoded++; }
2436  while (srcIdx < srcEnd)
2437  {
2438  uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;
2439  if (! (c <= 0x10ffffu)) {
2440  switch (errorHandling) {
2441  case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");
2442  case uehAbort: return nEncoded;
2443 #define ___OutRepl if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); }
2444  case uehReplace: ___OutRepl; continue;
2445  case uehIgnore: continue;
2446  default: Fail; } }
2447  if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) {
2448  switch (errorHandling) {
2449  case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");
2450  case uehAbort: return nEncoded;
2451  case uehReplace: ___OutRepl; continue;
2452  case uehIgnore: continue;
2453  default: Fail; } }
2454  if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) {
2455  switch (errorHandling) {
2456  case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");
2457  case uehAbort: return nEncoded;
2458  case uehReplace: ___OutRepl; continue;
2459  case uehIgnore: continue;
2460  default: Fail; } }
2461 #undef ___OutRepl
2462  // If c is <= 0xffff, it can be stored directly.
2463  if (c <= 0xffffu) {
2464  if (isDestLe) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); }
2465  else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); }
2466  nEncoded++; continue; }
2467  // Otherwise, represent c by a pair of surrogate characters.
2468  c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);
2469  uint c1 = (c >> 10) & 1023, c2 = c & 1023;
2471  if (isDestLe) { dest.Add(c1 & 0xff); dest.Add((c1 >> 8) & 0xff); dest.Add(c2 & 0xff); dest.Add((c2 >> 8) & 0xff); }
2472  else { dest.Add((c1 >> 8) & 0xff); dest.Add(c1 & 0xff); dest.Add((c2 >> 8) & 0xff); dest.Add(c2 & 0xff); }
2473  nEncoded++; continue;
2474  }
2475  return nEncoded;
2476 }
2477 
2478 //-----------------------------------------------------------------------------
2479 // TUniChDb -- word boundaries
2480 //-----------------------------------------------------------------------------
2481 
2482 template<typename TSrcVec>
2483 bool TUniChDb::FindNextWordBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const
2484 {
2485  // WB1. Break at the start of text.
2486  if (position < srcIdx) { position = srcIdx; return true; }
2487  // If we are beyond the end of the text, there aren't any word breaks left.
2488  const size_t srcEnd = srcIdx + srcCount;
2489  if (position >= srcEnd) return false;
2490  // If 'position' is currently at an ignored character, move it back to the last nonignored character.
2491  size_t origPos = position;
2492  if (IsWbIgnored(src[TVecIdx(position)])) {
2493  if (! WbFindPrevNonIgnored(src, srcIdx, position))
2494  position = origPos;
2495  }
2496  // Determine the previous nonignored character (before 'position').
2497  size_t posPrev = position;
2498  if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position;
2499  // Sec 6.2. Allow a break between Sep and an ignored character.
2500  if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; }
2501  // Determine the next nonignored character (after 'position').
2502  size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd);
2503  size_t posNext2;
2504  int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1);
2505  int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1);
2506  int wbfPrev = GetWbFlags(cPrev), wbfCur = GetWbFlags(cCur), wbfNext = GetWbFlags(cNext);
2507  int cNext2, wbfNext2;
2508  //
2509  for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2,
2510  cPrev = cCur, cCur = cNext, cNext = cNext2,
2511  wbfPrev = wbfCur, wbfCur = wbfNext, wbfNext = wbfNext2)
2512  {
2513  // Should there be a word boundary between 'position' and 'posNext' (or, more accurately,
2514  // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position'
2515  // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)?
2516  posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd);
2517  cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1);
2518  wbfNext2 = GetWbFlags(cNext2);
2519 #define TestCurNext(curFlag, nextFlag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
2520 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue
2521 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
2522  // WB3. Do not break within CRLF.
2523  if (cCur == 13 && cNext == 10) continue;
2524  // WB5. Do not break between most letters.
2526  // WB6. Do not break letters across certain punctuation.
2528  // WB7. Do not break letters across certain punctuation.
2530  // WB8. Do not break within sequences of digits, or digits adjacent to letters.
2532  // WB9. Do not break within sequences of digits, or digits adjacent to letters.
2534  // WB10. Do not break within sequences of digits, or digits adjacent to letters.
2536  // WB11. Do not break within sequences, such as "3.2" or "3.456,789".
2538  // WB12. Do not break within sequences, such as "3.2" or "3.456,789".
2540  // WB13. Do not break between Katakana.
2542  // WB13a. Do not break from extenders.
2543  if ((wbfCur & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana | ucfWbExtendNumLet)) != 0 &&
2544  (wbfNext & ucfWbExtendNumLet) == ucfWbExtendNumLet) continue;
2545  // WB13b. Do not break from extenders.
2546  if ((wbfCur & ucfWbExtendNumLet) == ucfWbExtendNumLet &&
2547  (wbfNext & (ucfWbALetter | ucfWbNumeric | ucfWbKatakana)) != 0) continue;
2548  // WB14. Otherwise, break everywhere.
2549  position = posNext; return true;
2550 #undef TestCurNext
2551 #undef TestCurNext2
2552 #undef TestPrevCurNext
2553  }
2554  // WB2. Break at the end of text.
2555  IAssert(position == srcEnd);
2556  return true;
2557 }
2558 
2559 // ToDo: provide a more efficient implementation of this.
2560 template<typename TSrcVec>
2561 void TUniChDb::FindWordBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const
2562 {
2563  if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1));
2564  dest.PutAll(false);
2565  size_t position = srcIdx;
2566  dest[TVecIdx(position - srcIdx)] = true;
2567  while (position < srcIdx + srcCount)
2568  {
2569  size_t oldPos = position;
2570  FindNextWordBoundary(src, srcIdx, srcCount, position);
2571  if (oldPos >= position) {
2572  Assert(oldPos < position);
2573  }
2574  Assert(position <= srcIdx + srcCount);
2575  dest[TVecIdx(position - srcIdx)] = true;
2576  }
2577  Assert(dest[TVecIdx(srcCount)]);
2578 }
2579 
2580 //-----------------------------------------------------------------------------
2581 // TUniChDb -- sentence boundaries
2582 //-----------------------------------------------------------------------------
2583 
2584 template<typename TSrcVec>
2585 bool TUniChDb::CanSentenceEndHere(const TSrcVec& src, const size_t srcIdx, const size_t position) const
2586 {
2587  if (sbExTrie.Empty()) return true;
2588  // We'll move back from the position where a sentence-boundary is being considered.
2589  size_t pos = position;
2590  if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
2591  int c = (int) src[TVecIdx(pos)]; int sfb = GetSbFlags(c);
2592  // - Skip the Sep, if there is one.
2593  if ((c & ucfSbSep) == ucfSbSep) {
2594  if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
2595  c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
2596  // - Skip any Sp characters.
2597  while ((sfb & ucfSbSp) == ucfSbSp) {
2598  if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
2599  c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
2600  // - Skip any Close characters.
2601  while ((sfb & ucfSbSp) == ucfSbSp) {
2602  if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
2603  c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
2604  // - Skip any ATerm | STerm characters.
2605  while ((sfb & (ucfSbATerm | ucfSbSTerm)) != 0) {
2606  if (! WbFindPrevNonIgnored(src, srcIdx, pos)) return true;
2607  c = (int) src[TVecIdx(pos)]; sfb = GetSbFlags(c); }
2608  // Now start moving through the trie.
2609  int cLast = c, cButLast = -1, cButButLast = -1, len = 1, node = -1;
2610  while (true)
2611  {
2612  bool atEnd = (! WbFindPrevNonIgnored(src, srcIdx, pos));
2613  c = (atEnd ? -1 : (int) src[TVecIdx(pos)]);
2614  TUniChCategory cat = GetCat(c);
2615  if (atEnd || ! (cat == ucLetter || cat == ucNumber || cat == ucSymbol)) {
2616  // Check if the suffix we've read so far is one of those that appear in the trie.
2617  if (len == 1) return ! sbExTrie.Has1Gram(cLast);
2618  if (len == 2) return ! sbExTrie.Has2Gram(cLast, cButLast);
2619  IAssert(len >= 3); IAssert(node >= 0);
2620  if (sbExTrie.IsNodeTerminal(node)) return false;
2621  if (atEnd) return true; }
2622  if (len == 1) { cButLast = c; len++; }
2623  else if (len == 2) { cButButLast = c; len++;
2624  // Now we have read the last three characters; start descending the suitable subtrie.
2625  node = sbExTrie.Get3GramRoot(cLast, cButLast, cButButLast);
2626  if (node < 0) return true; }
2627  else {
2628  // Descend down the trie.
2629  node = sbExTrie.GetChild(node, c);
2630  if (node < 0) return true; }
2631  }
2632  //return true;
2633 }
2634 
2635 template<typename TSrcVec>
2636 bool TUniChDb::FindNextSentenceBoundary(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, size_t &position) const
2637 {
2638  // SB1. Break at the start of text.
2639  if (position < srcIdx) { position = srcIdx; return true; }
2640  // If we are beyond the end of the text, there aren't any word breaks left.
2641  const size_t srcEnd = srcIdx + srcCount;
2642  if (position >= srcEnd) return false;
2643  // If 'position' is currently at an ignored character, move it back to the last nonignored character.
2644  size_t origPos = position;
2645  if (IsWbIgnored(src[TVecIdx(position)])) {
2646  if (! WbFindPrevNonIgnored(src, srcIdx, position))
2647  position = origPos;
2648  }
2649  // Determine the previous nonignored character (before 'position').
2650  size_t posPrev = position;
2651  if (! WbFindPrevNonIgnored(src, srcIdx, posPrev)) posPrev = position;
2652  // Sec 6.2. Allow a break between Sep and an ignored character.
2653  if (position == origPos && position + 1 < srcEnd && IsSbSep(src[TVecIdx(position)]) && IsWbIgnored(src[TVecIdx(position + 1)])) { position += 1; return true; }
2654  // Determine the next nonignored character (after 'position').
2655  size_t posNext = position; WbFindNextNonIgnored(src, posNext, srcEnd);
2656  size_t posNext2;
2657  int cPrev = (posPrev < position ? (int) src[TVecIdx(posPrev)] : -1), cCur = (position < srcEnd ? (int) src[TVecIdx(position)] : -1);
2658  int cNext = (position < posNext && posNext < srcEnd ? (int) src[TVecIdx(posNext)] : -1);
2659  int sbfPrev = GetSbFlags(cPrev), sbfCur = GetSbFlags(cCur), sbfNext = GetSbFlags(cNext);
2660  int cNext2, sbfNext2;
2661  // Initialize the state of the peek-back automaton.
2662  typedef enum { stInit, stATerm, stATermSp, stATermSep, stSTerm, stSTermSp, stSTermSep } TPeekBackState;
2663  TPeekBackState backState;
2664  {
2665  size_t pos = position;
2666  bool wasSep = false, wasSp = false, wasATerm = false, wasSTerm = false;
2667  while (true)
2668  {
2669  if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break;
2670  // Skip at most one Sep.
2671  int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp);
2672  if ((sbf & ucfSbSep) == ucfSbSep) {
2673  wasSep = true;
2674  if (! WbFindPrevNonIgnored(src, srcIdx, pos)) break;
2675  cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
2676  // Skip zero or more Sp's.
2677  bool stop = false;
2678  while ((sbf & ucfSbSp) == ucfSbSp) {
2679  wasSp = true;
2680  if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; }
2681  cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
2682  if (stop) break;
2683  // Skip zero or more Close's.
2684  while ((sbf & ucfSbClose) == ucfSbClose) {
2685  if (! WbFindPrevNonIgnored(src, srcIdx, pos)) { stop = true; break; }
2686  cp = (int) src[TVecIdx(pos)]; sbf = GetSbFlags(cp); }
2687  if (stop) break;
2688  // Process an ATerm or STerm.
2689  wasATerm = ((sbf & ucfSbATerm) == ucfSbATerm);
2690  wasSTerm = ((sbf & ucfSbSTerm) == ucfSbSTerm);
2691  break;
2692  }
2693  if (wasATerm) backState = (wasSep ? stATermSep : wasSp ? stATermSp : stATerm);
2694  else if (wasSTerm) backState = (wasSep ? stSTermSep : wasSp ? stSTermSp : stSTerm);
2695  else backState = stInit;
2696  }
2697  // Initialize the state of the peek-ahead automaton. This state tells us what follows
2698  // after we skip all contiguous characters from the complement of the set {OLetter, Upper, Lower, Sep, STerm, ATerm}.
2699  // Thus, the next character is either OLetter, Upper, Lower, Sep, STerm, ATerm, or the end of the input string.
2700  // Our peek-ahead automaton must tell us whether it is Lower or something else.
2701  typedef enum { stUnknown, stLower, stNotLower } TPeekAheadState;
2702  TPeekAheadState aheadState = stUnknown;
2703  //
2704  for ( ; position < srcEnd; posPrev = position, position = posNext, posNext = posNext2,
2705  cPrev = cCur, cCur = cNext, cNext = cNext2,
2706  sbfPrev = sbfCur, sbfCur = sbfNext, sbfNext = sbfNext2)
2707  {
2708  // Should there be a word boundary between 'position' and 'posNext' (or, more accurately,
2709  // between src[posNext - 1] and src[posNext] --- any ignored characters between 'position'
2710  // and 'posNext' are considered to belong to the previous character ('position'), not to the next one)?
2711  posNext2 = posNext; WbFindNextNonIgnored(src, posNext2, srcEnd);
2712  cNext2 = (posNext < posNext2 && posNext2 < srcEnd ? (int) src[TVecIdx(posNext2)] : -1);
2713  sbfNext2 = GetSbFlags(cNext2);
2714  // Update the peek-back automaton.
2715 #define TestCur(curFlag) ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag)
2716 #define Trans(curFlag, newState) if (TestCur(curFlag)) { backState = st##newState; break; }
2717  switch (backState) {
2718  case stInit: Trans(ATerm, ATerm); Trans(STerm, STerm); break;
2719  case stATerm: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, ATerm); backState = stInit; break;
2720  case stSTerm: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); Trans(Close, STerm); backState = stInit; break;
2721  case stATermSp: Trans(Sp, ATermSp); Trans(Sep, ATermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
2722  case stSTermSp: Trans(Sp, STermSp); Trans(Sep, STermSep); Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
2723  case stATermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
2724  case stSTermSep: Trans(ATerm, ATerm); Trans(STerm, STerm); backState = stInit; break;
2725  default: IAssert(false); }
2726 #undef Trans
2727 #undef TestCur
2728  // Update the peek-ahead automaton.
2729 #define IsPeekAheadSkippable(sbf) ((sbf & (ucfSbOLetter | ucfSbUpper | ucfSbLower | ucfSbSep | ucfSbSTerm | ucfSbATerm)) == 0)
2730  if (! IsPeekAheadSkippable(sbfCur)) {
2731  bool isLower = ((sbfCur & ucfSbLower) == ucfSbLower);
2732  if (aheadState == stLower) IAssert(isLower);
2733  else if (aheadState == stNotLower) IAssert(! isLower);
2734  // We haven't peaked ahead farther than this so far -- invalidate the state.
2735  aheadState = stUnknown; }
2736  if (aheadState == stUnknown)
2737  {
2738  // Peak ahead to the next non-peekahead-skippable character.
2739  size_t pos = posNext;
2740  while (pos < srcEnd) {
2741  int cp = (int) src[TVecIdx(pos)]; int sbf = GetSbFlags(cp);
2742  if (! IsPeekAheadSkippable(sbf)) {
2743  if ((sbf & ucfSbLower) == ucfSbLower) aheadState = stLower;
2744  else aheadState = stNotLower;
2745  break; }
2746  WbFindNextNonIgnored(src, pos, srcEnd); }
2747  if (! (pos < srcEnd)) aheadState = stNotLower;
2748  }
2749 #undef IsPeekAheadSkippable
2750  //
2751 #define TestCurNext(curFlag, nextFlag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
2752 #define TestCurNext2(curFlag, nextFlag, next2Flag) if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue
2753 #define TestPrevCurNext(prevFlag, curFlag, nextFlag) if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
2754  // SB3. Do not break within CRLF.
2755  if (cCur == 13 && cNext == 10) continue;
2756  // SB4. Break ater paragraph separators.
2757  if ((sbfCur & ucfSbSep) == ucfSbSep) {
2758  if (! CanSentenceEndHere(src, srcIdx, position)) continue;
2759  position = posNext; return true; }
2760  // Do not break after ambiguous terminators like period, if they are immediately followed by a number
2761  // or lowercase letter, if they are between uppercase letters, or if the first following letter
2762  // (optionally after certain punctuation) is lowercase. For example, a period may be an abbreviation
2763  // or numeric period, and thus may not mark the end of a sentence.
2766  // SB8a. (STerm | ATerm) Close* Sp* [do not break] (STerm | ATerm)
2767  if ((backState == stATerm || backState == stATermSp || backState == stSTerm || backState == stSTermSp) &&
2768  (sbfNext & (ucfSbSTerm | ucfSbATerm)) != 0) continue;
2769  // SB8*. ATerm Close* Sp* [do not break] ( ! (OLetter | Upper | Lower | Sep | STerm | ATerm) )* Lower
2770  if ((backState == stATerm || backState == stATermSp) && aheadState == stLower) continue;
2771  // Break after sentence terminators, but include closing punctuation, trailing spaces, and a paragraph separator (if present).
2772  // SB9. ( STerm | ATerm ) Close* [do not break] ( Close | Sp | Sep )
2773  if ((backState == stATerm || backState == stSTerm) && (sbfNext & (ucfSbClose | ucfSbSp | ucfSbSep)) != 0) continue;
2774  // SB10. ( STerm | ATerm ) Close* Sp* [do not break] ( Sp | Sep )
2775  // SB11*. ( STerm | ATerm ) Close* Sp* Sep? [do break]
2776  if (backState == stATerm || backState == stATermSp || backState == stATermSep || backState == stSTerm || backState == stSTermSp || backState == stSTermSep) {
2777  if ((sbfNext & (ucfSbSp | ucfSbSep)) != 0) continue; // SB10
2778  if (! CanSentenceEndHere(src, srcIdx, position)) continue;
2779  position = posNext; return true; } // SB11
2780  // WB12. Otherwise, do not break.
2781  continue;
2782 #undef TestCurNext
2783 #undef TestCurNext2
2784 #undef TestPrevCurNext
2785  }
2786  // WB2. Break at the end of text.
2787  IAssert(position == srcEnd);
2788  return true;
2789 }
2790 
2791 // ToDo: provide a more efficient implementation of this.
2792 template<typename TSrcVec>
2793 void TUniChDb::FindSentenceBoundaries(const TSrcVec& src, const size_t srcIdx, const size_t srcCount, TBoolV& dest) const
2794 {
2795  if (size_t(dest.Len()) != srcCount + 1) dest.Gen(TVecIdx(srcCount + 1));
2796  dest.PutAll(false);
2797  size_t position = srcIdx;
2798  dest[TVecIdx(position - srcIdx)] = true;
2799  while (position < srcIdx + srcCount)
2800  {
2801  size_t oldPos = position;
2802  FindNextSentenceBoundary(src, srcIdx, srcCount, position);
2803  if (oldPos >= position) {
2804  Assert(oldPos < position);
2805  }
2806  Assert(position <= srcIdx + srcCount);
2807  dest[TVecIdx(position - srcIdx)] = true;
2808  }
2809  Assert(dest[TVecIdx(srcCount)]);
2810 }
2811 
2812 //-----------------------------------------------------------------------------
2813 // TUniChDb -- case conversions
2814 //-----------------------------------------------------------------------------
2815 
2816 template<typename TSrcVec, typename TDestCh>
2817 void TUniChDb::GetCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
2818  TVec<TDestCh>& dest, const bool clrDest,
2819  const TUniChDb::TCaseConversion how,
2820  const bool turkic, const bool lithuanian) const
2821 {
2822  const TIntIntVH &specials = (how == ccUpper ? specialCasingUpper : how == ccLower ? specialCasingLower : how == ccTitle ? specialCasingTitle : *((TIntIntVH *) 0));
2823  if (clrDest) dest.Clr();
2824  enum {
2825  GreekCapitalLetterSigma = 0x3a3,
2826  GreekSmallLetterSigma = 0x3c3,
2827  GreekSmallLetterFinalSigma = 0x3c2,
2828  LatinCapitalLetterI = 0x49,
2829  LatinCapitalLetterJ = 0x4a,
2830  LatinCapitalLetterIWithOgonek = 0x12e,
2831  LatinCapitalLetterIWithGrave = 0xcc,
2832  LatinCapitalLetterIWithAcute = 0xcd,
2833  LatinCapitalLetterIWithTilde = 0x128,
2834  LatinCapitalLetterIWithDotAbove = 0x130,
2835  LatinSmallLetterI = 0x69,
2836  CombiningDotAbove = 0x307
2837  };
2838  //
2839  bool seenCased = false, seenTwoCased = false; int cpFirstCased = -1;
2840  size_t nextWordBoundary = srcIdx;
2841  TBoolV wordBoundaries; bool wbsKnown = false;
2842  for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
2843  {
2844  int cp = src[TVecIdx(srcIdx)]; srcIdx++;
2845  //if (turkic && cp == 0x130 && how == ccLower) printf("!");
2846  // For conversion to titlecase, the first cased character of each word
2847  // must be converted to titlecase; everything else must be converted
2848  // to lowercase.
2849  TUniChDb::TCaseConversion howHere;
2850  if (how != ccTitle) howHere = how;
2851  else {
2852  if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here.
2853  seenCased = false; seenTwoCased = false; cpFirstCased = -1;
2854  size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
2855  IAssert(next > nextWordBoundary); nextWordBoundary = next; }
2856  bool isCased = IsCased(cp);
2857  if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; cpFirstCased = cp; }
2858  else { howHere = ccLower;
2859  if (isCased && seenCased) seenTwoCased = true; }
2860  }
2861  // First, process the conditional mappings from SpecialCasing.txt.
2862  // These will be processed in code -- they were ignored while
2863  // we were reading SpecialCasing.txt itself.
2864  if (cp == GreekCapitalLetterSigma && howHere == ccLower)
2865  {
2866  // SpecialCasing.txt mentions the 'FinalSigma' condition, but sec. 3.13 of
2867  // the standard doesn't define it. We'll use FinalCased instead.
2868  // FinalCased: within the closest word boundaries containing C,
2869  // there is a cased letter before C, and there is no cased letter after C.
2870  //size_t nextBoundary = srcIdx - 1; FindNextWordBoundary(src, srcIdx, srcCount, nextBoundary);
2871  if (! wbsKnown) { FindWordBoundaries(src, origSrcIdx, srcCount, wordBoundaries); wbsKnown = true; }
2872  size_t srcIdx2 = srcIdx; bool casedAfter = false;
2873  if (how == ccTitle)
2874  printf("!");
2875  //while (srcIdx2 < nextBoundary)
2876  while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)])
2877  {
2878  int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
2879  if (IsCased(cp2)) { casedAfter = true; break; }
2880  }
2881  if (! casedAfter)
2882  {
2883  //size_t prevBoundary = srcIdx - 1;
2884  //FindPreviousWordBoundary(src, srcIdx, srcCount, prevBoundary);
2885  srcIdx2 = srcIdx - 1; bool casedBefore = false;
2886  //while (prevBoundary < srcIdx2)
2887  while (! wordBoundaries[TVecIdx(srcIdx2 - origSrcIdx)])
2888  {
2889  --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
2890  if (IsCased(cp2)) { casedBefore = true; break; }
2891  }
2892  if (casedBefore) {
2893  // Now we have a FinalCased character.
2894  dest.Add(GreekSmallLetterFinalSigma); Assert(howHere == ccLower); continue; }
2895  }
2896  // If we got here, add a non-final sigma.
2897  dest.Add(GreekSmallLetterSigma); continue;
2898  }
2899  else if (lithuanian)
2900  {
2901  if (howHere == ccLower)
2902  {
2903  if (cp == LatinCapitalLetterI || cp == LatinCapitalLetterJ || cp == LatinCapitalLetterIWithOgonek)
2904  {
2905  bool moreAbove = false;
2906  for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; )
2907  {
2908  const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
2909  const int cc2 = GetCombiningClass(cp2);
2910  if (cc2 == TUniChInfo::ccStarter) break;
2911  if (cc2 == TUniChInfo::ccAbove) { moreAbove = true; break; }
2912  }
2913  if (moreAbove)
2914  {
2915  if (cp == LatinCapitalLetterI) { dest.Add(0x69); dest.Add(0x307); continue; }
2916  if (cp == LatinCapitalLetterJ) { dest.Add(0x6a); dest.Add(0x307); continue; }
2917  if (cp == LatinCapitalLetterIWithOgonek) { dest.Add(0x12f); dest.Add(0x307); continue; }
2918  }
2919  }
2920  else if (cp == LatinCapitalLetterIWithGrave) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x300); continue; }
2921  else if (cp == LatinCapitalLetterIWithAcute) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x301); continue; }
2922  else if (cp == LatinCapitalLetterIWithTilde) { dest.Add(0x69); dest.Add(0x307); dest.Add(0x303); continue; }
2923  }
2924  if (cp == CombiningDotAbove)
2925  {
2926  // Lithuanian, howHere != ccLower.
2927  // AfterSoftDotted := the last preceding character with a combining class
2928  // of zero before C was Soft_Dotted, and there is no intervening combining
2929  // character class 230 (ABOVE).
2930  bool afterSoftDotted = false;
2931  size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp'
2932  while (origSrcIdx < srcIdx2)
2933  {
2934  --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
2935  int cc2 = GetCombiningClass(cp2);
2936  if (cc2 == TUniChInfo::ccAbove) break;
2937  if (cc2 == TUniChInfo::ccStarter) {
2938  afterSoftDotted = IsSoftDotted(cp2); break; }
2939  }
2940  if (afterSoftDotted)
2941  {
2942  Assert(lithuanian);
2943  // Remove DOT ABOVE after "i" with upper or titlecase.
2944  // - Note: but this must only be done if that "i" was actually placed into uppercase (if how == ccTitle,
2945  // the "i" may have been kept lowercase and thus we shouldn't remove the dot).
2946  if (how == ccLower) { dest.Add(0x307); continue; }
2947  if (how == ccUpper) continue;
2948  Assert(how == ccTitle);
2949  Assert(howHere == ccLower); // because CombiningDotAbove is not a cased character
2950  if (seenCased && ! seenTwoCased) continue; // The "i" has been placed into uppercase; thus, remove the dot.
2951  dest.Add(0x307); continue;
2952  }
2953  }
2954  }
2955  else if (turkic) // language code 'tr' (Turkish) and 'az' (Azeri)
2956  {
2957  // I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
2958  // The following rules handle those cases.
2959  if (cp == LatinCapitalLetterIWithDotAbove) {
2960  dest.Add(howHere == ccLower ? 0x69 : 0x130); continue; }
2961  // When lowercasing, remove dot_above in the sequence I + dot_above,
2962  // which will turn into i. This matches the behavior of the
2963  // canonically equivalent I-dot_above.
2964  else if (cp == CombiningDotAbove)
2965  {
2966  // AfterI: the last preceding base character was an uppercase I,
2967  // and there is no intervening combining character class 230 (ABOVE).
2968  bool afterI = false;
2969  size_t srcIdx2 = srcIdx - 1; // now srcIdx2 is the index from which we got 'cp'
2970  while (origSrcIdx < srcIdx2)
2971  {
2972  --srcIdx2; int cp2 = src[TVecIdx(srcIdx2)];
2973  if (cp2 == LatinCapitalLetterI) { afterI = true; break; }
2974  int cc2 = GetCombiningClass(cp2);
2975  if (cc2 == TUniChInfo::ccAbove || cc2 == TUniChInfo::ccStarter) break;
2976  }
2977  if (afterI) {
2978  if (how == ccTitle && seenCased && ! seenTwoCased) {
2979  // Sec. 3.13 defines title-casing in an unusual way: find the first cased character in each word;
2980  // if found, map it to titlecase; otherwise, map all characters in that word to lowercase.
2981  // This suggests that if a cased character is found, others in that word should be left alone.
2982  // This seems unusual; we map all other characters to lowercase instead.
2983  // But this leads to problems with e.g. I followed by dot-above (U+0307): since the dot-above
2984  // is not the first cased character (it isn't even cased), we attempt to set it to lowercase;
2985  // but since afterI is also true here, this would mean deleting it. Thus our titlecased
2986  // form of "I followed by dot-above" would be just "I", which is clearly wrong.
2987  // So we treat this as a special case here.
2988  IAssert(cpFirstCased == LatinCapitalLetterI);
2989  dest.Add(0x307); continue; }
2990  if (howHere != ccLower) dest.Add(0x307);
2991  continue; }
2992  }
2993  // When lowercasing, unless an I is before a dot_above,
2994  // it turns into a dotless i.
2995  else if (cp == LatinCapitalLetterI)
2996  {
2997  // BeforeDot: C is followed by U+0307 (combining dot above).
2998  // Any sequence of characters with a combining class that is
2999  // neither 0 nor 230 may intervene between the current character
3000  // and the combining dot above.
3001  bool beforeDot = false;
3002  for (size_t srcIdx2 = srcIdx; srcIdx2 < srcEnd; )
3003  {
3004  const int cp2 = src[TVecIdx(srcIdx2)]; srcIdx2++;
3005  if (cp2 == 0x307) { beforeDot = true; break; }
3006  const int cc2 = GetCombiningClass(cp2);
3007  if (cc2 == TUniChInfo::ccStarter || cc2 == TUniChInfo::ccAbove) break;
3008  }
3009  if (! beforeDot) {
3010  dest.Add(howHere == ccLower ? 0x131 : 0x49); continue; }
3011  }
3012  // When uppercasing, i turns into a dotted capital I.
3013  else if (cp == LatinSmallLetterI)
3014  {
3015  dest.Add(howHere == ccLower ? 0x69 : 0x130); continue;
3016  }
3017  }
3018  // Try to use the unconditional mappings.
3019  const TIntIntVH &specHere = (
3020  howHere == how ? specials :
3021  howHere == ccLower ? specialCasingLower :
3022  howHere == ccTitle ? specialCasingTitle :
3023  howHere == ccUpper ? specialCasingUpper : *((TIntIntVH *) 0));
3024  int i = specHere.GetKeyId(cp);
3025  if (i >= 0) { TUniCaseFolding::AppendVector(specHere[i], dest); continue; }
3026  // Try to use the simple (one-character) mappings.
3027  i = h.GetKeyId(cp);
3028  if (i >= 0) {
3029  const TUniChInfo &ci = h[i];
3030  int cpNew = (
3031  howHere == ccLower ? ci.simpleLowerCaseMapping :
3032  howHere == ccUpper ? ci.simpleUpperCaseMapping :
3034  if (cpNew < 0) cpNew = cp;
3035  dest.Add(cpNew); continue; }
3036  // As a final resort, leave 'cp' unchanged.
3037  dest.Add(cp);
3038  }
3039 }
3040 
3041 template<typename TSrcVec, typename TDestCh>
3042 void TUniChDb::GetSimpleCaseConverted(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
3043  TVec<TDestCh>& dest, const bool clrDest, const TCaseConversion how) const
3044 {
3045  if (clrDest) dest.Clr();
3046  bool seenCased = false; size_t nextWordBoundary = srcIdx;
3047  for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; )
3048  {
3049  const int cp = src[TVecIdx(srcIdx)]; srcIdx++;
3050  int i = h.GetKeyId(cp); if (i < 0) { dest.Add(cp); continue; }
3051  const TUniChInfo &ci = h[i];
3052  // With titlecasing, the first cased character of each word must be put into titlecase,
3053  // all others into lowercase. This is what the howHere variable is for.
3054  TUniChDb::TCaseConversion howHere;
3055  if (how != ccTitle) howHere = how;
3056  else {
3057  if (srcIdx - 1 == nextWordBoundary) { // A word starts/ends here.
3058  seenCased = false;
3059  size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
3060  IAssert(next > nextWordBoundary); nextWordBoundary = next; }
3061  bool isCased = IsCased(cp);
3062  if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; }
3063  else howHere = ccLower;
3064  }
3065  int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping);
3066  if (cpNew < 0) cpNew = cp;
3067  dest.Add(cpNew);
3068  }
3069 }
3070 
3071 template<typename TSrcVec>
3072 void TUniChDb::ToSimpleCaseConverted(TSrcVec& src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const
3073 {
3074  bool seenCased = false; size_t nextWordBoundary = srcIdx;
3075  for (const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
3076  {
3077  const int cp = src[TVecIdx(srcIdx)];
3078  int i = h.GetKeyId(cp); if (i < 0) continue;
3079  const TUniChInfo &ci = h[i];
3080  // With titlecasing, the first cased character of each word must be put into titlecase,
3081  // all others into lowercase. This is what the howHere variable is for.
3082  TUniChDb::TCaseConversion howHere;
3083  if (how != ccTitle) howHere = how;
3084  else {
3085  if (srcIdx == nextWordBoundary) { // A word starts/ends here.
3086  seenCased = false;
3087  size_t next = nextWordBoundary; FindNextWordBoundary(src, origSrcIdx, srcCount, next);
3088  IAssert(next > nextWordBoundary); nextWordBoundary = next; }
3089  bool isCased = IsCased(cp);
3090  if (isCased && ! seenCased) { howHere = ccTitle; seenCased = true; }
3091  else howHere = ccLower;
3092  }
3093  int cpNew = (howHere == ccTitle ? ci.simpleTitleCaseMapping : howHere == ccUpper ? ci.simpleUpperCaseMapping : ci.simpleLowerCaseMapping);
3094  if (cpNew >= 0) src[TVecIdx(srcIdx)] = cpNew;
3095  }
3096 }
3097 
3098 //-----------------------------------------------------------------------------
3099 // TUniChDb -- composition, decomposition, normal forms
3100 //-----------------------------------------------------------------------------
3101 
3102 template<typename TDestCh>
3103 void TUniChDb::AddDecomposition(const int codePoint, TVec<TDestCh>& dest, const bool compatibility) const
3104 {
3105  if (HangulSBase <= codePoint && codePoint < HangulSBase + HangulSCount)
3106  {
3107  // UAX #15, sec. 16: Hangul decomposition
3108  const int SIndex = codePoint - HangulSBase;
3109  const int L = HangulLBase + SIndex / HangulNCount;
3110  const int V = HangulVBase + (SIndex % HangulNCount) / HangulTCount;
3111  const int T = HangulTBase + (SIndex % HangulTCount);
3112  dest.Add(L); dest.Add(V);
3113  if (T != HangulTBase) dest.Add(T);
3114  return;
3115  }
3116  int i = h.GetKeyId(codePoint); if (i < 0) { dest.Add(codePoint); return; }
3117  const TUniChInfo &ci = h[i];
3118  int ofs = ci.decompOffset; if (ofs < 0) { dest.Add(codePoint); return; }
3119  if ((! compatibility) && ci.IsCompatibilityDecomposition()) { dest.Add(codePoint); return; }
3120  while (true) {
3121  int cp = decompositions[ofs++]; if (cp < 0) return;
3122  AddDecomposition(cp, dest, compatibility); }
3123 }
3124 
3125 template<typename TSrcVec, typename TDestCh>
3126 void TUniChDb::Decompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
3127  TVec<TDestCh>& dest, const bool compatibility, bool clrDest) const
3128 {
3129  if (clrDest) dest.Clr();
3130  const size_t destStart = dest.Len()/*, srcEnd = srcIdx + srcCount*/;
3131  // Decompose the string.
3132  while (srcIdx < srcCount) {
3133  AddDecomposition(src[TVecIdx(srcIdx)], dest, compatibility); srcIdx++; }
3134  // Rearrange the decomposed string into canonical order.
3135  for (size_t destIdx = destStart, destEnd = dest.Len(); destIdx < destEnd; )
3136  {
3137  size_t j = destIdx;
3138  int cp = dest[TVecIdx(destIdx)]; destIdx++;
3139  int cpCls = GetCombiningClass(cp);
3140  if (cpCls == TUniChInfo::ccStarter) continue;
3141  while (destStart < j && GetCombiningClass(dest[TVecIdx(j - 1)]) > cpCls) {
3142  dest[TVecIdx(j)] = dest[TVecIdx(j - 1)]; j--; }
3143  dest[TVecIdx(j)] = cp;
3144  }
3145 }
3146 
3147 template<typename TSrcVec, typename TDestCh>
3148 void TUniChDb::DecomposeAndCompose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
3149  TVec<TDestCh>& dest, bool compatibility, bool clrDest) const
3150 {
3151  if (clrDest) dest.Clr();
3152  TIntV temp;
3153  Decompose(src, srcIdx, srcCount, temp, compatibility);
3154  Compose(temp, 0, temp.Len(), dest, clrDest);
3155 }
3156 
3157 template<typename TSrcVec, typename TDestCh>
3158 void TUniChDb::Compose(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
3159  TVec<TDestCh>& dest, bool clrDest) const
3160 {
3161  if (clrDest) dest.Clr();
3162  bool lastStarterKnown = false; // has a starter been encountered yet?
3163  size_t lastStarterPos = size_t(-1); // the index (in 'dest') of the last starter
3164  int cpLastStarter = -1; // the codepoint of the last starter (i.e. cpLastStarter == dest[lastStarterPos])
3165  const size_t srcEnd = srcIdx + srcCount;
3166  int ccMax = -1; // The highest combining class among the characters since the last starter.
3167  while (srcIdx < srcEnd)
3168  {
3169  const int cp = src[TVecIdx(srcIdx)]; srcIdx++;
3170  const int cpClass = GetCombiningClass(cp);
3171  //int cpCombined = -1;
3172  // If there is a starter with which 'cp' can be combined, and from which it is not blocked
3173  // by some intermediate character, we can try to combine them.
3174  if (lastStarterKnown && ccMax < cpClass)
3175  {
3176  int j = inverseDec.GetKeyId(TIntPr(cpLastStarter, cp));
3177  int cpCombined = -1;
3178  do {
3179  // Try to look up a composition in the inverseDec table.
3180  if (j >= 0) { cpCombined = inverseDec[j]; break; }
3181  // UAX #15, sec. 16: Hangul composition
3182  // - Try to combine L and V.
3183  const int LIndex = cpLastStarter - HangulLBase;
3184  if (0 <= LIndex && LIndex < HangulLCount) {
3185  const int VIndex = cp - HangulVBase;
3186  if (0 <= VIndex && VIndex < HangulVCount) {
3187  cpCombined = HangulSBase + (LIndex * HangulVCount + VIndex) * HangulTCount;
3188  break; } }
3189  // - Try to combine LV and T.
3190  const int SIndex = cpLastStarter - HangulSBase;
3191  if (0 <= SIndex && SIndex < HangulSCount && (SIndex % HangulTCount) == 0)
3192  {
3193  const int TIndex = cp - HangulTBase;
3194  if (0 <= TIndex && TIndex < HangulTCount) {
3195  cpCombined = cpLastStarter + TIndex;
3196  break; }
3197  }
3198  } while (false);
3199  // If a combining character has been found, use it to replace the old cpStarter.
3200  if (cpCombined >= 0) {
3201  dest[TVecIdx(lastStarterPos)] = cpCombined;
3203  // if (cpCombined is not a starter) { starterKnown = false; lastStarterPos = size_t(01); cpLastStarter = -1; } else
3204  cpLastStarter = cpCombined; continue; }
3205  }
3206  if (cpClass == TUniChInfo::ccStarter) { // 'cp' is a starter, remember it for later. Set ccMax to -1 so that this starter can be combined with another starter.
3207  lastStarterKnown = true; lastStarterPos = dest.Len(); cpLastStarter = cp; ccMax = cpClass - 1; }
3208  else if (cpClass > ccMax) // Remember cp's class as the new maximum class since the last starter (for blocking).
3209  ccMax = cpClass;
3210  dest.Add(cp);
3211  }
3212 }
3213 
3214 template<typename TSrcVec, typename TDestCh>
3215 size_t TUniChDb::ExtractStarters(const TSrcVec& src, size_t srcIdx, const size_t srcCount,
3216  TVec<TDestCh>& dest, bool clrDest) const
3217 {
3218  if (clrDest) dest.Clr();
3219  size_t retVal = 0;
3220  for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++) {
3221  const int cp = src[TVecIdx(srcIdx)];
3223  { dest.Add(cp); retVal++; } }
3224  return retVal;
3225 }
3226 
// Returns false.  The value is obtained from a small run-time computation
// (0 + 1 + 2 + 3 + 4 = 10, which is not greater than 100) rather than from a literal.
inline bool AlwaysFalse()
{
	int total = 0;
	int k = 5;
	while (k > 0) { --k; total += k; }  // adds 4, 3, 2, 1, 0
	return 100 < total;
}
3233 
// Returns true.  The value is obtained from a small run-time computation
// (0 + 1 + 2 + 3 + 4 = 10, which is less than 100) rather than from a literal.
inline bool AlwaysTrue()
{
	int total = 0;
	int k = 5;
	while (k > 0) { --k; total += k; }  // adds 4, 3, 2, 1, 0
	return 100 > total;
}
3240 
3241 /*
3242 
3243 Notes on decomposition:
3244 
3245 - In UnicodeData.txt, there is a field with the decomposition mapping.
3246  This field may also include a tag, <...>.
3247  If there is a tag, this is a compatibility mapping.
3248  Otherwise it is a canonical mapping.
3249 - Canonical decomposition uses only canonical mappings,
3250  compatibility decomposition uses both canonical and compatibility mappings.
3251 - Decomposition:
3252  1. Apply the decomposition mappings (canonical or canonical+compatibility), recursively.
3253  2. Put the string into canonical order, which means:
3254  while there exists a pair of characters, A immediately followed by B,
3255  such that combiningclass(A) > combiningclass(B) > 0 [an "exchangeable pair"]:
3256  swap A and B;
3257  This results in NFD (normalized form D, after canonical decomposition)
3258  or NFKD (normalized form KD, after compatibility decomposition).
3259 - Canonical composition:
3260  1. Before composition, the string should have been decomposed
3261  (using either canonical or compatibility decomposition).
3262  2. For each character C (from left to right):
3263  2.1. Find the last starter S before C (if not found, continue).
3264  2.2. If there is, between S and C, some character whose combining class is greater than or equal to that of C, then continue.
3265  2.3. If there exists a character L for which the canonical decomposition is S+C
3266  and L is not in the composition exclusion table [i.e. L is a "primary composite"],
3267  then replace S by L, and remove C.
3268  This results in NFC (normalized form C, with canonical decomposition followed by canonical composition)
3269  or NFKC (normalized form KC, with compatibility decomposition followed by canonical composition).
3270 - Composition exclusion table:
3271  - Anything in CompositionExclusions.txt.
3272  - Singletons: characters whose canonical decomposition is a single character.
3273  - Non-starter decompositions: characters whose canonical decomposition begins with a non-starter.
3274 
3275 Example:
3276  E-grave (00c8; composition class 0; canonical decomposition: 0045 0300)
3277  E-macron (0112; composition class 0; 0045 0304)
3278  grave (0300; composition class 230)
3279  macron (0304; composition class 230)
3280  source string: 00c8 0304
3281  after canonical decomposition (or compatibility decomposition, they would be the same here): 0045 0300 0304
3282  after canonical composition: 00c8 0304
3283 
3284  cc(horn) = 216
3285  cc(dot below) = 220
3286  cc(dot above) = 230
3287 
3288 ToDos:
3289 - case folding - is intended primarily for comparing the strings obtained this way.
3290  The function f(s) = NFC(toCaseFold(s)) is idempotent.
3291  The function g(s) = NFKC(toCaseFold(s)), however, is not -- if we want that, the folding
3292  has to take some additional mappings into account (see 5.18, last paragraph; DerivedNormalizationProps.txt).
3293 - It seems that CaseFolding.txt is essentially just plain folding to lowercase.
3294  Since we also want the other foldings, we should rather look at SpecialCasing.txt
3295  (+ the simple case mappings in UnicodeData.txt).
3296  I suggest that when reading SpecialCasing.txt we simply ignore the conditional mappings
3297  and then handle them separately, directly in the source code of our programs [for a
3298  detailed definition of the conditions, see table 3.13].
3299  - Addendum: it nevertheless seems to me that CaseFolding.txt is slightly different from ordinary lowercase.
3300  For example, for small final sigma 03c2 it says there that it should be changed into the ordinary small sigma 03c3.
3301  This follows neither from UnicodeData.txt nor from SpecialCasing.txt, even though UCD.html says
3302  that CaseFolding.txt is derived from the two. The main purpose of CaseFolding.txt is supposedly
3303  "locale-independent case folding" (table 4.1 and sec. 5.18).
3304  - Before you start dealing with case conversions, have a look at section 3.13
3305  and especially p. 90.
3306  - See p. 91 about the combination N[K]FD + caseFold + N[K]FD
3307  - the definition of cased etc. is on p. 89
3308 - isIdentifierStart(c), isIdentifierEnd(c) -- sec. 5.15
3309  See DerivedCoreProperties.txt, where a number of similar things are defined in a similar way,
3310  among them isLowerCase and isUpperCase. isLetter, isAlphabetic etc. are there as well (sec. 4.9).
3311  It is best to add these to the flags of each individual character.
3312 - general category: sec. 4.5
3313 - motivation for titlecase: 5.18
3314 - compare our current computation of compositionExclusion with what is computed in DerivedNormalizationProps.txt
3315  under Full_Composition_Exclusion
3316 - script names: Scripts.txt and UAX #24.
3317 - block names: Blocks.txt
3318 - space characters: table 6.2 and reportedly also UCD.html
3319 - dash characters: table 6.3
3320 */
3321 
3322 //#endif
3323 
void InitAfterLoad()
Definition: unicode.cpp:1368
Definition: bd.h:440
#define IAssert(Cond)
Definition: bd.h:262
int GetWbFlags() const
Definition: unicode.h:1118
bool IsVariationSelector() const
Definition: unicode.h:1103
void ToSimpleTitleCase(TSrcVec &src, size_t srcIdx, const size_t srcCount) const
Definition: unicode.h:1611
TUniChCategory_
Definition: unicode.h:662
static int SwapBytes(int x)
Definition: unicode.h:250
TPair< TInt, TInt > TIntPr
Definition: ds.h:83
void Load(TSIn &SIn)
Definition: unicode.h:286
bool IsSbFlag(const int cp, const TUniChFlags flag) const
Definition: unicode.h:1358
static int FromUnicode(int c)
Definition: unicode.h:499
void InitCodecs()
Definition: unicode.cpp:1683
void Clr()
Definition: unicode.h:1276
void ToSimpleUpperCase(TIntV &src) const
Definition: unicode.h:1977
void GetUpperCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
Definition: unicode.h:1591
static void Add(TVector &vector, const TElement &element)
Definition: unicode.h:432
void Init()
Definition: unicode.h:1779
void TestDecodeUtf16(TRnd &rnd, const TStr &testCaseDesc, const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom)
Definition: unicode.cpp:341
static const int fromUnicodeTable1[6 *16]
Definition: unicode.h:510
void DecomposeAndCompose(const TSrcVec &src, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const
Definition: unicode.h:1542
const char * GetCharName(const int cp) const
Definition: unicode.h:1331
void Compose(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest=true) const
Definition: unicode.h:3158
bool strict
Definition: unicode.h:83
enum TUniChProperties_ TUniChProperties
T8BitCodec< TEncoding_ISO8859_4 > iso8859_4
Definition: unicode.h:1849
TStr GetStr() const
Definition: dt.h:1200
#define IAssertR(Cond, Reason)
Definition: bd.h:265
TUniVecIdx TVecIdx
Definition: unicode.h:118
bool Has1Gram(const TItem &item) const
Definition: unicode.h:1204
void ToSimpleTitleCase(TSrcVec &src) const
Definition: unicode.h:1614
size_t srcIdx
Definition: unicode.h:32
int Len() const
Definition: dt.h:490
void SetPropertyX(const TUniChPropertiesX flag)
Definition: unicode.h:1108
static void AppendVector(const TVec< TSrcDat > &src, TVec< TDestDat > &dest)
Definition: unicode.h:278
static TStr GetName()
Definition: unicode.h:531
TUniChCategory GetCat(const int cp) const
Definition: unicode.h:1353
static int ToUnicode(int c)
Definition: unicode.h:481
int GetScriptByName(const TStr &scriptName) const
Definition: unicode.h:1322
TUcdFileReader & operator=(const TUcdFileReader &r)
Definition: unicode.h:1678
void ToSimpleUpperCase(TSrcVec &src) const
Definition: unicode.h:1612
void Clr()
Definition: unicode.h:1200
bool Empty() const
Definition: unicode.h:1202
void SbEx_Add(const TStr &s)
Definition: unicode.h:1492
static TStr GetBinFn()
Definition: unicode.h:1310
TUniVecIdx TVecIdx
Definition: unicode.h:281
TStr EncodeUtf8Str(const TIntV &src) const
Definition: unicode.h:1796
bool IsSbFormat() const
Definition: unicode.h:1129
enum TUniChFlags_ TUniChFlags
bool IsCompositionExclusion() const
Definition: unicode.h:1111
static void Add(TVector &vector, const TElement &element)
Definition: unicode.h:423
THash< TItemPr, TVoid > pairs
Definition: unicode.h:1195
TUniCaseFolding(TSIn &SIn)
Definition: unicode.h:285
void SaveBin(const TStr &fnBinUcd)
Definition: unicode.cpp:1362
#define ___OutRepl
Definition: dt.h:11
TUniChDb::TCaseConversion TCaseConversion
Definition: unicode.h:1963
bool IsDcpFlag(const TUniChFlags flag) const
Definition: unicode.h:1068
static const ushort LineBreak_Quotation
Definition: unicode.h:1032
void SetProperty(const TUniChProperties flag)
Definition: unicode.h:1085
void Add(const TSrcVec &src)
Definition: unicode.h:1249
Definition: ds.h:130
bool IsGraphemeExtend() const
Definition: unicode.h:1077
void SetSbFlag(const TUniChFlags flag)
Definition: unicode.h:1127
void GetSimpleCaseConverted(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TCaseConversion how) const
Definition: unicode.h:3042
static const int fromUnicodeTable1[14 *16]
Definition: unicode.h:480
static TStr GetSpecialCasingFn()
Definition: unicode.h:1297
enum TUniChSubCategory_ TUniChSubCategory
TUniChSubCategory subCat
Definition: unicode.h:1020
TPair< TItem, TItem > TItemPr
Definition: unicode.h:1191
T8BitCodec< TEncoding_ISO8859_1 > iso8859_1
Definition: unicode.h:1846
TVec< TDat > TVector
Definition: unicode.h:421
void Save(TSOut &SOut) const
Definition: dt.cpp:1694
int GetWbFlags(const int cp) const
Definition: unicode.h:1357
TPt< TCodecBase > PCodecBase
Definition: unicode.h:328
bool IsSurrogate() const
Definition: unicode.h:1162
void WbFindNextNonIgnoredS(const TSrcVec &src, size_t &position, const size_t srcEnd) const
Definition: unicode.h:1429
void SetDcpFlag(const TUniChFlags flag)
Definition: unicode.h:1070
void SetWbFlag(const TUniChFlags flag)
Definition: unicode.h:1117
virtual void Test() const
Definition: unicode.h:345
TUtf16BomHandling_
Definition: unicode.h:46
bool IsGraphemeBase() const
Definition: unicode.h:1076
TStr EncodeUtf8Str(const TSrcVec &src, size_t srcIdx, const size_t srcCount) const
Definition: unicode.h:149
void ToSimpleUpperCase(TSrcVec &src, size_t srcIdx, const size_t srcCount) const
Definition: unicode.h:1609
enum TUniChCategory_ TUniChCategory
enum TUnicodeErrorHandling_ TUnicodeErrorHandling
unsigned int uint
Definition: bd.h:11
T8BitCodec< TEncoding_ISO8859_3 > TCodec_ISO8859_3
Definition: unicode.h:651
static int ToUnicode(int c)
Definition: unicode.h:533
TUniChSubCategory GetSubCat(const int cp) const
Definition: unicode.h:1354
TSubcatHelper(TUniChDb &owner_)
Definition: unicode.h:1727
size_t FromUnicode(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TDestVec &dest, const bool clrDest=true) const
Definition: unicode.h:618
uchar combClass
Definition: unicode.h:1018
#define Fail
Definition: bd.h:238
size_t ToUnicode(const TStr &src, TIntV &dest, const bool clrDest=true) const
Definition: unicode.h:613
static TStr GetScriptNameKatakana()
Definition: unicode.h:1318
TUniTrie< TInt > sbExTrie
Definition: unicode.h:1461
const char * GetCStr(const uint &Offset) const
Definition: dt.h:814
static const ushort LineBreak_InfixNumeric
Definition: unicode.h:1032
TUniChCategory cat
Definition: unicode.h:1019
static uint GetRndUint(TRnd &rnd)
Definition: unicode.cpp:62
bool IsLogicalOrderException() const
Definition: unicode.h:1097
void Clr()
Definition: dt.h:258
TUniCodec()
Definition: unicode.h:91
static TStr GetName()
Definition: unicode.h:465
T8BitCodec< TEncoding_ISO8859_4 > TCodec_ISO8859_4
Definition: unicode.h:652
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
static int ToUnicode(int c)
Definition: unicode.h:444
virtual TStr GetName() const =0
void InitPropList(const TStr &basePath)
Definition: unicode.cpp:950
static const int toUnicodeTable[8 *16]
Definition: unicode.h:532
TUniChSubCategory GetSubCat() const
Definition: unicode.h:1146
void ToCaseFolded(TSrcVec &src, const bool turkic=false) const
Definition: unicode.h:1637
void Save(TSOut &SOut) const
Definition: hash.h:183
void GetSimpleTitleCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:1603
bool IsDefaultIgnorable() const
Definition: unicode.h:1075
bool Empty() const
Definition: hash.h:227
void ToSimpleLowerCase(TSrcVec &src) const
Definition: unicode.h:1613
T8BitCodec(TUnicodeErrorHandling errorHandling_, int replacementChar_=TUniCodec::DefaultReplacementChar)
Definition: unicode.h:571
void ClrCodecs()
Definition: unicode.h:1881
enum TUniChDb::TCaseConversion_ TCaseConversion
void ClrDcpFlags()
Definition: unicode.h:1069
void GetSimpleUpperCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:1602
bool IsAlphabetic() const
Definition: unicode.h:1071
void GetLowerCase(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true, const bool turkic=false, const bool lithuanian=false) const
Definition: unicode.h:1590
int GetSbFlags(const int cp) const
Definition: unicode.h:1359
void WbFindCurOrNextNonIgnored(const TSrcVec &src, size_t &position, const size_t srcEnd) const
Definition: unicode.h:1422
TUniChInfo(TSIn &SIn)
Definition: unicode.h:1061
virtual size_t FromUnicode(const TIntV &src, size_t srcIdx, const size_t srcCount, TStr &dest, const bool clrDest=true) const
Definition: unicode.h:397
static const ushort LineBreak_ComplexContext
Definition: unicode.h:1032
T8BitCodec< TEncoding_CP852 > cp852
Definition: unicode.h:1852
TIntIntVH cfFull
Definition: unicode.h:275
const TStr & GetScriptName(const int scriptId) const
Definition: unicode.h:1321
TVec< TNode > TNodeV
Definition: unicode.h:1190
int Len() const
Definition: dt.h:259
TUniChCategory GetCat() const
Definition: unicode.h:1145
static TStr GetName()
Definition: unicode.h:443
TIntIntVH specialCasingUpper
Definition: unicode.h:1271
int GetScript(const TUniChInfo &ci) const
Definition: unicode.h:1323
static const int yuAsciiChars[10]
Definition: unicode.h:493
TStr GetSubStr(const int &BChN, const int &EChN) const
Definition: dt.cpp:811
void GetUpperCase(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1966
T8BitCodec< TEncoding_ISO8859_3 > iso8859_3
Definition: unicode.h:1848
THash< TItem, TVoid > singles
Definition: unicode.h:1194
void RegisterCodec(const TStr &nameList, const PCodecBase &codec)
Definition: unicode.h:1873
TUniChDb ucd
Definition: unicode.h:1775
void InitDerivedCoreProperties(const TStr &basePath)
Definition: unicode.cpp:1007
void InitAfterLoad()
Definition: unicode.h:1035
virtual void Test() const
Definition: unicode.h:386
static int FromUnicode(int c)
Definition: unicode.h:445
bool IsWhiteSpace() const
Definition: unicode.h:1104
static int FromUnicode(int c)
Definition: unicode.h:469
void InitLineBreaks(const TStr &basePath)
Definition: unicode.cpp:1046
static const int uniChars[10]
Definition: unicode.h:493
void Decompose(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool compatibility, bool clrDest=true) const
Definition: unicode.h:3126
void GetLowerCase(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1965
void WbFindNextNonIgnored(const TSrcVec &src, size_t &position, const size_t srcEnd) const
Definition: unicode.h:1425
void ToSimpleLowerCase(TIntV &src) const
Definition: unicode.h:1978
bool WbFindPrevNonIgnored(const TSrcVec &src, const size_t srcStart, size_t &position) const
Definition: unicode.h:1434
char chCat
Definition: unicode.h:1017
#define Trans(curFlag, newState)
void Add(const TSrcVec &src, const size_t srcIdx, const size_t srcCount)
Definition: unicode.h:1220
static TStr GetNormalizationTestFn()
Definition: unicode.h:1309
T8BitCodec< TEncoding_ISO8859_2 > TCodec_ISO8859_2
Definition: unicode.h:650
void GetCaseFolded(const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true, const bool full=true, const bool turkic=false) const
Definition: unicode.h:1632
void Clr()
Definition: bd.h:502
void Load(TSIn &SIn)
Definition: ds.h:946
TItem_ TItem
Definition: unicode.h:1180
enum TUniChPropertiesX_ TUniChPropertiesX
void GetSimpleLowerCase(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1972
bool IsWbFlag(const int cp, const TUniChFlags flag) const
Definition: unicode.h:1356
static void SaveUShort(TSOut &SOut, ushort u)
Definition: unicode.h:1047
TUnicodeErrorHandling errorHandling
Definition: unicode.h:66
bool IsGbExtend() const
Definition: unicode.h:1139
THash< TInt, TIntV > TIntIntVH
Definition: unicode.h:269
void ToCaseFolded(TIntV &src) const
Definition: unicode.h:1992
void DecomposeAndCompose(const TIntV &src, TIntV &dest, bool compatibility) const
Definition: unicode.h:1946
bool IsUppercase() const
Definition: unicode.h:1072
void Test(const TStr &basePath)
Definition: unicode.cpp:1377
static const int fromUnicodeTable2[2 *16]
Definition: unicode.h:532
static const int fromUnicodeTable2[4 *16]
Definition: unicode.h:510
T8BitCodec< TEncoding_CP437 > cp437
Definition: unicode.h:1853
TNodeV nodes
Definition: unicode.h:1197
virtual void LoadCs()
Definition: fl.cpp:28
void Load(TSIn &SIn)
Definition: hash.h:177
static void ParseCodePointRange(const TStr &s, int &from, int &to)
Definition: unicode.h:1703
TIntIntVH specialCasingLower
Definition: unicode.h:1271
virtual TStr GetName() const
Definition: unicode.h:384
int simpleUpperCaseMapping
Definition: unicode.h:1022
size_t ExtractStarters(const TSrcVec &src, TVec< TDestCh > &dest, bool clrDest=true) const
Definition: unicode.h:1551
static TStr GetName()
Definition: unicode.h:509
bool FindNextWordBoundary(const TIntV &src, int &position) const
Definition: unicode.h:1901
size_t ToUnicode(const TStr &src, TIntV &dest, const bool clrDest=true) const
Definition: unicode.h:353
static void LoadUShort(TSIn &SIn, ushort &u)
Definition: unicode.h:1045
size_t UniToStr(const TIntV &src, TStr &dest, const bool clrDest=true) const
Definition: unicode.h:646
Definition: fl.h:58
void TestCaseConversion(const TStr &source, const TStr &trueLc, const TStr &trueTc, const TStr &trueUc, bool turkic, bool lithuanian)
Definition: unicode.cpp:825
static int FromUnicode(int c)
Definition: unicode.h:483
void Save(TSOut &SOut) const
Definition: ds.h:954
static TStr GetUnicodeDataFn()
Definition: unicode.h:1298
T8BitCodec< TEncoding_YuAscii > TCodec_YuAscii
Definition: unicode.h:656
THash< TIntPr, TInt > inverseDec
Definition: unicode.h:1267
void FindWordBoundaries(const TIntV &src, TBoolV &dest) const
Definition: unicode.h:1907
bool IsPropertyX(const TUniChPropertiesX flag) const
Definition: unicode.h:1107
bool FindNextSentenceBoundary(const TSrcVec &src, const size_t srcIdx, const size_t srcCount, size_t &position) const
Definition: unicode.h:2636
size_t EncodeUtf8(const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const
Definition: unicode.h:145
void ClrSentenceBoundaryExceptions()
Definition: unicode.h:1924
bool IsPrivateUse() const
Definition: unicode.h:1161
size_t DecodeUtf16FromWords(const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
Definition: unicode.h:2294
TStr GetWbFlagsStr() const
Definition: unicode.h:1120
TUniByteOrder_
Definition: unicode.h:38
static TStr GetScriptsFn()
Definition: unicode.h:1300
static const int fromUnicodeTable3[6 *16]
Definition: unicode.h:510
void ToSimpleCaseConverted(TSrcVec &src, size_t srcIdx, const size_t srcCount, const TCaseConversion how) const
Definition: unicode.h:3072
void FindSentenceBoundaries(const TIntV &src, TBoolV &dest) const
Definition: unicode.h:1922
static int ToUnicode(int c)
Definition: unicode.h:467
int propertiesX
Definition: unicode.h:1027
bool IsLowercase() const
Definition: unicode.h:1073
void Clr(bool DoDel=false)
Definition: dt.h:819
size_t FromUnicode(const TIntV &src, TChA &dest, const bool clrDest=true) const
Definition: unicode.h:362
bool IsDeprecated() const
Definition: unicode.h:1089
void Clr()
Definition: unicode.h:288
void TestCaseConversions()
Definition: unicode.cpp:853
int simpleTitleCaseMapping
Definition: unicode.h:1022
bool IsCurrency() const
Definition: unicode.h:1157
Definition: dt.h:781
TUniChFlags_
Definition: unicode.h:712
static PSIn New(const TStr &FNm)
Definition: fl.cpp:290