SNAP Library 6.0, Developer Reference  2020-12-09 16:24:20
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
unicode.h File Reference
#include "bd.h"
#include <new>
Include dependency graph for unicode.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

class  TUnicodeException
 
class  TUniCodec
 
class  TUniCaseFolding
 
class  TCodecBase
 
class  TCodecWrapper< TCodecImpl_ >
 
class  TVecElt< TVector_ >
 
class  TVecElt< TVec< TDat > >
 
class  TVecElt< TChA >
 
class  TEncoding_ISO8859_1
 
class  TEncoding_ISO8859_2
 
class  TEncoding_ISO8859_3
 
class  TEncoding_ISO8859_4
 
class  TEncoding_YuAscii
 
class  TEncoding_CP437
 
class  TEncoding_CP852
 
class  TEncoding_CP1250
 
class  T8BitCodec< TEncoding_ >
 
class  TUniChInfo
 
class  TUniTrie< TItem_ >
 
class  TUniTrie< TItem_ >::TNode
 
class  TUniChDb
 
class  TUniChDb::TUcdFileReader
 
class  TUniChDb::TSubcatHelper
 
class  TUnicode
 

Macros

#define DefineByte(b7, b6, b5, b4, b3, b2, b1, b0)   _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0
 
#define DefineUniCat(cat, c)   uc ## cat = (int(uchar(c)) & 0xff)
 
#define DefineUniSubCat(cat, subCat, c)   uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff)
 
#define ___UniFwd1(name)   bool name(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].name(); }
 
#define ___UniFwd2(name1, name2)   ___UniFwd1(name1) ___UniFwd1(name2)
 
#define ___UniFwd3(name1, name2, name3)   ___UniFwd2(name1, name2) ___UniFwd1(name3)
 
#define ___UniFwd4(name1, name2, name3, name4)   ___UniFwd3(name1, name2, name3) ___UniFwd1(name4)
 
#define ___UniFwd5(name1, name2, name3, name4, name5)   ___UniFwd4(name1, name2, name3, name4) ___UniFwd1(name5)
 
#define DECLARE_FORWARDED_PROPERTY_METHODS
 
#define ___UniFwd1(name)   bool name(const int cp) const { return ucd.name(cp); }
 
#define ___OutRepl   if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); }
 
#define TestCurNext(curFlag, nextFlag)   if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
 
#define TestCurNext2(curFlag, nextFlag, next2Flag)   if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue
 
#define TestPrevCurNext(prevFlag, curFlag, nextFlag)   if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
 
#define TestCur(curFlag)   ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag)
 
#define Trans(curFlag, newState)   if (TestCur(curFlag)) { backState = st##newState; break; }
 
#define IsPeekAheadSkippable(sbf)   ((sbf & (ucfSbOLetter | ucfSbUpper | ucfSbLower | ucfSbSep | ucfSbSTerm | ucfSbATerm)) == 0)
 
#define TestCurNext(curFlag, nextFlag)   if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
 
#define TestCurNext2(curFlag, nextFlag, next2Flag)   if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue
 
#define TestPrevCurNext(prevFlag, curFlag, nextFlag)   if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
 

Typedefs

typedef int TUniVecIdx
 
typedef enum TUnicodeErrorHandling_ TUnicodeErrorHandling
 
typedef enum TUniByteOrder_ TUniByteOrder
 
typedef enum TUtf16BomHandling_ TUtf16BomHandling
 
typedef THash< TInt, TIntV > TIntIntVH
 
typedef TPt< TCodecBase > PCodecBase
 
typedef TVec< PCodecBase > TCodecBaseV
 
typedef T8BitCodec< TEncoding_ISO8859_1 > TCodec_ISO8859_1
 
typedef T8BitCodec< TEncoding_ISO8859_2 > TCodec_ISO8859_2
 
typedef T8BitCodec< TEncoding_ISO8859_3 > TCodec_ISO8859_3
 
typedef T8BitCodec< TEncoding_ISO8859_4 > TCodec_ISO8859_4
 
typedef T8BitCodec< TEncoding_CP852 > TCodec_CP852
 
typedef T8BitCodec< TEncoding_CP437 > TCodec_CP437
 
typedef T8BitCodec< TEncoding_CP1250 > TCodec_CP1250
 
typedef T8BitCodec< TEncoding_YuAscii > TCodec_YuAscii
 
typedef enum TUniChCategory_ TUniChCategory
 
typedef enum TUniChSubCategory_ TUniChSubCategory
 
typedef enum TUniChFlags_ TUniChFlags
 
typedef enum TUniChProperties_ TUniChProperties
 
typedef enum TUniChPropertiesX_ TUniChPropertiesX
 

Enumerations

enum  TUnicodeErrorHandling_ { uehIgnore = 0, uehThrow = 1, uehReplace = 2, uehAbort = 3 }
 
enum  TUniByteOrder_ { boMachineEndian = 0, boLittleEndian = 1, boBigEndian = 2 }
 
enum  TUtf16BomHandling_ { bomAllowed = 0, bomRequired = 1, bomIgnored = 2 }
 
enum  TUniChCategory_ {
  ucLetter = (int(uchar('L')) & 0xff), ucMark = (int(uchar('M')) & 0xff), ucNumber = (int(uchar('N')) & 0xff), ucPunctuation = (int(uchar('P')) & 0xff),
  ucSymbol = (int(uchar('S')) & 0xff), ucSeparator = (int(uchar('Z')) & 0xff), ucOther = (int(uchar('C')) & 0xff)
}
 
enum  TUniChSubCategory_ {
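  ucLetterUppercase = ((ucLetter) << 8) | (int(uchar('u')) & 0xff), ucLetterLowercase = ((ucLetter) << 8) | (int(uchar('l')) & 0xff), ucLetterTitlecase = ((ucLetter) << 8) | (int(uchar('t')) & 0xff), ucLetterModifier = ((ucLetter) << 8) | (int(uchar('m')) & 0xff),
  ucLetterOther = ((ucLetter) << 8) | (int(uchar('o')) & 0xff), ucMarkNonspacing = ((ucMark) << 8) | (int(uchar('n')) & 0xff), ucMarkSpacingCombining = ((ucMark) << 8) | (int(uchar('c')) & 0xff), ucMarkEnclosing = ((ucMark) << 8) | (int(uchar('e')) & 0xff),
  ucNumberDecimalDigit = ((ucNumber) << 8) | (int(uchar('d')) & 0xff), ucNumberLetter = ((ucNumber) << 8) | (int(uchar('l')) & 0xff), ucNumberOther = ((ucNumber) << 8) | (int(uchar('o')) & 0xff), ucPunctuationConnector = ((ucPunctuation) << 8) | (int(uchar('c')) & 0xff),
  ucPunctuationDash = ((ucPunctuation) << 8) | (int(uchar('d')) & 0xff), ucPunctuationOpen = ((ucPunctuation) << 8) | (int(uchar('s')) & 0xff), ucPunctuationClose = ((ucPunctuation) << 8) | (int(uchar('e')) & 0xff), ucPunctuationInitialQuote = ((ucPunctuation) << 8) | (int(uchar('i')) & 0xff),
  ucPunctuationFinalQuote = ((ucPunctuation) << 8) | (int(uchar('f')) & 0xff), ucPunctuationOther = ((ucPunctuation) << 8) | (int(uchar('o')) & 0xff), ucSymbolMath = ((ucSymbol) << 8) | (int(uchar('m')) & 0xff), ucSymbolCurrency = ((ucSymbol) << 8) | (int(uchar('c')) & 0xff),
  ucSymbolModifier = ((ucSymbol) << 8) | (int(uchar('k')) & 0xff), ucSymbolOther = ((ucSymbol) << 8) | (int(uchar('o')) & 0xff), ucSeparatorSpace = ((ucSeparator) << 8) | (int(uchar('s')) & 0xff), ucSeparatorLine = ((ucSeparator) << 8) | (int(uchar('l')) & 0xff),
  ucSeparatorParagraph = ((ucSeparator) << 8) | (int(uchar('p')) & 0xff), ucOtherControl = ((ucOther) << 8) | (int(uchar('c')) & 0xff), ucOtherFormat = ((ucOther) << 8) | (int(uchar('f')) & 0xff), ucOtherSurrogate = ((ucOther) << 8) | (int(uchar('s')) & 0xff),
  ucOtherPrivateUse = ((ucOther) << 8) | (int(uchar('o')) & 0xff), ucOtherNotAssigned = ((ucOther) << 8) | (int(uchar('n')) & 0xff)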
}
 
enum  TUniChFlags_ {
  ucfCompatibilityDecomposition = 1, ucfCompositionExclusion = 1 << 1, ucfWbFormat = 1 << 2, ucfWbKatakana = 1 << 3,
  ucfWbALetter = 1 << 4, ucfWbMidLetter = 1 << 5, ucfWbMidNum = 1 << 6, ucfWbNumeric = 1 << 7,
  ucfWbExtendNumLet = 1 << 8, ucfSbSep = 1 << 9, ucfSbFormat = 1 << 10, ucfSbSp = 1 << 11,
  ucfSbLower = 1 << 12, ucfSbUpper = 1 << 13, ucfSbOLetter = 1 << 14, ucfSbNumeric = 1 << 15,
  ucfSbATerm = 1 << 16, ucfSbSTerm = 1 << 17, ucfSbClose = 1 << 18, ucfSbMask = ucfSbSep | ucfSbFormat | ucfSbSp | ucfSbLower | ucfSbUpper | ucfSbOLetter | ucfSbNumeric | ucfSbATerm | ucfSbSTerm | ucfSbClose,
  ucfWbMask = ucfWbFormat | ucfWbKatakana | ucfWbALetter | ucfWbMidLetter | ucfWbMidNum | ucfWbNumeric | ucfWbExtendNumLet | ucfSbSep, ucfDcpAlphabetic = 1 << 19, ucfDcpDefaultIgnorableCodePoint = 1 << 20, ucfDcpLowercase = 1 << 21,
  ucfDcpGraphemeBase = 1 << 22, ucfDcpGraphemeExtend = 1 << 23, ucfDcpIdStart = 1 << 24, ucfDcpIdContinue = 1 << 25,
  ucfDcpMath = 1 << 26, ucfDcpUppercase = 1 << 27, ucfDcpXidStart = 1 << 28, ucfDcpXidContinue = 1 << 29,
  ucfDcpMask
}
 
enum  TUniChProperties_ {
  ucfPrAsciiHexDigit = 1, ucfPrBidiControl = 2, ucfPrDash = 4, ucfPrDeprecated = 8,
  ucfPrDiacritic = 0x10, ucfPrExtender = 0x20, ucfPrGraphemeLink = 0x40, ucfPrHexDigit = 0x80,
  ucfPrHyphen = 0x100, ucfPrIdeographic = 0x200, ucfPrJoinControl = 0x400, ucfPrLogicalOrderException = 0x800,
  ucfPrNoncharacterCodePoint = 0x1000, ucfPrPatternSyntax = 0x2000, ucfPrPatternWhiteSpace = 0x4000, ucfPrQuotationMark = 0x8000,
  ucfPrSoftDotted = 0x10000, ucfPrSTerm = 0x20000, ucfPrTerminalPunctuation = 0x40000, ucfPrVariationSelector = 0x80000,
  ucfPrWhiteSpace = 0x100000
}
 
enum  TUniChPropertiesX_ {
  ucfPxOtherAlphabetic = 1, ucfPxOtherDefaultIgnorableCodePoint = 2, ucfPxOtherGraphemeExtend = 4, ucfPxOtherIdContinue = 8,
  ucfPxOtherIdStart = 0x10, ucfPxOtherLowercase = 0x20, ucfPxOtherMath = 0x40, ucfPxOtherUppercase = 0x80,
  ucfPxIdsBinaryOperator = 0x100, ucfPxIdsTrinaryOperator = 0x200, ucfPxRadical = 0x400, ucfPxUnifiedIdeograph = 0x800
}
 

Functions

bool AlwaysFalse ()
 
bool AlwaysTrue ()
 

Macro Definition Documentation

#define ___OutRepl   if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); }
#define ___UniFwd1 (   name)    bool name(const int cp) const { int i = h.GetKeyId(cp); if (i < 0) return false; else return h[i].name(); }

Definition at line 1361 of file unicode.h.

#define ___UniFwd1 (   name)    bool name(const int cp) const { return ucd.name(cp); }

Definition at line 2014 of file unicode.h.

#define ___UniFwd2 (   name1,
  name2 
)    ___UniFwd1(name1) ___UniFwd1(name2)

Definition at line 1362 of file unicode.h.

#define ___UniFwd3 (   name1,
  name2,
  name3 
)    ___UniFwd2(name1, name2) ___UniFwd1(name3)

Definition at line 1363 of file unicode.h.

#define ___UniFwd4 (   name1,
  name2,
  name3,
  name4 
)    ___UniFwd3(name1, name2, name3) ___UniFwd1(name4)

Definition at line 1364 of file unicode.h.

#define ___UniFwd5 (   name1,
  name2,
  name3,
  name4,
  name5 
)    ___UniFwd4(name1, name2, name3, name4) ___UniFwd1(name5)

Definition at line 1365 of file unicode.h.

#define DECLARE_FORWARDED_PROPERTY_METHODS
Value:
___UniFwd5(IsAsciiHexDigit, IsBidiControl, IsDash, IsDeprecated, IsDiacritic) \
___UniFwd5(IsExtender, IsGraphemeLink, IsHexDigit, IsHyphen, IsIdeographic) \
___UniFwd5(IsJoinControl, IsLogicalOrderException, IsNoncharacter, IsQuotationMark, IsSoftDotted) \
___UniFwd4(IsSTerminal, IsTerminalPunctuation, IsVariationSelector, IsWhiteSpace) \
___UniFwd5(IsAlphabetic, IsUppercase, IsLowercase, IsMath, IsDefaultIgnorable) \
___UniFwd4(IsGraphemeBase, IsGraphemeExtend, IsIdStart, IsIdContinue) \
___UniFwd2(IsXidStart, IsXidContinue) \
___UniFwd3(IsCompositionExclusion, IsCompatibilityDecomposition, IsSbSep) \
___UniFwd1(IsGbExtend) \
___UniFwd2(IsCased, IsCurrency)
#define ___UniFwd5(name1, name2, name3, name4, name5)
Definition: unicode.h:1365
#define ___UniFwd3(name1, name2, name3)
Definition: unicode.h:1363
#define ___UniFwd4(name1, name2, name3, name4)
Definition: unicode.h:1364
#define ___UniFwd1(name)
Definition: unicode.h:2014
#define ___UniFwd2(name1, name2)
Definition: unicode.h:1362

Definition at line 1367 of file unicode.h.

#define DefineByte (   b7,
  b6,
  b5,
  b4,
  b3,
  b2,
  b1,
  b0 
)    _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0

Definition at line 102 of file unicode.h.

#define DefineUniCat (   cat,
  c 
)    uc ## cat = (int(uchar(c)) & 0xff)

Definition at line 664 of file unicode.h.

#define DefineUniSubCat (   cat,
  subCat,
  c 
)    uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff)

Definition at line 678 of file unicode.h.

#define IsPeekAheadSkippable (   sbf)    ((sbf & (ucfSbOLetter | ucfSbUpper | ucfSbLower | ucfSbSep | ucfSbSTerm | ucfSbATerm)) == 0)
#define TestCur (   curFlag)    ((sbfCur & ucfSb##curFlag) == ucfSb##curFlag)
#define TestCurNext (   curFlag,
  nextFlag 
)    if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
#define TestCurNext (   curFlag,
  nextFlag 
)    if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
#define TestCurNext2 (   curFlag,
  nextFlag,
  next2Flag 
)    if ((wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag && (wbfNext2 & next2Flag) == next2Flag) continue
#define TestCurNext2 (   curFlag,
  nextFlag,
  next2Flag 
)    if ((sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag && (sbfNext2 & next2Flag) == next2Flag) continue
#define TestPrevCurNext (   prevFlag,
  curFlag,
  nextFlag 
)    if ((wbfPrev & prevFlag) == prevFlag && (wbfCur & curFlag) == curFlag && (wbfNext & nextFlag) == nextFlag) continue
#define TestPrevCurNext (   prevFlag,
  curFlag,
  nextFlag 
)    if ((sbfPrev & prevFlag) == prevFlag && (sbfCur & curFlag) == curFlag && (sbfNext & nextFlag) == nextFlag) continue
#define Trans (   curFlag,
  newState 
)    if (TestCur(curFlag)) { backState = st##newState; break; }

Typedef Documentation

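typedef TPt<TCodecBase> PCodecBase

Definition at line 328 of file unicode.h.

typedef T8BitCodec<TEncoding_CP1250> TCodec_CP1250

Definition at line 655 of file unicode.h.

typedef T8BitCodec<TEncoding_CP437> TCodec_CP437

Definition at line 654 of file unicode.h.

typedef T8BitCodec<TEncoding_CP852> TCodec_CP852

Definition at line 653 of file unicode.h.

typedef T8BitCodec<TEncoding_ISO8859_1> TCodec_ISO8859_1

Definition at line 649 of file unicode.h.

typedef T8BitCodec<TEncoding_ISO8859_2> TCodec_ISO8859_2

Definition at line 650 of file unicode.h.

typedef T8BitCodec<TEncoding_ISO8859_3> TCodec_ISO8859_3

Definition at line 651 of file unicode.h.

typedef T8BitCodec<TEncoding_ISO8859_4> TCodec_ISO8859_4

Definition at line 652 of file unicode.h.

typedef T8BitCodec<TEncoding_YuAscii> TCodec_YuAscii

Definition at line 656 of file unicode.h.

typedef TVec<PCodecBase> TCodecBaseV

Definition at line 330 of file unicode.h.

typedef THash<TInt, TIntV> TIntIntVH

Definition at line 269 of file unicode.h.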

typedef enum TUniChFlags_ TUniChFlags
typedef int TUniVecIdx

Definition at line 11 of file unicode.h.

Enumeration Type Documentation

enum TUniByteOrder_

Enumerator
boMachineEndian 
boLittleEndian 
boBigEndian 

Definition at line 38 of file unicode.h.

39 {
40  boMachineEndian = 0,
41  boLittleEndian = 1,
42  boBigEndian = 2
43 }
enum TUniChCategory_

Enumerator
ucLetter 
ucMark 
ucNumber 
ucPunctuation 
ucSymbol 
ucSeparator 
ucOther 

Definition at line 662 of file unicode.h.

663 {
664 #define DefineUniCat(cat, c) uc ## cat = (int(uchar(c)) & 0xff)
665  DefineUniCat(Letter, 'L'), // ucLetter
666  DefineUniCat(Mark, 'M'),
667  DefineUniCat(Number, 'N'),
668  DefineUniCat(Punctuation, 'P'),
669  DefineUniCat(Symbol, 'S'),
670  DefineUniCat(Separator, 'Z'),
671  DefineUniCat(Other, 'C')
672 #undef DefineUniCat
673 }
#define DefineUniCat(cat, c)
Definition: unicode.h:664
enum TUniChFlags_

Enumerator
ucfCompatibilityDecomposition 
ucfCompositionExclusion 
ucfWbFormat 
ucfWbKatakana 
ucfWbALetter 
ucfWbMidLetter 
ucfWbMidNum 
ucfWbNumeric 
ucfWbExtendNumLet 
ucfSbSep 
ucfSbFormat 
ucfSbSp 
ucfSbLower 
ucfSbUpper 
ucfSbOLetter 
ucfSbNumeric 
ucfSbATerm 
ucfSbSTerm 
ucfSbClose 
ucfSbMask 
ucfWbMask 
ucfDcpAlphabetic 
ucfDcpDefaultIgnorableCodePoint 
ucfDcpLowercase 
ucfDcpGraphemeBase 
ucfDcpGraphemeExtend 
ucfDcpIdStart 
ucfDcpIdContinue 
ucfDcpMath 
ucfDcpUppercase 
ucfDcpXidStart 
ucfDcpXidContinue 
ucfDcpMask 

Definition at line 712 of file unicode.h.

713 {
714  ucfCompatibilityDecomposition = 1, // if this flag is not set, the decomposition is canonical
715  ucfCompositionExclusion = 1 << 1, // from CompositionExclusions.txt
716  // Flags used when searching for word boundaries. See UAX #29.
717  ucfWbFormat = 1 << 2,
718  ucfWbKatakana = 1 << 3,
719  ucfWbALetter = 1 << 4,
720  ucfWbMidLetter = 1 << 5,
721  ucfWbMidNum = 1 << 6,
722  ucfWbNumeric = 1 << 7,
723  ucfWbExtendNumLet = 1 << 8,
724  // Flags used with sentence boundaries (Sep is also used with word boundaries). See UAX #29.
725  ucfSbSep = 1 << 9,
726  ucfSbFormat = 1 << 10,
727  ucfSbSp = 1 << 11,
728  ucfSbLower = 1 << 12,
729  ucfSbUpper = 1 << 13,
730  ucfSbOLetter = 1 << 14,
731  ucfSbNumeric = 1 << 15,
732  ucfSbATerm = 1 << 16,
733  ucfSbSTerm = 1 << 17,
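734  ucfSbClose = 1 << 18,
735  ucfSbMask = ucfSbSep | ucfSbFormat | ucfSbSp | ucfSbLower | ucfSbUpper | ucfSbOLetter | ucfSbNumeric | ucfSbATerm | ucfSbSTerm | ucfSbClose,
736  ucfWbMask = ucfWbFormat | ucfWbKatakana | ucfWbALetter | ucfWbMidLetter | ucfWbMidNum | ucfWbNumeric | ucfWbExtendNumLet | ucfSbSep,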
737  // Flags from DerivedCoreProperties.txt.
738  // [The comments are from UCD.html.]
739  // - Characters with the Alphabetic property. For more information, see Chapter 4 in [Unicode].
740  // Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
741  ucfDcpAlphabetic = 1 << 19,
742  // - For programmatic determination of default-ignorable code points.
743  // New characters that should be ignored in processing (unless explicitly supported)
744  // will be assigned in these ranges, permitting programs to correctly handle the default
745  // behavior of such characters when not otherwise supported. For more information, see
746  // UAX #29: Text Boundaries [Breaks].
747  // Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs + Noncharacters - White_Space - annotation characters
748  // [Examples: soft hyphen, zero-width space, noncharacters (e.g. U+fffe, U+ffff, U+1fffe, U+1ffff, etc.), surrogates, language tags, variation selectors]
749  ucfDcpDefaultIgnorableCodePoint = 1 << 20,
750  // - Characters with the Lowercase property. For more information, see Chapter 4 in [Unicode].
751  // Generated from: Other_Lowercase + Ll
752  ucfDcpLowercase = 1 << 21,
753  // - For programmatic determination of grapheme cluster boundaries.
754  // For more information, see UAX #29: Text Boundaries [Breaks].
755  // Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend
756  ucfDcpGraphemeBase = 1 << 22,
757  // - For programmatic determination of grapheme cluster boundaries.
758  // For more information, see UAX #29: Text Boundaries [Breaks].
759  // Generated from: Other_Grapheme_Extend + Me + Mn
760  // Note: depending on an application's interpretation of Co (private use), they may be either
761  // in Grapheme_Base, or in Grapheme_Extend, or in neither.
762  ucfDcpGraphemeExtend = 1 << 23,
763  // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.
764  ucfDcpIdStart = 1 << 24,
765  ucfDcpIdContinue = 1 << 25,
766  // - Characters with the Math property. For more information, see Chapter 4 in [Unicode].
767  // Generated from: Sm + Other_Math
768  ucfDcpMath = 1 << 26,
769  // - Characters with the Uppercase property. For more information, see Chapter 4 in [Unicode].
770  // Generated from: Lu + Other_Uppercase
771  ucfDcpUppercase = 1 << 27,
772  // - Used to determine programming identifiers, as described in UAX #31: Identifier and Pattern Syntax.
773  ucfDcpXidStart = 1 << 28,
774  ucfDcpXidContinue = 1 << 29,
777 }
enum TUniChProperties_

Enumerator
ucfPrAsciiHexDigit 
ucfPrBidiControl 
ucfPrDash 
ucfPrDeprecated 
ucfPrDiacritic 
ucfPrExtender 
ucfPrGraphemeLink 
ucfPrHexDigit 
ucfPrHyphen 
ucfPrIdeographic 
ucfPrJoinControl 
ucfPrLogicalOrderException 
ucfPrNoncharacterCodePoint 
ucfPrPatternSyntax 
ucfPrPatternWhiteSpace 
ucfPrQuotationMark 
ucfPrSoftDotted 
ucfPrSTerm 
ucfPrTerminalPunctuation 
ucfPrVariationSelector 
ucfPrWhiteSpace 

Definition at line 780 of file unicode.h.

781 {
782  // The flags from PropList.txt.
783  // [The comments are from UCD.html.]
784  // - ASCII characters commonly used for the representation of hexadecimal numbers.
785  // [= 0123456789abcdefABCDEF]
786  ucfPrAsciiHexDigit = 1,
787  // - Those format control characters which have specific functions in the Bidirectional Algorithm.
788  ucfPrBidiControl = 2,
789  // - Those punctuation characters explicitly called out as dashes in the Unicode Standard,
790  // plus compatibility equivalents to those. Most of these have the Pd General Category,
791  // but some have the Sm General Category because of their use in mathematics.
792  // U+0002d HYPHEN-MINUS
793  // U+0058a ARMENIAN HYPHEN
794  // U+005be HEBREW PUNCTUATION MAQAF
795  // U+01806 MONGOLIAN TODO SOFT HYPHEN
796  // U+02010 HYPHEN
797  // U+02011 NON-BREAKING HYPHEN
798  // U+02012 FIGURE DASH
799  // U+02013 EN DASH
800  // U+02014 EM DASH
801  // U+02015 HORIZONTAL BAR
802  // U+02053 SWUNG DASH
803  // U+0207b SUPERSCRIPT MINUS
804  // U+0208b SUBSCRIPT MINUS
805  // U+02212 MINUS SIGN
806  // U+02e17 DOUBLE OBLIQUE HYPHEN
807  // U+0301c WAVE DASH
808  // U+03030 WAVY DASH
809  // U+030a0 KATAKANA-HIRAGANA DOUBLE HYPHEN
810  // U+0fe31 PRESENTATION FORM FOR VERTICAL EM DASH
811  // U+0fe32 PRESENTATION FORM FOR VERTICAL EN DASH
812  // U+0fe58 SMALL EM DASH
813  // U+0fe63 SMALL HYPHEN-MINUS
814  // U+0ff0d FULLWIDTH HYPHEN-MINUS
815  ucfPrDash = 4,
816  // - For a machine-readable list of deprecated characters. No characters will ever be removed
817  // from the standard, but the usage of deprecated characters is strongly discouraged.
818  ucfPrDeprecated = 8,
819  // - Characters that linguistically modify the meaning of another character to which they apply.
820  // Some diacritics are not combining characters, and some combining characters are not diacritics.
821  ucfPrDiacritic = 0x10,
822  // - Characters whose principal function is to extend the value or shape of a preceding alphabetic
823  // character. Typical of these are length and iteration marks.
824  ucfPrExtender = 0x20,
825  // - Used in determining default grapheme cluster boundaries. For more information, see UAX #29: Text Boundaries.
826  ucfPrGraphemeLink = 0x40,
827  // - Characters commonly used for the representation of hexadecimal numbers, plus their compatibility equivalents.
828  // [= AsciiHexDigit + fullwidth digit {0..9} + fullwidth latin {small|capital} letter {a..f}]
829  ucfPrHexDigit = 0x80,
830  // - Those dashes used to mark connections between pieces of words, plus the Katakana middle dot.
831  // The Katakana middle dot functions like a hyphen, but is shaped like a dot rather than a dash.
832  // U+0002d HYPHEN-MINUS
833  // U+000ad SOFT HYPHEN
834  // U+0058a ARMENIAN HYPHEN
835  // U+01806 MONGOLIAN TODO SOFT HYPHEN
836  // U+02010 HYPHEN
837  // U+02011 NON-BREAKING HYPHEN
838  // U+02e17 DOUBLE OBLIQUE HYPHEN
839  // U+030fb KATAKANA MIDDLE DOT
840  // U+0fe63 SMALL HYPHEN-MINUS
841  // U+0ff0d FULLWIDTH HYPHEN-MINUS
842  // U+0ff65 HALFWIDTH KATAKANA MIDDLE DOT
843  ucfPrHyphen = 0x100,
844  // - Characters considered to be CJKV (Chinese, Japanese, Korean, and Vietnamese) ideographs.
845  ucfPrIdeographic = 0x200,
846  // - Those format control characters which have specific functions for control of cursive joining and ligation.
847  ucfPrJoinControl = 0x400,
848  // - There are a small number of characters that do not use logical order.
849  // These characters require special handling in most processing.
850  ucfPrLogicalOrderException = 0x800,
851  // - Code points that are permanently reserved for internal use.
852  ucfPrNoncharacterCodePoint = 0x1000,
853  // - Used for pattern syntax as described in UAX #31: Identifier and Pattern Syntax.
854  ucfPrPatternSyntax = 0x2000,
855  ucfPrPatternWhiteSpace = 0x4000,
856  // - Those punctuation characters that function as quotation marks.
857  // U+00022 QUOTATION MARK
858  // U+00027 APOSTROPHE
859  // U+000ab LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
860  // U+000bb RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
861  // U+02018 LEFT SINGLE QUOTATION MARK
862  // U+02019 RIGHT SINGLE QUOTATION MARK
863  // U+0201a SINGLE LOW-9 QUOTATION MARK
864  // U+0201b SINGLE HIGH-REVERSED-9 QUOTATION MARK
865  // U+0201c LEFT DOUBLE QUOTATION MARK
866  // U+0201d RIGHT DOUBLE QUOTATION MARK
867  // U+0201e DOUBLE LOW-9 QUOTATION MARK
868  // U+0201f DOUBLE HIGH-REVERSED-9 QUOTATION MARK
869  // U+02039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
870  // U+0203a SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
871  // U+0300c LEFT CORNER BRACKET
872  // U+0300d RIGHT CORNER BRACKET
873  // U+0300e LEFT WHITE CORNER BRACKET
874  // U+0300f RIGHT WHITE CORNER BRACKET
875  // U+0301d REVERSED DOUBLE PRIME QUOTATION MARK
876  // U+0301e DOUBLE PRIME QUOTATION MARK
877  // U+0301f LOW DOUBLE PRIME QUOTATION MARK
878  // U+0fe41 PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
879  // U+0fe42 PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
880  // U+0fe43 PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
881  // U+0fe44 PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
882  // U+0ff02 FULLWIDTH QUOTATION MARK
883  // U+0ff07 FULLWIDTH APOSTROPHE
884  // U+0ff62 HALFWIDTH LEFT CORNER BRACKET
885  // U+0ff63 HALFWIDTH RIGHT CORNER BRACKET
886  ucfPrQuotationMark = 0x8000,
887  // - Characters with a "soft dot", like i or j. An accent placed on these characters causes the dot to disappear.
888  // An explicit _dot above_ can be added where required, such as in Lithuanian.
889  ucfPrSoftDotted = 0x10000,
890  // - Sentence Terminal. Used in UAX #29: Text Boundaries.
891  // U+00021 EXCLAMATION MARK
892  // U+0002e FULL STOP
893  // U+0003f QUESTION MARK
894  // U+0203c DOUBLE EXCLAMATION MARK
895  // U+0203d INTERROBANG
896  // U+02047 DOUBLE QUESTION MARK
897  // U+02048 QUESTION EXCLAMATION MARK
898  // U+02049 EXCLAMATION QUESTION MARK
899  // U+03002 IDEOGRAPHIC FULL STOP
900  // [plus many characters from other writing systems]
901  ucfPrSTerm = 0x20000,
902  // - Those punctuation characters that generally mark the end of textual units.
903  // [JB note: this set contains more character than STerm. For example, it contains
904  // the comma, colon and semicolon, whereas STerm doesn't.]
905  // U+00021 EXCLAMATION MARK
906  // U+0002c COMMA
907  // U+0002e FULL STOP
908  // U+0003a COLON
909  // U+0003b SEMICOLON
910  // U+0003f QUESTION MARK
911  // U+0203c DOUBLE EXCLAMATION MARK
912  // U+0203d INTERROBANG
913  // U+02047 DOUBLE QUESTION MARK
914  // U+02048 QUESTION EXCLAMATION MARK
915  // U+02049 EXCLAMATION QUESTION MARK
916  // [plus *lots* of charcters from other writing systems]
917  ucfPrTerminalPunctuation = 0x40000,
918  // - Indicates all those characters that qualify as Variation Selectors.
919  // For details on the behavior of these characters, see StandardizedVariants.html and
920  // Section 16.4, Variation Selectors in [Unicode].
921  ucfPrVariationSelector = 0x80000,
922  // - Those separator characters and control characters which should be treated by
923  // programming languages as "white space" for the purpose of parsing elements.
924  // Note: ZERO WIDTH SPACE and ZERO WIDTH NO-BREAK SPACE are not included,
925  // since their functions are restricted to line-break control.
926  // Their names are unfortunately misleading in this respect.
927  // Note: There are other senses of "whitespace" that encompass a different set of characters.
928  // [JB note: e.g. there's a BIDI class for whitespace ('WS') in UnicodeData.txt.
929  // There's also a "Sp" class in the sentence boundary algorithm, see UAX #29, sec. 5.1.]
930  // This includes the following characters:
931  // U+0009 <control>
932  // U+000a <control>
933  // U+000b <control>
934  // U+000c <control>
935  // U+000d <control>
936  // U+0020 SPACE
937  // U+0085 <control>
938  // U+00a0 NO-BREAK SPACE
939  // U+1680 OGHAM SPACE MARK
940  // U+180e MONGOLIAN VOWEL SEPARATOR
941  // U+2000 EN QUAD
942  // U+2001 EM QUAD
943  // U+2002 EN SPACE
944  // U+2003 EM SPACE
945  // U+2004 THREE-PER-EM SPACE
946  // U+2005 FOUR-PER-EM SPACE
947  // U+2006 SIX-PER-EM SPACE
948  // U+2007 FIGURE SPACE
949  // U+2008 PUNCTUATION SPACE
950  // U+2009 THIN SPACE
951  // U+200a HAIR SPACE
952  // U+2028 LINE SEPARATOR
953  // U+2029 PARAGRAPH SEPARATOR
954  // U+202f NARROW NO-BREAK SPACE
955  // U+205f MEDIUM MATHEMATICAL SPACE
956  // U+3000 IDEOGRAPHIC SPACE
957  ucfPrWhiteSpace = 0x100000
958 }
enum TUniChPropertiesX_

Enumerator
ucfPxOtherAlphabetic 
ucfPxOtherDefaultIgnorableCodePoint 
ucfPxOtherGraphemeExtend 
ucfPxOtherIdContinue 
ucfPxOtherIdStart 
ucfPxOtherLowercase 
ucfPxOtherMath 
ucfPxOtherUppercase 
ucfPxIdsBinaryOperator 
ucfPxIdsTrinaryOperator 
ucfPxRadical 
ucfPxUnifiedIdeograph 

Definition at line 961 of file unicode.h.

962 {
963  // More properties from PropList.txt.
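964  // - Used to derive the properties in DerivedCoreProperties.txt.
965  ucfPxOtherAlphabetic = 1,
966  ucfPxOtherDefaultIgnorableCodePoint = 2,
967  ucfPxOtherGraphemeExtend = 4,
968  ucfPxOtherIdContinue = 8,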
969  ucfPxOtherIdStart = 0x10,
970  ucfPxOtherLowercase = 0x20,
971  ucfPxOtherMath = 0x40,
972  ucfPxOtherUppercase = 0x80,
973  // - Used in ideographic description sequences.
974  ucfPxIdsBinaryOperator = 0x100,
975  ucfPxIdsTrinaryOperator = 0x200,
976  ucfPxRadical = 0x400,
977  ucfPxUnifiedIdeograph = 0x800
978 }
enum TUniChSubCategory_

Enumerator
ucLetterUppercase 
ucLetterLowercase 
ucLetterTitlecase 
ucLetterModifier 
ucLetterOther 
ucMarkNonspacing 
ucMarkSpacingCombining 
ucMarkEnclosing 
ucNumberDecimalDigit 
ucNumberLetter 
ucNumberOther 
ucPunctuationConnector 
ucPunctuationDash 
ucPunctuationOpen 
ucPunctuationClose 
ucPunctuationInitialQuote 
ucPunctuationFinalQuote 
ucPunctuationOther 
ucSymbolMath 
ucSymbolCurrency 
ucSymbolModifier 
ucSymbolOther 
ucSeparatorSpace 
ucSeparatorLine 
ucSeparatorParagraph 
ucOtherControl 
ucOtherFormat 
ucOtherSurrogate 
ucOtherPrivateUse 
ucOtherNotAssigned 

Definition at line 676 of file unicode.h.

677 {
678 #define DefineUniSubCat(cat, subCat, c) uc ## cat ## subCat = ((uc ## cat) << 8) | (int(uchar(c)) & 0xff)
679  DefineUniSubCat(Letter, Uppercase, 'u'), // ucLetterUppercase
680  DefineUniSubCat(Letter, Lowercase, 'l'),
681  DefineUniSubCat(Letter, Titlecase, 't'),
682  DefineUniSubCat(Letter, Modifier, 'm'),
683  DefineUniSubCat(Letter, Other, 'o'),
684  DefineUniSubCat(Mark, Nonspacing, 'n'),
685  DefineUniSubCat(Mark, SpacingCombining, 'c'),
686  DefineUniSubCat(Mark, Enclosing, 'e'),
687  DefineUniSubCat(Number, DecimalDigit, 'd'),
688  DefineUniSubCat(Number, Letter, 'l'),
689  DefineUniSubCat(Number, Other, 'o'),
690  DefineUniSubCat(Punctuation, Connector, 'c'),
691  DefineUniSubCat(Punctuation, Dash, 'd'),
692  DefineUniSubCat(Punctuation, Open, 's'),
693  DefineUniSubCat(Punctuation, Close, 'e'),
694  DefineUniSubCat(Punctuation, InitialQuote, 'i'),
695  DefineUniSubCat(Punctuation, FinalQuote, 'f'),
696  DefineUniSubCat(Punctuation, Other, 'o'),
697  DefineUniSubCat(Symbol, Math, 'm'),
698  DefineUniSubCat(Symbol, Currency, 'c'),
699  DefineUniSubCat(Symbol, Modifier, 'k'),
700  DefineUniSubCat(Symbol, Other, 'o'),
701  DefineUniSubCat(Separator, Space, 's'),
702  DefineUniSubCat(Separator, Line, 'l'),
703  DefineUniSubCat(Separator, Paragraph, 'p'),
704  DefineUniSubCat(Other, Control, 'c'),
705  DefineUniSubCat(Other, Format, 'f'),
706  DefineUniSubCat(Other, Surrogate, 's'),
707  DefineUniSubCat(Other, PrivateUse, 'o'),
708  DefineUniSubCat(Other, NotAssigned, 'n')
709 }
#define DefineUniSubCat(cat, subCat, c)
Definition: unicode.h:678
enum TUnicodeErrorHandling_

Enumerator
uehIgnore 
uehThrow 
uehReplace 
uehAbort 

Definition at line 18 of file unicode.h.

19 {
20  // What happens when an error occurs:
21  uehIgnore = 0, // - it is silently ignored (nothing is added to the output vector)
22  uehThrow = 1, // - an exception is thrown (TUnicodeException)
23  uehReplace = 2, // - the replacement character is added to the output vector
24  uehAbort = 3 // - the encoding/decoding process stops immediately
25 }
enum TUtf16BomHandling_

Enumerator
bomAllowed 
bomRequired 
bomIgnored 

Definition at line 46 of file unicode.h.

47 {
48  bomAllowed = 0, // if a BOM is present, it is used to determine the byte order; otherwise, the default byte order is used
49  bomRequired = 1, // if a BOM is present, it is used to determine the byte order; otherwise, an error is reported
50  bomIgnored = 2 // the default byte order is used; if a BOM is present, it is treated like any other character
51 }

Function Documentation

bool AlwaysFalse ( )
inline

Definition at line 3227 of file unicode.h.

Referenced by TUniChDb::InitScripts(), and TUniChDb::TestFindNextWordOrSentenceBoundary().

3228 {
3229  int sum = 0;
3230  for (int i = 0; i < 5; i++) sum += i;
3231  return sum > 100;
3232 }

Here is the caller graph for this function:

bool AlwaysTrue ( )
inline

Definition at line 3234 of file unicode.h.

3235 {
3236  int sum = 0;
3237  for (int i = 0; i < 5; i++) sum += i;
3238  return sum < 100;
3239 }