SNAP Library 4.0, Developer Reference  2017-07-27 13:18:06
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
unicodestring.cpp
Go to the documentation of this file.
1 // Unicode-Definition
3 
5 
7  return TSysProc::GetExeFNm().GetFPath()+"UnicodeDef.Bin";
8 }
9 
11 // Unicode-String
12 TUStr::TUStr(const TStr& Str){
15  TIntV NfcUniChV; TUnicodeDef::GetDef()->Decompose(UniChV, NfcUniChV, true);
16  UniChV=NfcUniChV;
17 }
18 
21 }
22 
25 }
26 
28  TIntV StarterUniChV;
29  TUnicodeDef::GetDef()->ExtractStarters(UniChV, StarterUniChV);
30  TUnicodeDef::GetDef()->Decompose(StarterUniChV, UniChV, true);
31 }
32 
33 void TUStr::GetWordBoundPV(TBoolV& WordBoundPV){
35 }
36 
37 void TUStr::GetWordUStrV(TUStrV& WordUStrV){
38  // clear word vector
39  WordUStrV.Clr();
40  // create boundaries
41  TBoolV WordBoundPV; GetWordBoundPV(WordBoundPV);
42  IAssert(Len()==WordBoundPV.Len()-1);
43  IAssert((WordBoundPV.Len()>0)&&(WordBoundPV.Last()));
44  // traverse characters and bounds
45  int UniChs=Len(); TIntV WordUniChV;
46  for (int UniChN=0; UniChN<=UniChs; UniChN++){
47  if ((UniChN==UniChs)||(WordBoundPV[UniChN+1])){ // finish or word-boundary
48  if (UniChN<UniChs){ // if not finish
49  // if last-word-char or single-alphabetic-char
50  if ((!WordUniChV.Empty())||(IsAlphabetic(UniChV[UniChN]))){
51  WordUniChV.Add(UniChV[UniChN]); // add char
52  }
53  }
54  if (!WordUniChV.Empty()){ // add current word to vector
55  TUStr WordUStr(WordUniChV); // construct word from char-vector
56  WordUStrV.Add(WordUStr); // add word to word-vector
57  WordUniChV.Clr(false); // clear char-vector
58  }
59  } else {
60  // add character to char-vector
61  WordUniChV.Add(UniChV[UniChN]);
62  }
63  }
64 }
65 
68  return Str;
69 }
70 
72  TIntV UniChV1; TIntV UniChV2;
74  TUnicodeDef::GetDef()->Decompose(UniChV1, UniChV2, true);
75  TStr Str=TUnicodeDef::GetDef()->EncodeUtf8Str(UniChV2);
76  return Str;
77 }
78 
80  TIntV UniChV1; TIntV UniChV2; TIntV UniChV3;
82  TUnicodeDef::GetDef()->ExtractStarters(UniChV1, UniChV2);
83  TUnicodeDef::GetDef()->Decompose(UniChV2, UniChV3, true);
84  TStr Str=TUnicodeDef::GetDef()->EncodeUtf8Str(UniChV3);
85  return Str;
86 }
87 
88 int TUStr::GetScriptId(const TStr& ScriptNm){
89  return TUnicodeDef::GetDef()->ucd.GetScriptByName(ScriptNm);
90 }
91 
92 TStr TUStr::GetScriptNm(const int& ScriptId){
93  return TUnicodeDef::GetDef()->ucd.GetScriptName(ScriptId);
94 }
95 
96 int TUStr::GetChScriptId(const int& UniCh){
97  return TUnicodeDef::GetDef()->ucd.GetScript(UniCh);
98 }
99 
100 TStr TUStr::GetChScriptNm(const int& UniCh){
101  return GetScriptNm(GetChScriptId(UniCh));
102 }
103 
104 TStr TUStr::GetChNm(const int& UniCh){
105  TStr UniChNm(TUnicodeDef::GetDef()->ucd.GetCharNameS(UniCh));
106  return UniChNm;
107 }
108 
109 TStr TUStr::GetChTypeStr(const int& UniCh){
110  TChA ChTypeChA;
111  ChTypeChA+='[';
112  if (IsCase(UniCh)){ChTypeChA+="Case,";}
113  if (IsUpperCase(UniCh)){ChTypeChA+="UpperCase,";}
114  if (IsLowerCase(UniCh)){ChTypeChA+="LowerCase,";}
115  if (IsAlphabetic(UniCh)){ChTypeChA+="Alphabetic,";}
116  if (IsMath(UniCh)){ChTypeChA+="Math,";}
117  if (ChTypeChA.LastCh()=='['){ChTypeChA+=']';}
118  else {ChTypeChA[ChTypeChA.Len()-1]=']';}
119  return ChTypeChA;
120 }
121 
122 bool TUStr::IsCase(const int& UniCh){
123  TUniChInfo ChInfo;
124  if (TUnicodeDef::GetDef()->ucd.IsGetChInfo(UniCh, ChInfo)){
125  return ChInfo.IsCased();}
126  else {return false;}
127 }
128 
129 bool TUStr::IsUpperCase(const int& UniCh){
130  TUniChInfo ChInfo;
131  if (TUnicodeDef::GetDef()->ucd.IsGetChInfo(UniCh, ChInfo)){
132  return ChInfo.IsUppercase();}
133  else {return false;}
134 }
135 
136 bool TUStr::IsLowerCase(const int& UniCh){
137  TUniChInfo ChInfo;
138  if (TUnicodeDef::GetDef()->ucd.IsGetChInfo(UniCh, ChInfo)){
139  return ChInfo.IsLowercase();}
140  else {return false;}
141 }
142 
143 bool TUStr::IsAlphabetic(const int& UniCh){
144  TUniChInfo ChInfo;
145  if (TUnicodeDef::GetDef()->ucd.IsGetChInfo(UniCh, ChInfo)){
146  return ChInfo.IsAlphabetic();}
147  else {return false;}
148 }
149 
150 bool TUStr::IsMath(const int& UniCh){
151  TUniChInfo ChInfo;
152  if (TUnicodeDef::GetDef()->ucd.IsGetChInfo(UniCh, ChInfo)){
153  return ChInfo.IsMath();}
154  else {return false;}
155 }
156 
157 TStr TUStr::EncodeUtf8(const int& UniCh) {
160 }
161 
#define IAssert(Cond)
Definition: bd.h:262
static bool IsAlphabetic(const int &UniCh)
void ToSimpleUpperCase(TIntV &src) const
Definition: unicode.h:1977
void ToStarterCase()
int GetScriptByName(const TStr &scriptName) const
Definition: unicode.h:1322
TStr EncodeUtf8Str(const TIntV &src) const
Definition: unicode.h:1796
static TUnicodeDef UnicodeDef
Definition: unicodestring.h:5
TStr GetFPath() const
Definition: dt.cpp:1389
static TStr GetChNm(const int &UniCh)
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
bool IsAlphabetic() const
Definition: unicode.h:1071
void ToUpperCase()
const TStr & GetScriptName(const int scriptId) const
Definition: unicode.h:1321
int Len() const
Definition: dt.h:259
int GetScript(const TUniChInfo &ci) const
Definition: unicode.h:1323
TUniChDb ucd
Definition: unicode.h:1775
static TUnicode * GetDef()
Definition: unicodestring.h:23
void ToSimpleLowerCase(TIntV &src) const
Definition: unicode.h:1978
void GetSimpleLowerCase(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1972
TStr GetStarterLowerCaseStr() const
static TStr GetChScriptNm(const int &UniCh)
static TStr GetDfFNm()
bool IsUppercase() const
Definition: unicode.h:1072
static TStr GetChTypeStr(const int &UniCh)
bool Empty() const
Tests whether the vector is empty.
Definition: ds.h:570
void FindWordBoundaries(const TIntV &src, TBoolV &dest) const
Definition: unicode.h:1907
bool IsLowercase() const
Definition: unicode.h:1073
static bool IsLowerCase(const int &UniCh)
TStr GetCharNameS(const int cp) const
Definition: unicode.h:2025
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:1022
static bool IsMath(const int &UniCh)
char LastCh() const
Definition: dt.h:281
int ExtractStarters(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1951
static int GetChScriptId(const int &UniCh)
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:579
static bool IsUpperCase(const int &UniCh)
static int GetScriptId(const TStr &ScriptNm)
void GetWordUStrV(TUStrV &UStrV)
void GetWordBoundPV(TBoolV &WordBoundPV)
int DecodeUtf8(const TIntV &src, TIntV &dest) const
Definition: unicode.h:1787
Definition: dt.h:201
Definition: dt.h:412
TIntV UniChV
Definition: unicodestring.h:34
static TStr GetScriptNm(const int &ScriptId)
bool IsCased() const
Definition: unicode.h:1142
void Decompose(const TIntV &src, TIntV &dest, bool compatibility) const
Definition: unicode.h:1934
void ToLowerCase()
static TVec< TInt, TSizeTy > GetV(const TInt &Val1)
Returns a vector containing the single element Val1.
Definition: ds.h:848
TStr GetStarterStr() const
static TStr EncodeUtf8(const int &UniCh)
static void AssertUnicodeDefOk()
Definition: unicodestring.h:35
bool IsMath() const
Definition: unicode.h:1074
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602
int Len() const
Definition: unicodestring.h:57
TStr GetStr() const
static bool IsCase(const int &UniCh)