SNAP Library, User Reference  2012-10-15 15:06:59
SNAP, a general purpose network analysis and graph mining library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
TUnicode Class Reference

#include <unicode.h>

List of all members.

Public Types

typedef TUniChDb::TCaseConversion TCaseConversion

Public Member Functions

 TUnicode ()
 TUnicode (const TStr &fnBinUcd)
void Init ()
int DecodeUtf8 (const TIntV &src, TIntV &dest) const
int DecodeUtf8 (const TStr &src, TIntV &dest) const
int EncodeUtf8 (const TIntV &src, TIntV &dest) const
TStr EncodeUtf8Str (const TIntV &src) const
int DecodeUtf16FromBytes (const TIntV &src, TIntV &dest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
int DecodeUtf16FromWords (const TIntV &src, TIntV &dest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
int EncodeUtf16ToWords (const TIntV &src, TIntV &dest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
int EncodeUtf16ToBytes (const TIntV &src, TIntV &dest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
void RegisterCodec (const TStr &nameList, const PCodecBase &codec)
void UnregisterCodec (const TStr &nameList)
void ClrCodecs ()
void InitCodecs ()
PCodecBase GetCodec (const TStr &name) const
void GetAllCodecs (TCodecBaseV &dest) const
bool FindNextWordBoundary (const TIntV &src, int &position) const
void FindWordBoundaries (const TIntV &src, TBoolV &dest) const
bool FindNextSentenceBoundary (const TIntV &src, int &position) const
void FindSentenceBoundaries (const TIntV &src, TBoolV &dest) const
void ClrSentenceBoundaryExceptions ()
void UseEnglishSentenceBoundaryExceptions ()
void Decompose (const TIntV &src, TIntV &dest, bool compatibility) const
void Compose (const TIntV &src, TIntV &dest) const
void DecomposeAndCompose (const TIntV &src, TIntV &dest, bool compatibility) const
int ExtractStarters (const TIntV &src, TIntV &dest) const
int ExtractStarters (TIntV &src) const
void GetLowerCase (const TIntV &src, TIntV &dest) const
void GetUpperCase (const TIntV &src, TIntV &dest) const
void GetTitleCase (const TIntV &src, TIntV &dest) const
void GetSimpleLowerCase (const TIntV &src, TIntV &dest) const
void GetSimpleUpperCase (const TIntV &src, TIntV &dest) const
void GetSimpleTitleCase (const TIntV &src, TIntV &dest) const
void ToSimpleUpperCase (TIntV &src) const
void ToSimpleLowerCase (TIntV &src) const
void ToSimpleTitleCase (TIntV &src) const
void GetCaseFolded (const TIntV &src, TIntV &dest, const bool full=true) const
void ToCaseFolded (TIntV &src) const
TStr GetUtf8CaseFolded (const TStr &s) const
DECLARE_FORWARDED_PROPERTY_METHODS ___UniFwd2 (IsPrivateUse, IsSurrogate) TUniChCategory GetCat(const int cp) const
TUniChSubCategory GetSubCat (const int cp) const
const char * GetCharName (const int cp) const
TStr GetCharNameS (const int cp) const

Public Attributes

TUniCodec codec
TUniChDb ucd
T8BitCodec< TEncoding_ISO8859_1 > iso8859_1
T8BitCodec< TEncoding_ISO8859_2 > iso8859_2
T8BitCodec< TEncoding_ISO8859_3 > iso8859_3
T8BitCodec< TEncoding_ISO8859_4 > iso8859_4
T8BitCodec< TEncoding_YuAscii > yuAscii
T8BitCodec< TEncoding_CP1250 > cp1250
T8BitCodec< TEncoding_CP852 > cp852
T8BitCodec< TEncoding_CP437 > cp437

Static Protected Member Functions

static TStr NormalizeCodecName (const TStr &name)

Protected Attributes

THash< TStr, PCodecBase > codecs

Detailed Description

Definition at line 1768 of file unicode.h.


Member Typedef Documentation


Constructor & Destructor Documentation

TUnicode::TUnicode ( ) [inline]

Definition at line 1774 of file unicode.h.

{ Init(); }
TUnicode::TUnicode ( const TStr & fnBinUcd) [inline, explicit]

Definition at line 1775 of file unicode.h.

{ ucd.LoadBin(fnBinUcd); Init(); }

Member Function Documentation

DECLARE_FORWARDED_PROPERTY_METHODS TUnicode::___UniFwd2 ( IsPrivateUse  ,
IsSurrogate   
) const [inline]

Definition at line 2011 of file unicode.h.

                                                  { return ucd.GetCat(cp); }
void TUnicode::ClrCodecs ( ) [inline]

Definition at line 1874 of file unicode.h.

{ codecs.Clr(); }

void TUnicode::ClrSentenceBoundaryExceptions ( ) [inline]

Definition at line 1917 of file unicode.h.

{ ucd.SbEx_Clr(); }
void TUnicode::Compose ( const TIntV & src,
TIntV & dest 
) const [inline]

Definition at line 1934 of file unicode.h.

{ return ucd.Compose(src, dest, true); }
int TUnicode::DecodeUtf16FromBytes ( const TIntV & src,
TIntV & dest,
const TUtf16BomHandling  bomHandling = bomAllowed,
const TUniByteOrder  defaultByteOrder = boMachineEndian 
) const [inline]

Definition at line 1803 of file unicode.h.

                                                                              {
                        return (int) codec.DecodeUtf16FromBytes(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }
int TUnicode::DecodeUtf16FromWords ( const TIntV & src,
TIntV & dest,
const TUtf16BomHandling  bomHandling = bomAllowed,
const TUniByteOrder  defaultByteOrder = boMachineEndian 
) const [inline]

Definition at line 1816 of file unicode.h.

                                                                              {
                        return (int) codec.DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder); }
int TUnicode::DecodeUtf8 ( const TIntV & src,
TIntV & dest 
) const [inline]

Definition at line 1784 of file unicode.h.

{ return (int) codec.DecodeUtf8(src, dest); }
int TUnicode::DecodeUtf8 ( const TStr & src,
TIntV & dest 
) const [inline]

Definition at line 1785 of file unicode.h.

{ return (int) codec.DecodeUtf8(src, dest); }
void TUnicode::Decompose ( const TIntV & src,
TIntV & dest,
bool  compatibility 
) const [inline]

Definition at line 1927 of file unicode.h.

{ ucd.Decompose(src, dest, compatibility, true); }
void TUnicode::DecomposeAndCompose ( const TIntV & src,
TIntV & dest,
bool  compatibility 
) const [inline]

Definition at line 1939 of file unicode.h.

{ return ucd.DecomposeAndCompose(src, dest, compatibility); }
int TUnicode::EncodeUtf16ToBytes ( const TIntV & src,
TIntV & dest,
const bool  insertBom,
const TUniByteOrder  destByteOrder = boMachineEndian 
) const [inline]

Definition at line 1831 of file unicode.h.

                                                                           {
                        return (int) codec.EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }
int TUnicode::EncodeUtf16ToWords ( const TIntV & src,
TIntV & dest,
const bool  insertBom,
const TUniByteOrder  destByteOrder = boMachineEndian 
) const [inline]

Definition at line 1827 of file unicode.h.

                                                                           {
                        return (int) codec.EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, destByteOrder); }
int TUnicode::EncodeUtf8 ( const TIntV & src,
TIntV & dest 
) const [inline]

Definition at line 1789 of file unicode.h.

{ return (int) codec.EncodeUtf8(src, dest); }
TStr TUnicode::EncodeUtf8Str ( const TIntV & src) const [inline]

Definition at line 1793 of file unicode.h.

{ return codec.EncodeUtf8Str(src); }
int TUnicode::ExtractStarters ( const TIntV & src,
TIntV & dest 
) const [inline]

Definition at line 1944 of file unicode.h.

{ return (int) ucd.ExtractStarters(src, dest); }
int TUnicode::ExtractStarters ( TIntV & src) const [inline]

Definition at line 1946 of file unicode.h.

{ return (int) ucd.ExtractStarters(src); }
bool TUnicode::FindNextSentenceBoundary ( const TIntV & src,
int &  position 
) const [inline]

Definition at line 1909 of file unicode.h.

                                                                             {
                if (position < 0) { position = 0; return true; }
                size_t position_; bool retVal = ucd.FindNextSentenceBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; }
bool TUnicode::FindNextWordBoundary ( const TIntV & src,
int &  position 
) const [inline]

Definition at line 1894 of file unicode.h.

                                                                         {
                if (position < 0) { position = 0; return true; }
                size_t position_; bool retVal = ucd.FindNextWordBoundary(src, 0, src.Len(), position_); position = int(position_); return retVal; }
void TUnicode::FindSentenceBoundaries ( const TIntV & src,
TBoolV & dest 
) const [inline]

Definition at line 1915 of file unicode.h.

{ ucd.FindSentenceBoundaries(src, 0, src.Len(), dest); }
void TUnicode::FindWordBoundaries ( const TIntV & src,
TBoolV & dest 
) const [inline]

Definition at line 1900 of file unicode.h.

{ ucd.FindWordBoundaries(src, 0, src.Len(), dest); }
void TUnicode::GetAllCodecs ( TCodecBaseV & dest) const [inline]

Definition at line 1880 of file unicode.h.

                                                   {
                dest.Clr();
                for (int i = codecs.FFirstKeyId(); codecs.FNextKeyId(i); ) {
                        PCodecBase codec = codecs[i]; bool found = false;
                        for (int j = 0; j < dest.Len(); j++) if (dest[j]() == codec()) { found = true; break; }
                        if (! found) dest.Add(codec); }}
void TUnicode::GetCaseFolded ( const TIntV & src,
TIntV & dest,
const bool  full = true 
) const [inline]

Definition at line 1982 of file unicode.h.

{ return ucd.GetCaseFolded(src, dest, true, full, false); }
const char* TUnicode::GetCharName ( const int  cp) const [inline]

Definition at line 2017 of file unicode.h.

{ return ucd.GetCharName(cp); }
TStr TUnicode::GetCharNameS ( const int  cp) const [inline]

Definition at line 2018 of file unicode.h.

{ return ucd.GetCharNameS(cp); }
PCodecBase TUnicode::GetCodec ( const TStr & name) const [inline]

Definition at line 1876 of file unicode.h.

                                                    {
                TStr s = NormalizeCodecName(name);
                PCodecBase p; if (! codecs.IsKeyGetDat(s, p)) p.Clr();
                return p; }
void TUnicode::GetLowerCase ( const TIntV & src,
TIntV & dest 
) const [inline]

Definition at line 1958 of file unicode.h.

{ ucd.GetLowerCase(src, dest, true, false, false); }
void TUnicode::GetSimpleLowerCase ( const TIntV & src,
TIntV & dest 
) const [inline]

Definition at line 1965 of file unicode.h.

{ ucd.GetSimpleLowerCase(src, dest, true); }
void TUnicode::GetSimpleTitleCase ( const TIntV & src,
TIntV & dest 
) const [inline]

Definition at line 1967 of file unicode.h.

{ ucd.GetSimpleTitleCase(src, dest, true); }
void TUnicode::GetSimpleUpperCase ( const TIntV & src,
TIntV & dest 
) const [inline]

Definition at line 1966 of file unicode.h.

{ ucd.GetSimpleUpperCase(src, dest, true); }
TUniChSubCategory TUnicode::GetSubCat ( const int  cp) const [inline]

Definition at line 2014 of file unicode.h.

{ return ucd.GetSubCat(cp); }
void TUnicode::GetTitleCase ( const TIntV & src,
TIntV & dest 
) const [inline]

Definition at line 1960 of file unicode.h.

{ ucd.GetTitleCase(src, dest, true, false, false); }
void TUnicode::GetUpperCase ( const TIntV & src,
TIntV & dest 
) const [inline]

Definition at line 1959 of file unicode.h.

{ ucd.GetUpperCase(src, dest, true, false, false); }
TStr TUnicode::GetUtf8CaseFolded ( const TStr & s) const [inline]

Definition at line 1987 of file unicode.h.

                                                    {
                bool isAscii = true;
                for (int i = 0, n = s.Len(); i < n; i++) if (uchar(s[i]) >= 128) { isAscii = false; break; }
                if (isAscii) return s.GetLc();
                TIntV src; DecodeUtf8(s, src);
                TIntV dest; GetCaseFolded(src, dest);
                return EncodeUtf8Str(dest); }
void TUnicode::Init ( ) [inline]

Definition at line 1776 of file unicode.h.

{ InitCodecs(); }

void TUnicode::InitCodecs ( )

Definition at line 1687 of file unicode.cpp.

{
        ClrCodecs();
        RegisterCodec("ISO-8859-1 ISO_8859-1 ISO_8859-1:1987 ISO-IR-100 CP819 IBM819 LATIN1 L1 csISOLatin1 ISO8859-1 ISO8859_1 CP28591", TCodecBase::New<TCodec_ISO8859_1>());
        RegisterCodec("ISO-8859-2 ISO_8859-2 ISO_8859-2:1987 ISO-IR-101 LATIN2 L2 csISOLatin2 ISO8859-2 ISO8859_2 CP28592", TCodecBase::New<TCodec_ISO8859_2>());
        RegisterCodec("ISO-8859-3 ISO_8859-3 ISO_8859-3:1988 ISO-IR-109 LATIN3 L3 csISOLatin3 ISO8859-3 ISO8859_3 CP28593", TCodecBase::New<TCodec_ISO8859_3>());
        RegisterCodec("ISO-8859-4 ISO_8859-4 ISO_8859-4:1988 ISO-IR-110 LATIN4 L4 csISOLatin4 ISO8859-4 ISO8859_4 CP28594", TCodecBase::New<TCodec_ISO8859_4>());
        RegisterCodec("YUASCII YU-ASCII YU_ASCII", TCodecBase::New<TCodec_YuAscii>());
        RegisterCodec("CP1250 Windows-1250 MS-EE", TCodecBase::New<TCodec_CP1250>());
        RegisterCodec("CP852 cp852_DOSLatin2 DOSLatin2", TCodecBase::New<TCodec_CP852>());
        RegisterCodec("CP437 cp437_DOSLatinUS DOSLatinUS", TCodecBase::New<TCodec_CP437>());
}
static TStr TUnicode::NormalizeCodecName ( const TStr & name) [inline, static, protected]

Definition at line 1863 of file unicode.h.

                                                                {
                TStr s = name.GetLc(); s.ChangeStrAll("_", ""); s.ChangeStrAll("-", ""); return s; }
void TUnicode::RegisterCodec ( const TStr & nameList,
const PCodecBase & codec 
) [inline]

Definition at line 1866 of file unicode.h.

                                                                          {
                TStrV names; nameList.SplitOnWs(names);
                for (int i = 0; i < names.Len(); i++)
                        codecs.AddDat(NormalizeCodecName(names[i]), codec); }
void TUnicode::ToCaseFolded ( TIntV & src) const [inline]

Definition at line 1985 of file unicode.h.

{ return ucd.ToCaseFolded(src, false); }
void TUnicode::ToSimpleLowerCase ( TIntV & src) const [inline]

Definition at line 1971 of file unicode.h.

void TUnicode::ToSimpleTitleCase ( TIntV & src) const [inline]

Definition at line 1972 of file unicode.h.

void TUnicode::ToSimpleUpperCase ( TIntV & src) const [inline]

Definition at line 1970 of file unicode.h.

void TUnicode::UnregisterCodec ( const TStr & nameList) [inline]

Definition at line 1870 of file unicode.h.

                                                   {
                TStrV names; nameList.SplitOnWs(names);
                for (int i = 0; i < names.Len(); i++)
                        codecs.DelKey(NormalizeCodecName(names[i])); }

void TUnicode::UseEnglishSentenceBoundaryExceptions ( ) [inline]

Definition at line 1918 of file unicode.h.


Member Data Documentation

TUniCodec TUnicode::codec

Definition at line 1771 of file unicode.h.

THash<TStr, PCodecBase> TUnicode::codecs [protected]

Definition at line 1862 of file unicode.h.

TUniChDb TUnicode::ucd

Definition at line 1772 of file unicode.h.


The documentation for this class was generated from the following files: