#include <unicode.h>

Public Types
enum	{ DefaultReplacementChar = 0xfffd }
Public Member Functions
	TUniCodec ()
	TUniCodec (TUnicodeErrorHandling errorHandling_, bool strict_, int replacementChar_, bool skipBom_)
template<typename TSrcVec , typename TDestCh >
size_t	DecodeUtf8 (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
template<typename TSrcVec , typename TDestCh >
size_t	DecodeUtf8 (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const
template<typename TSrcVec , typename TDestCh >
size_t	EncodeUtf8 (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest=true) const
template<typename TSrcVec , typename TDestCh >
size_t	EncodeUtf8 (const TSrcVec &src, TVec< TDestCh > &dest, const bool clrDest=true) const
template<typename TSrcVec >
TStr	EncodeUtf8Str (const TSrcVec &src, size_t srcIdx, const size_t srcCount) const
template<typename TSrcVec >
TStr	EncodeUtf8Str (const TSrcVec &src) const
template<typename TSrcVec , typename TDestCh >
size_t	DecodeUtf16FromBytes (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
template<typename TSrcVec , typename TDestCh >
size_t	DecodeUtf16FromWords (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, bool clrDest, const TUtf16BomHandling bomHandling=bomAllowed, const TUniByteOrder defaultByteOrder=boMachineEndian) const
template<typename TSrcVec , typename TDestCh >
size_t	EncodeUtf16ToWords (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
template<typename TSrcVec , typename TDestCh >
size_t	EncodeUtf16ToBytes (const TSrcVec &src, size_t srcIdx, const size_t srcCount, TVec< TDestCh > &dest, const bool clrDest, const bool insertBom, const TUniByteOrder destByteOrder=boMachineEndian) const
void	TestUtf8 ()
void	TestUtf16 ()
Public Attributes
int	replacementChar
TUnicodeErrorHandling	errorHandling
bool	strict
bool	skipBom
Protected Types
enum	{ DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0), DefineByte = (1, 0, 0, 0, 0, 0, 0, 0) }
enum	{ Utf16FirstSurrogate = 0xd800, Utf16SecondSurrogate = 0xdc00 }
typedef TUniVecIdx	TVecIdx
Protected Member Functions
void	TestUtf8 (bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV &src, const TIntV &expectedDest, FILE *f)
void	TestDecodeUtf8 (TRnd &rnd, const TStr &testCaseDesc)
void	WordsToBytes (const TIntV &src, TIntV &dest)
void	TestUtf16 (bool decode, size_t expectedRetVal, bool expectedThrow, const TIntV &src, const TIntV &expectedDest, const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom, FILE *f)
void	TestDecodeUtf16 (TRnd &rnd, const TStr &testCaseDesc, const TUtf16BomHandling bomHandling, const TUniByteOrder defaultByteOrder, const bool insertBom)
Static Protected Member Functions
static bool	IsMachineLittleEndian ()
static uint	GetRndUint (TRnd &rnd)
static uint	GetRndUint (TRnd &rnd, uint minVal, uint maxVal)
static int	SwapBytes (int x)
Friends
class	TUniCaseFolding
class	TUnicode

Detailed Description

Definition at line 54 of file unicode.h.

Member Typedef Documentation

typedef TUniVecIdx TUniCodec::TVecIdx [protected]

Definition at line 118 of file unicode.h.

Member Enumeration Documentation

anonymous enum

Enumerator:

DefaultReplacementChar

Definition at line 59 of file unicode.h.

{ DefaultReplacementChar = 0xfffd };

anonymous enum [protected]

Enumerator:

DefineByte
DefineByte
DefineByte
DefineByte
DefineByte
DefineByte
DefineByte
DefineByte
DefineByte
DefineByte
DefineByte
DefineByte
DefineByte

Definition at line 101 of file unicode.h.

             {
#define DefineByte(b7, b6, b5, b4, b3, b2, b1, b0) _ ## b7 ## b6 ## b5 ## b4 ## _ ## b3 ## b2 ## b1 ## b0 = (b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) | (b3 << 3) | (b2 << 2) | (b1 << 1) | b0
                DefineByte(1, 0, 0, 0, 0, 0, 0, 0),
                DefineByte(1, 1, 0, 0, 0, 0, 0, 0),
                DefineByte(1, 1, 1, 0, 0, 0, 0, 0),
                DefineByte(1, 1, 1, 1, 0, 0, 0, 0),
                DefineByte(1, 1, 1, 1, 1, 0, 0, 0),
                DefineByte(1, 1, 1, 1, 1, 1, 0, 0),
                DefineByte(1, 1, 1, 1, 1, 1, 1, 0),
                DefineByte(0, 0, 1, 1, 1, 1, 1, 1),
                DefineByte(0, 0, 0, 1, 1, 1, 1, 1),
                DefineByte(0, 0, 0, 0, 1, 1, 1, 1),
                DefineByte(0, 0, 0, 0, 0, 1, 1, 1),
                DefineByte(0, 0, 0, 0, 0, 0, 1, 1)
#undef DefineByte
        };

anonymous enum [protected]

Enumerator:

Utf16FirstSurrogate
Utf16SecondSurrogate

Definition at line 157 of file unicode.h.

             {
                Utf16FirstSurrogate = 0xd800,
                Utf16SecondSurrogate = 0xdc00
        };

Constructor & Destructor Documentation

TUniCodec::TUniCodec ( ) [inline]

Definition at line 91 of file unicode.h.

                    : replacementChar(DefaultReplacementChar), errorHandling(uehIgnore), strict(false), skipBom(true)
        {
        }

TUniCodec::TUniCodec	(	TUnicodeErrorHandling	errorHandling_,
		bool	strict_,
		int	replacementChar_,
		bool	skipBom_
	)		`[inline]`

Definition at line 95 of file unicode.h.

                                                                                                           :
                replacementChar(replacementChar_), errorHandling(errorHandling_), strict(strict_), skipBom(skipBom_)
        {
        }

Member Function Documentation

template<typename TSrcVec , typename TDestCh >

size_t TUniCodec::DecodeUtf16FromBytes	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		const bool	clrDest,
		const TUtf16BomHandling	bomHandling = `bomAllowed`,
		const TUniByteOrder	defaultByteOrder = `boMachineEndian`
	)		const

Definition at line 2210 of file unicode.h.

References TVec< TVal, TSizeTy >::Add(), boBigEndian, boLittleEndian, boMachineEndian, bomAllowed, bomIgnored, bomRequired, TVec< TVal, TSizeTy >::Clr(), errorHandling, Fail, TInt::GetStr(), IAssert, IsMachineLittleEndian(), replacementChar, skipBom, strict, uehAbort, uehIgnore, uehReplace, uehThrow, Utf16FirstSurrogate, and Utf16SecondSurrogate.

Referenced by TUnicode::DecodeUtf16FromBytes(), and TestUtf16().

{
        IAssert(srcCount % 2 == 0);
        IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);
        IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);
        if (clrDest) dest.Clr();
        size_t nDecoded = 0;
        if (srcCount <= 0) return nDecoded;
        const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
        bool littleEndian = false;
        bool leDefault = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && IsMachineLittleEndian()));
        if (bomHandling == bomIgnored) littleEndian = leDefault;
        else if (bomHandling == bomAllowed || bomHandling == bomRequired)
        {
                int byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff;
                if (byte1 == 0xfe && byte2 == 0xff) { littleEndian = false; if (skipBom) srcIdx += 2; }
                else if (byte1 == 0xff && byte2 == 0xfe) { littleEndian = true; if (skipBom) srcIdx += 2; }
                else if (bomHandling == bomAllowed) littleEndian = leDefault;
                else { // Report an error.
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(srcIdx, byte1, "BOM expected at the beginning of the input vector (" + TInt::GetStr(byte1, "%02x") + " " + TInt::GetStr(byte2, "%02x") + " found instead).");
                        case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);
                        default: Fail; } }
        }
        else Fail;
        while (srcIdx < srcEnd)
        {
                const size_t charSrcIdx = srcIdx;
                uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;
                uint c = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
                if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)
                {
                        // c is the first character in a surrogate pair.  Read the next character.
                        if (! (srcIdx + 2 <= srcEnd)) {
                                switch (errorHandling) {
                                case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");
                                case uehAbort: return nDecoded;
                                case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
                                case uehIgnore: continue;
                                default: Fail; } }
                        uint byte1 = uint(src[TVecIdx(srcIdx)]) & 0xff, byte2 = uint(src[TVecIdx(srcIdx + 1)]) & 0xff; srcIdx += 2;
                        uint c2 = littleEndian ? (byte1 | (byte2 << 8)) : (byte2 | (byte1 << 8));
                        // c2 should be the second character of the surrogate pair.
                        if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {
                                switch (errorHandling) {
                                case uehThrow: throw TUnicodeException(charSrcIdx + 2, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + ".");
                                case uehAbort: return nDecoded;
                                // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character
                                case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 2; continue;
                                case uehIgnore: srcIdx -= 2; continue;
                                default: Fail; } }
                        // c and c2 each contain 10 bits of information.
                        uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);
                        cc += 0x10000;
                        dest.Add(TDestCh(cc)); nDecoded++; continue;
                }
                else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");
                        case uehAbort: return nDecoded;
                        case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
                        case uehIgnore: continue;
                        default: Fail; } }
                // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.
                if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;
                // Otherwise, store 'c' to the destination vector.
                dest.Add(TDestCh(c)); nDecoded++;
        }
        return nDecoded;
}

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

size_t TUniCodec::DecodeUtf16FromWords	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		bool	clrDest,
		const TUtf16BomHandling	bomHandling = `bomAllowed`,
		const TUniByteOrder	defaultByteOrder = `boMachineEndian`
	)		const

Definition at line 2294 of file unicode.h.

References TVec< TVal, TSizeTy >::Add(), boBigEndian, boLittleEndian, boMachineEndian, bomAllowed, bomIgnored, bomRequired, TVec< TVal, TSizeTy >::Clr(), errorHandling, Fail, TInt::GetStr(), IAssert, IsMachineLittleEndian(), replacementChar, skipBom, strict, uehAbort, uehIgnore, uehReplace, uehThrow, Utf16FirstSurrogate, and Utf16SecondSurrogate.

Referenced by TUnicode::DecodeUtf16FromWords(), and TestUtf16().

{
        IAssert(bomHandling == bomAllowed || bomHandling == bomRequired || bomHandling == bomIgnored);
        IAssert(defaultByteOrder == boMachineEndian || defaultByteOrder == boBigEndian || defaultByteOrder == boLittleEndian);
        if (clrDest) dest.Clr();
        size_t nDecoded = 0;
        if (srcCount <= 0) return nDecoded;
        const size_t origSrcIdx = srcIdx, srcEnd = srcIdx + srcCount;
        bool swap = false;
        bool isMachineLe = IsMachineLittleEndian();
        bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe));
        if (bomHandling == bomIgnored) swap = (isDefaultLe != isMachineLe);
        else if (bomHandling == bomAllowed || bomHandling == bomRequired)
        {
                int c = uint(src[TVecIdx(srcIdx)]) & 0xffff;
                if (c == 0xfeff) { swap = false; if (skipBom) srcIdx += 1; }
                else if (c == 0xfffe) { swap = true; if (skipBom) srcIdx += 1; }
                else if (bomHandling == bomAllowed) swap = (isMachineLe != isDefaultLe);
                else { // Report an error.
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(srcIdx, c, "BOM expected at the beginning of the input vector (" + TInt::GetStr(c, "%04x") + " found instead).");
                        case uehAbort: case uehReplace: case uehIgnore: return size_t(-1);
                        default: Fail; } }
        }
        else Fail;
        while (srcIdx < srcEnd)
        {
                const size_t charSrcIdx = srcIdx;
                uint c = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;
                if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
                if (Utf16FirstSurrogate <= c && c <= Utf16FirstSurrogate + 1023)
                {
                        // c is the first character in a surrogate pair.  Read the next character.
                        if (! (srcIdx < srcEnd)) {
                                switch (errorHandling) {
                                case uehThrow: throw TUnicodeException(charSrcIdx, c, "The second character of a surrogate pair is missing.");
                                case uehAbort: return nDecoded;
                                case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
                                case uehIgnore: continue;
                                default: Fail; } }
                        uint c2 = uint(src[TVecIdx(srcIdx)]) & 0xffffu; srcIdx++;
                        if (swap) c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8);
                        // c2 should be the second character of the surrogate pair.
                        if (c2 < Utf16SecondSurrogate || Utf16SecondSurrogate + 1023 < c2) {
                                switch (errorHandling) {
                                case uehThrow: throw TUnicodeException(charSrcIdx + 1, c2, "The second character of a surrogate pair should be in the range " + TInt::GetStr(Utf16SecondSurrogate, "%04x") + ".." + TInt::GetStr(Utf16SecondSurrogate + 1023, "%04x") + ", not " + TInt::GetStr(c2, "04x") + ".");
                                case uehAbort: return nDecoded;
                                // with uehReplace and uehIgnore, we'll just skip the first character of the surrogate pair; we'll process the second one during the next iteration, this time as an ordinary character
                                case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx -= 1; continue;
                                case uehIgnore: srcIdx -= 1; continue;
                                default: Fail; } }
                        // c and c2 each contain 10 bits of information.
                        uint cc = ((c - Utf16FirstSurrogate) << 10) | (c2 - Utf16SecondSurrogate);
                        cc += 0x10000;
                        dest.Add(TDestCh(cc)); nDecoded++; continue;
                }
                else if (strict && Utf16SecondSurrogate <= c && c <= Utf16SecondSurrogate + 1023) {
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(charSrcIdx, c, "This 16-bit value should be used only as the second character of a surrogate pair.");
                        case uehAbort: return nDecoded;
                        case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
                        case uehIgnore: continue;
                        default: Fail; } }
                // If 'c' is the first character in the input stream, and it's a BOM, we might have to skip it.
                if (charSrcIdx == origSrcIdx && (c == 0xfffeu || c == 0xfeffu) && skipBom) continue;
                // Otherwise, store 'c' to the destination vector.
                dest.Add(TDestCh(c)); nDecoded++;
        }
        return nDecoded;
}

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

size_t TUniCodec::DecodeUtf8	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		const bool	clrDest = `true`
	)		const

Definition at line 2036 of file unicode.h.

References TVec< TVal, TSizeTy >::Add(), TVec< TVal, TSizeTy >::Clr(), errorHandling, Fail, TInt::GetStr(), replacementChar, skipBom, strict, uehAbort, uehIgnore, uehReplace, and uehThrow.

Referenced by TUnicode::DecodeUtf8(), TUniChDb::SbEx_AddUtf8(), and TestUtf8().

{
        size_t nDecoded = 0;
        if (clrDest) dest.Clr();
        const size_t origSrcIdx = srcIdx;
        const size_t srcEnd = srcIdx + srcCount;
        while (srcIdx < srcEnd)
        {
                const size_t charSrcIdx = srcIdx;
                uint c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;
                if ((c & _1000_0000) == 0) {
                        // c is one of the characters 0..0x7f, encoded as a single byte.
                        dest.Add(TDestCh(c)); nDecoded++; continue; }
                else if ((c & _1100_0000) == _1000_0000) {
                        // No character in a valid UTF-8-encoded string should begin with a byte of the form 10xxxxxx.
                        // We must have been thrown into the middle of a multi-byte character.
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 10xxxxxx.");
                        case uehAbort: return nDecoded;
                        case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
                        case uehIgnore: continue;
                        default: Fail; } }
                else
                {
                        // c introduces a sequence of 2..6 bytes, depending on how many
                        // of the most significant bits of c are set.
                        uint nMoreBytes = 0, nBits = 0, minVal = 0;
                        if ((c & _1110_0000) == _1100_0000) nMoreBytes = 1, nBits = 5, minVal = 0x80;
                        else if ((c & _1111_0000) == _1110_0000) nMoreBytes = 2, nBits = 4, minVal = 0x800;
                        else if ((c & _1111_1000) == _1111_0000) nMoreBytes = 3, nBits = 3, minVal = 0x10000;
                        else if ((c & _1111_1100) == _1111_1000) nMoreBytes = 4, nBits = 2, minVal = 0x200000;
                        else if ((c & _1111_1110) == _1111_1100) nMoreBytes = 5, nBits = 1, minVal = 0x4000000;
                        else {
                                // c is of the form 1111111x, which is invalid even in the early definitions of UTF-8
                                // (which allowed the encoding of codepoints up to 2^31 - 1).  However, in principle this
                                // could be used to encode 32-bit integers with the msb set: 1aaabbbbccccddddeeeeffffgggghhhh
                                // could be encoded as 1111111a 10aabbbb 10ccccdd 10ddeeee 10ffffgg 10gghhhh.
                                if (strict)  {
                                        switch (errorHandling) {
                                        case uehThrow: throw TUnicodeException(charSrcIdx, c, "Invalid character: 1111111x.");
                                        case uehAbort: return nDecoded;
                                        // In the case of uehReplace and uehIgnore, we'll read the next 5 bytes
                                        // and try to decode the character.  Then, since 'strict' is true and
                                        // the codepoint is clearly >= 2^31, we'll notice this as an error later
                                        // and (in the case of uehReplace) insert a replacement character then.
                                        // This is probably better than inserting a replacement character right
                                        // away and then trying to read the next byte as if a new character
                                        // was beginning there -- if the current byte is really followed by five
                                        // 10xxxxxx bytes, we'll just get six replacement characters in a row.
                                        case uehReplace: break; //dest.Add(TDestCh(replacementChar)); continue;
                                        case uehIgnore: break; // continue;
                                        default: Fail; } }
                                nMoreBytes = 5; nBits = 2; minVal = 0x80000000u; }
                        // Decode this multi-byte sequence.
                        uint cOut = c & ((1 << nBits) - 1); // First extract the nBits least significant bits from c.
                        bool cancel = false;
                        for (uint i = 0; i < nMoreBytes && ! cancel; i++) {
                                // See if there are enough bytes left in the source vector.
                                if (! (srcIdx < srcEnd)) {
                                        switch (errorHandling) {
                                        case uehThrow: throw TUnicodeException(charSrcIdx, c, TInt::GetStr(nMoreBytes) + " more bytes expected, only " + TInt::GetStr(int(srcEnd - charSrcIdx - 1)) + " available.");
                                        case uehAbort: return nDecoded;
                                        case uehReplace: dest.Add(TDestCh(replacementChar)); cancel = true; continue;
                                        case uehIgnore: cancel = true; continue;
                                        default: Fail; } }
                                // Read the next byte.
                                c = src[TVecIdx(srcIdx)] & 0xff; srcIdx++;
                                if ((c & _1100_0000) != _1000_0000) { // Each subsequent byte should be of the form 10xxxxxx.
                                        switch (errorHandling) {
                                        case uehThrow: throw TUnicodeException(charSrcIdx, c, "Byte " + TInt::GetStr(i) + " of " + TInt::GetStr(nMoreBytes) + " extra bytes should begin with 10xxxxxx.");
                                        case uehAbort: return nDecoded;
                                        case uehReplace: dest.Add(TDestCh(replacementChar)); srcIdx--; cancel = true; continue;
                                        case uehIgnore: srcIdx--; cancel = true; continue;
                                        default: Fail; } }
                                cOut <<= 6; cOut |= (c & _0011_1111); }
                        if (cancel) continue;
                        if (strict) {
                                // err1: This codepoint has been represented by more bytes than it should have been.
                                // For example, cOut in the range 0..127 should be represented by a single byte,
                                // not by two or more bytes.
                                // - For example, this may happen in the "modified UTF-8" sometimes used for Java
                                // serialization, where the codepoint 0 is encoded as 11000000 10000000 to avoid
                                // the appearance of null bytes in the encoded stream.
                                bool err1 = (cOut < minVal);
                                // err2: Early definitions of UTF-8 allowed any 31-bit integer to be encoded, using up to 6 bytes.
                                // However, later this was restricted to the codepoints 0..0x10ffff only, because only these
                                // are valid Unicode codepoints.  Thus, no more than 4 bytes are ever necessary.
                                bool err2 = (nMoreBytes > 3 || (nMoreBytes == 3 && cOut > 0x10ffff));
                                if (err1 || err2) switch (errorHandling) {
                                        case uehThrow:
                                                if (err1) throw TUnicodeException(charSrcIdx, c, "The codepoint 0x" + TInt::GetStr(cOut, "%08x") + " has been represented by too many bytes (" + TInt::GetStr(nMoreBytes + 1) + ").");
                                                else if (err2) throw TUnicodeException(charSrcIdx, c, "Invalid multibyte sequence: it decodes into 0x" + TInt::GetStr(cOut, "%08x") + ", but only codepoints 0..0x10ffff are valid.");
                                                else { Fail; break; }
                                        case uehAbort: return nDecoded;
                                        case uehReplace: dest.Add(TDestCh(replacementChar)); continue;
                                        case uehIgnore: continue;
                                        default: Fail; } }
                        // Add the decoded codepoint to the destination vector.
                        // If this is the first decoded character, and it's one of the byte-order marks
                        // (0xfffe and 0xfeff), we will skip it (unless skipBom is false).
                        if (! (skipBom && (cOut == 0xfffe || cOut == 0xfeff) && charSrcIdx == origSrcIdx)) {
                                dest.Add(cOut); nDecoded++; }
                } // else (multi-byte sequence)
        } // while
        return nDecoded;
}

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

size_t TUniCodec::DecodeUtf8	(	const TSrcVec &	src,
		TVec< TDestCh > &	dest,
		const bool	clrDest = `true`
	)		const `[inline]`

Definition at line 136 of file unicode.h.

References DecodeUtf8().

Referenced by DecodeUtf8().

{ return DecodeUtf8(src, 0, src.Len(), dest, clrDest); }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

size_t TUniCodec::EncodeUtf16ToBytes	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		const bool	clrDest,
		const bool	insertBom,
		const TUniByteOrder	destByteOrder = `boMachineEndian`
	)		const

Definition at line 2428 of file unicode.h.

References ___OutRepl, TVec< TVal, TSizeTy >::Add(), boLittleEndian, boMachineEndian, errorHandling, Fail, TUInt::GetStr(), IAssert, IsMachineLittleEndian(), uehAbort, uehIgnore, uehReplace, uehThrow, Utf16FirstSurrogate, and Utf16SecondSurrogate.

Referenced by TUnicode::EncodeUtf16ToBytes(), and TestUtf16().

{
        bool isDestLe = (destByteOrder == boLittleEndian || (destByteOrder == boMachineEndian && IsMachineLittleEndian()));
        size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
        if (insertBom) { dest.Add(isDestLe ? 0xff : 0xfe); dest.Add(isDestLe ? 0xfe : 0xff); nEncoded++; }
        while (srcIdx < srcEnd)
        {
                uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;
                if (! (c <= 0x10ffffu)) {
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");
                        case uehAbort: return nEncoded;
#define ___OutRepl if (isDestLe) { dest.Add(replacementChar & 0xff); dest.Add((replacementChar >> 8) & 0xff); } else { dest.Add((replacementChar >> 8) & 0xff); dest.Add(replacementChar & 0xff); }
                        case uehReplace: ___OutRepl; continue;
                        case uehIgnore: continue;
                        default: Fail; } }
                if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) {
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");
                        case uehAbort: return nEncoded;
                        case uehReplace: ___OutRepl; continue;
                        case uehIgnore: continue;
                        default: Fail; } }
                if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) {
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");
                        case uehAbort: return nEncoded;
                        case uehReplace: ___OutRepl; continue;
                        case uehIgnore: continue;
                        default: Fail; } }
#undef ___OutRepl
                // If c is <= 0xffff, it can be stored directly.
                if (c <= 0xffffu) {
                        if (isDestLe) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); }
                        else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); }
                        nEncoded++; continue; }
                // Otherwise, represent c by a pair of surrogate characters.
                c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);
                uint c1 = (c >> 10) & 1023, c2 = c & 1023;
                c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate;
                if (isDestLe) { dest.Add(c1 & 0xff); dest.Add((c1 >> 8) & 0xff); dest.Add(c2 & 0xff); dest.Add((c2 >> 8) & 0xff); }
                else { dest.Add((c1 >> 8) & 0xff); dest.Add(c1 & 0xff); dest.Add((c2 >> 8) & 0xff); dest.Add(c2 & 0xff); }
                nEncoded++; continue;
        }
        return nEncoded;
}

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

size_t TUniCodec::EncodeUtf16ToWords	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		const bool	clrDest,
		const bool	insertBom,
		const TUniByteOrder	destByteOrder = `boMachineEndian`
	)		const

Definition at line 2376 of file unicode.h.

References TVec< TVal, TSizeTy >::Add(), boBigEndian, boLittleEndian, errorHandling, Fail, TUInt::GetStr(), IAssert, IsMachineLittleEndian(), replacementChar, SwapBytes(), uehAbort, uehIgnore, uehReplace, uehThrow, Utf16FirstSurrogate, and Utf16SecondSurrogate.

Referenced by TUnicode::EncodeUtf16ToWords(), and TestUtf16().

{
        bool isMachineLe = IsMachineLittleEndian();
        bool swap = (destByteOrder == boLittleEndian && ! isMachineLe) || (destByteOrder == boBigEndian && isMachineLe);
        size_t nEncoded = 0, srcEnd = srcIdx + srcCount;
        if (insertBom) { dest.Add(TDestCh(swap ? 0xfffeu : 0xfeffu)); nEncoded++; }
        while (srcIdx < srcEnd)
        {
                uint c = uint(src[TVecIdx(srcIdx)]); srcIdx++;
                if (! (c <= 0x10ffffu)) {
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 only supports characters in the range 0..10ffff (not " + TUInt::GetStr(c, "%08x") + ").");
                        case uehAbort: return nEncoded;
                        case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
                        case uehIgnore: continue;
                        default: Fail; } }
                if (Utf16FirstSurrogate <= c && c < Utf16FirstSurrogate + 1023) {
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(srcIdx - 1, c, "UTF-16 cannot encode " + TUInt::GetStr(c, "%04x") + " as it belongs to the first surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + ").");
                        case uehAbort: return nEncoded;
                        case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
                        case uehIgnore: continue;
                        default: Fail; } }
                if (Utf16SecondSurrogate <= c && c < Utf16SecondSurrogate + 1023) {
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(srcIdx - 1, c, "The character " + TUInt::GetStr(c, "%04x") + " belongs to the second surrogate range (" + TUInt::GetStr(Utf16FirstSurrogate, "%04x") + ".." + TUInt::GetStr(Utf16FirstSurrogate + 1023, "%04x") + "), which is not allowed with strict == true.");
                        case uehAbort: return nEncoded;
                        case uehReplace: dest.Add(TDestCh(swap ? SwapBytes(replacementChar) : replacementChar)); continue;
                        case uehIgnore: continue;
                        default: Fail; } }
                // If c is <= 0xffff, it can be stored directly.
                if (c <= 0xffffu) {
                        if (swap) c = ((c >> 8) & 0xff) | ((c & 0xff) << 8);
                        dest.Add(TDestCh(c)); nEncoded++; continue; }
                // Otherwise, represent c by a pair of surrogate characters.
                c -= 0x10000u; IAssert(/*0 <= c &&*/ c <= 0xfffffu);
                uint c1 = (c >> 10) & 1023, c2 = c & 1023;
                c1 += Utf16FirstSurrogate; c2 += Utf16SecondSurrogate;
                if (swap) {
                        c1 = ((c1 >> 8) & 0xff) | ((c1 & 0xff) << 8);
                        c2 = ((c2 >> 8) & 0xff) | ((c2 & 0xff) << 8); }
                dest.Add(TDestCh(c1));
                dest.Add(TDestCh(c2));
                nEncoded++; continue;
        }
        return nEncoded;
}

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

size_t TUniCodec::EncodeUtf8	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount,
		TVec< TDestCh > &	dest,
		const bool	clrDest = `true`
	)		const

Definition at line 2152 of file unicode.h.

References TVec< TVal, TSizeTy >::Add(), errorHandling, Fail, TInt::GetStr(), replacementChar, strict, uehAbort, uehIgnore, uehReplace, and uehThrow.

Referenced by TUnicode::EncodeUtf8(), EncodeUtf8Str(), and TestUtf8().

{
        size_t nEncoded = 0;
        for (const size_t srcEnd = srcIdx + srcCount; srcIdx < srcEnd; srcIdx++)
        {
                uint c = uint(src[TVecIdx(srcIdx)]);
                bool err = false;
                if (strict && c > 0x10ffff) {
                        err = true;
                        switch (errorHandling) {
                        case uehThrow: throw TUnicodeException(srcIdx, c, "Invalid character (0x" + TInt::GetStr(c, "%x") + "; only characters in the range 0..0x10ffff are allowed).");
                        case uehAbort: return nEncoded;
                        case uehReplace: c = replacementChar; break;
                        case uehIgnore: continue;
                        default: Fail; } }
                if (c < 0x80u)
                        dest.Add(TDestCh(c & 0xffu));
                else if (c < 0x800u) {
                        dest.Add(TDestCh(_1100_0000 | ((c >> 6) & _0001_1111)));
                        dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
                else if (c < 0x10000u) {
                        dest.Add(TDestCh(_1110_0000 | ((c >> 12) & _0000_1111)));
                        dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
                        dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
                else if (c < 0x200000u) {
                        dest.Add(TDestCh(_1111_0000 | ((c >> 18) & _0000_0111)));
                        dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
                        dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
                        dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
                else if (c < 0x4000000u) {
                        dest.Add(TDestCh(_1111_1000 | ((c >> 24) & _0000_0011)));
                        dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
                        dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
                        dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
                        dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
                else {
                        dest.Add(TDestCh(_1111_1100 | ((c >> 30) & _0000_0011)));
                        dest.Add(TDestCh(_1000_0000 | ((c >> 24) & _0011_1111)));
                        dest.Add(TDestCh(_1000_0000 | ((c >> 18) & _0011_1111)));
                        dest.Add(TDestCh(_1000_0000 | ((c >> 12) & _0011_1111)));
                        dest.Add(TDestCh(_1000_0000 | ((c >> 6) & _0011_1111)));
                        dest.Add(TDestCh(_1000_0000 | (c & _0011_1111))); }
                if (! err) nEncoded++;
        }
        return nEncoded;
}

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec , typename TDestCh >

size_t TUniCodec::EncodeUtf8	(	const TSrcVec &	src,
		TVec< TDestCh > &	dest,
		const bool	clrDest = `true`
	)		const `[inline]`

Definition at line 145 of file unicode.h.

References EncodeUtf8().

Referenced by EncodeUtf8().

{ return EncodeUtf8(src, 0, src.Len(), dest, clrDest); }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec >

TStr TUniCodec::EncodeUtf8Str	(	const TSrcVec &	src,
		size_t	srcIdx,
		const size_t	srcCount
	)		const `[inline]`

Definition at line 149 of file unicode.h.

References EncodeUtf8().

Referenced by TUnicode::EncodeUtf8Str().

{ TVec<char> temp; EncodeUtf8(src, srcIdx, srcCount, temp); TStr retVal = &(temp[0]); return retVal; }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename TSrcVec >

TStr TUniCodec::EncodeUtf8Str ( const TSrcVec & src ) const [inline]

Definition at line 150 of file unicode.h.

References TVec< TVal, TSizeTy >::Add(), and EncodeUtf8().

{ TVec<char> temp; EncodeUtf8(src, temp); temp.Add(0); TStr retVal = &(temp[0]); return retVal; }

Here is the call graph for this function:

uint TUniCodec::GetRndUint ( TRnd & rnd ) [static, protected]

Definition at line 62 of file unicode.cpp.

References TRnd::GetUniDevUInt().

Referenced by GetRndUint(), TestDecodeUtf16(), and TestDecodeUtf8().

{
        uint u = rnd.GetUniDevUInt(256) & 0xff;
        u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
        u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
        u <<= 8; u |= (rnd.GetUniDevUInt(256) & 0xff);
        return u;
}

Here is the call graph for this function:

Here is the caller graph for this function:

uint TUniCodec::GetRndUint	(	TRnd &	rnd,
		uint	minVal,
		uint	maxVal
	)		`[static, protected]`

Definition at line 71 of file unicode.cpp.

References GetRndUint(), TUInt::Mn, and TUInt::Mx.

{
        if (minVal == TUInt::Mn && maxVal == TUInt::Mx) return GetRndUint(rnd);
        uint range = maxVal - minVal + 1;
        if (range > (uint(1) << (8 * sizeof(uint) - 1)))
                while (true) { uint u = GetRndUint(rnd); if (u < range) return minVal + u; }
        uint mask = 1;
        while (mask < range) mask <<= 1;
        mask -= 1;
        while (true) { uint u = GetRndUint(rnd) & mask; if (u < range) return minVal + u; }
}

Here is the call graph for this function:

bool TUniCodec::IsMachineLittleEndian ( ) [static, protected]

Definition at line 83 of file unicode.cpp.

References FailR, and TInt::GetStr().

Referenced by DecodeUtf16FromBytes(), DecodeUtf16FromWords(), EncodeUtf16ToBytes(), EncodeUtf16ToWords(), TestDecodeUtf16(), TestUtf16(), and WordsToBytes().

{
        static bool isLE, initialized = false;
        if (initialized) return isLE;
        int i = 0x0201;
        char *p = (char *) (&i);
        char c1, c2;
        memcpy(&c1, p, 1); memcpy(&c2, p + 1, 1);
        if (c1 == 1 && c2 == 2) isLE = true;
        else if (c1 == 2 && c2 == 1) isLE = false;
        else {
                FailR(("TUniCodec::IsMachineLittleEndian: c1 = " + TInt::GetStr(int(uchar(c1)), "%02x") + ", c2 = " + TInt::GetStr(int(uchar(c2)), "%02x") + ".").CStr());
                isLE = true; }
        initialized = true; return isLE;
}

Here is the call graph for this function:

Here is the caller graph for this function:

static int TUniCodec::SwapBytes ( int x ) [inline, static, protected]

Definition at line 250 of file unicode.h.

Referenced by EncodeUtf16ToWords(), TestDecodeUtf16(), and TestUtf16().

                                           {
                return ((x >> 8) & 0xff) | ((x & 0xff) << 8); }

Here is the caller graph for this function:

void TUniCodec::TestDecodeUtf16	(	TRnd &	rnd,
		const TStr &	testCaseDesc,
		const TUtf16BomHandling	bomHandling,
		const TUniByteOrder	defaultByteOrder,
		const bool	insertBom
	)		`[protected]`

Definition at line 345 of file unicode.cpp.

References TVec< TVal, TSizeTy >::Add(), boLittleEndian, boMachineEndian, bomRequired, TStr::CStr(), errorHandling, Fail, GetRndUint(), IAssert, IsMachineLittleEndian(), TStr::Len(), TVec< TVal, TSizeTy >::Len(), replacementChar, skipBom, strict, SwapBytes(), TestUtf16(), uehAbort, uehReplace, uehThrow, Utf16FirstSurrogate, and Utf16SecondSurrogate.

Referenced by TestUtf16().

{
        TIntV src; TIntV expectedDest; int expectedRetVal = 0;
        bool expectedAbort = false;
        FILE *f = 0;
        bool isMachineLe = IsMachineLittleEndian();
        bool isDefaultLe = (defaultByteOrder == boLittleEndian || (defaultByteOrder == boMachineEndian && isMachineLe));
        bool swap = (isMachineLe != isDefaultLe);
        if (insertBom) {
                src.Add(swap ? 0xfffe : 0xfeff);
                if (! skipBom) { expectedRetVal += 1; expectedDest.Add(0xfeff); } }
        else if (bomHandling == bomRequired) {
                expectedAbort = true; expectedRetVal = -1; }
        // testCaseDesc should consist single characters or pairs of characters, 'c[e]', where:
        // - 'c' defines the range from which the codepoint should be taken ('A'..'E', 'X'..'Y');
        // - 'e' defines how many words will be removed from the end of the encoded sequence for this codepoint.
        //   (absent = 0, 'a' = 1).
        for (int i = 0; i < testCaseDesc.Len(); )
        {
                const char c = testCaseDesc[i++];
                uint cp = 0; int nWords = -1;
                if (c == 'X' || c == 'Y') IAssert(i > 1); // if you want a BOM at the beginning of your data, use insertBom -- if we permit X and Y here, predicting the expectedDest and expectedRetVal gets more complicated
                if (c == 'A') { cp = GetRndUint(rnd, 0u, Utf16FirstSurrogate - 1); nWords = 1; } // characters below the first surrogate range
                else if (c == 'B') { cp = GetRndUint(rnd, Utf16FirstSurrogate, Utf16FirstSurrogate + 1023); nWords = 1; } // the first surrogate range
                else if (c == 'C') { cp = GetRndUint(rnd, Utf16SecondSurrogate, Utf16SecondSurrogate + 1023); nWords = 1; } // the second surrogate range
                else if (c == 'D') { do { cp = GetRndUint(rnd, Utf16SecondSurrogate + 1024, 0xffffu); } while (cp == 0xfffe || cp == 0xfeff); nWords = 1; } // above the second surrogate range, but still in the BMP
                else if (c == 'E') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); nWords = 2; } // above the BMP, but still within the range for UTF-16
                else if (c == 'X') { cp = 0xfffe; nWords = 1; }
                else if (c == 'Y') { cp = 0xfeff; nWords = 1; }
                else Fail;
                if (c == 'B' && i < testCaseDesc.Len()) IAssert(testCaseDesc[i] != 'C');
                // Process 'e'.
                int nToDel = 0;
                if (i < testCaseDesc.Len()) {
                        const char e = testCaseDesc[i];
                        if (e >= 'a') { i += 1; nToDel = 1; }}
                IAssert((nWords == 1 && nToDel == 0) || (nWords == 2 && (nToDel == 0 || nToDel == 1)));
                if (nWords == 2 && nToDel == 1 && i < testCaseDesc.Len()) IAssert(testCaseDesc[i] != 'C');
                // Will an error occur during the decoding of this codepoint?
                bool errHere = false;
                if (Utf16FirstSurrogate <= cp && cp <= Utf16FirstSurrogate + 1023) errHere = true;
                else if (cp > 0x10ffff) { Fail; errHere = true; }
                else if (nToDel > 0) errHere = true;
                else if (strict && (Utf16SecondSurrogate <= cp && cp <= Utf16SecondSurrogate + 1023)) errHere = true;
                // Update 'expectedDest' and 'expectedRetVal'.
                if (! expectedAbort) {
                        if (! errHere) {
                                if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { }
                                else { expectedDest.Add(cp); expectedRetVal += 1; } }
                        else if (errorHandling == uehReplace) {
                                expectedDest.Add(replacementChar); }
                        if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true; }
                // Update 'src'.
                if (nWords == 1) src.Add(swap ? SwapBytes(cp) : cp);
                else {
                        int c1 = ((cp - 0x10000) >> 10) & 1023; c1 += Utf16FirstSurrogate;
                        int c2 = (cp - 0x10000) & 1023; c2 += Utf16SecondSurrogate;
                        src.Add(swap ? SwapBytes(c1) : c1);
                        if (nToDel == 0) src.Add(swap ? SwapBytes(c2) : c2); }
        }
        if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr());
        TestUtf16(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, bomHandling, defaultByteOrder, false, f);
}

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniCodec::TestDecodeUtf8	(	TRnd &	rnd,
		const TStr &	testCaseDesc
	)		`[protected]`

Definition at line 137 of file unicode.cpp.

References TVec< TVal, TSizeTy >::Add(), TStr::CStr(), errorHandling, Fail, GetRndUint(), IAssert, TStr::Len(), TVec< TVal, TSizeTy >::Len(), replacementChar, skipBom, strict, TestUtf8(), uehAbort, uehReplace, and uehThrow.

Referenced by TestUtf8().

{
        TIntV src; TIntV expectedDest; int expectedRetVal = 0;
        bool expectedAbort = false;
        FILE *f = 0; // stderr
        // testCaseDesc should consist of pairs or triples of characters, 'cd[e]', where:
        // - 'c' defines the range from which the codepoint should be taken ('A'..'H', 'X'..'Z');
        // - 'd' defines how many bytes the codepoint should be encoded with ('1'..'6');
        // - 'e' defines how many bytes will be removed from the end of the encoded sequence for this codepoint.
        //   (absent = 0, 'a' = 1, 'b' = 2 and so on).
        for (int i = 0; i < testCaseDesc.Len(); )
        {
                IAssert(i + 2 <= testCaseDesc.Len());
                const char c = testCaseDesc[i], d = testCaseDesc[i + 1]; i += 2;
                uint cp = 0; int nBytes = -1, minBytes = -1; bool eighties = false;
                IAssert('1' <= d && d <= '6'); nBytes = d - '0';
                if (c == 'A') { cp = GetRndUint(rnd, 0u, 0x7fu); minBytes = 1; } // 1 byte
                else if (c == 'B') { cp = GetRndUint(rnd, 0x80u, 0x7ffu); minBytes = 2; } // 2 bytes
                else if (c == 'C') { cp = GetRndUint(rnd, 0x800u, 0xffffu); minBytes = 3; } // 3 bytes
                else if (c == 'D') { cp = GetRndUint(rnd, 0x10000u, 0x10ffffu); minBytes = 4; } // 4 bytes, valid Unicode
                else if (c == 'E') { cp = GetRndUint(rnd, 0x110000u, 0x1fffffu); minBytes = 4; } // 4 bytes, invalid Unicode
                else if (c == 'F') { cp = GetRndUint(rnd, 0x200000u, 0x3ffffffu); minBytes = 5; } // 5 bytes
                else if (c == 'G') { cp = GetRndUint(rnd, 0x4000000u, 0x7fffffffu); minBytes = 6; } // 6 bytes, 31 bits
                else if (c == 'H') { cp = GetRndUint(rnd, 0x80000000u, 0xffffffffu); minBytes = 6; } // 6 bytes, 32 bits
                else if (c == 'X') { cp = 0xfffe; minBytes = 3; }
                else if (c == 'Y') { cp = 0xfeff; minBytes = 3; }
                else if (c == 'Z') { eighties = true; minBytes = 1; } // insert several random 10xxxxxx bytes (= 0x80 | random(0..0x3f))
                else Fail;
                IAssert(nBytes >= minBytes);
                // Process 'e'.
                int nToDel = 0;
                if (i < testCaseDesc.Len()) {
                        const char e = testCaseDesc[i];
                        if (e >= 'a' && e <= 'e') { i += 1; nToDel = e - 'a' + 1; }}
                IAssert(nToDel < nBytes);
                // Will an error occur during the decoding of this codepoint?
                bool errHere = false;
                if (eighties) errHere = true;
                else if (nToDel > 0) errHere = true;
                else if (strict && (cp >= 0x10ffff || nBytes > minBytes)) errHere = true;
                // Update 'expectedDest' and 'expetedRetVal'.
                if (! expectedAbort) {
                        if (! errHere) {
                                if (src.Len() == 0 && (cp == 0xfffe || cp == 0xfeff) && skipBom) { }
                                else { expectedDest.Add(cp); expectedRetVal += 1; } }
                        else if (errorHandling == uehReplace) {
                                if (eighties) for (int j = 0; j < nBytes; j++) expectedDest.Add(replacementChar);
                                else expectedDest.Add(replacementChar); }
                        if (errHere && (errorHandling == uehAbort || errorHandling == uehThrow)) expectedAbort = true; }
                // Update 'src'.
                if (eighties) for (int j = 0; j < nBytes; j++) src.Add(GetRndUint(rnd, 0x80, 0xff));
                else if (nBytes == 1) src.Add(cp);
                else {
                        int mask = (1 << nBytes) - 1; mask <<= (8 - nBytes);
                        src.Add(mask | (uint(cp) >> (6 * (nBytes - 1))));
                        for (int j = 1; j < nBytes - nToDel; j++) src.Add(0x80 | ((cp >> (6 * (nBytes - j - 1))) & _0011_1111)); }
        }
        if (f) fprintf(f, "Test case: \"%s\"\n", testCaseDesc.CStr());
        TestUtf8(true, expectedRetVal, expectedAbort && (errorHandling == uehThrow), src, expectedDest, f);
}

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniCodec::TestUtf16	(	bool	decode,
		size_t	expectedRetVal,
		bool	expectedThrow,
		const TIntV &	src,
		const TIntV &	expectedDest,
		const TUtf16BomHandling	bomHandling,
		const TUniByteOrder	defaultByteOrder,
		const bool	insertBom,
		FILE *	f
	)		`[protected]`

Definition at line 288 of file unicode.cpp.

References boBigEndian, boLittleEndian, bomAllowed, bomRequired, TVec< TVal, TSizeTy >::Clr(), TStr::CStr(), DecodeUtf16FromBytes(), DecodeUtf16FromWords(), EncodeUtf16ToBytes(), EncodeUtf16ToWords(), errorHandling, IAssert, TVec< TVal, TSizeTy >::Len(), TUnicodeException::message, replacementChar, skipBom, TUnicodeException::srcChar, TUnicodeException::srcIdx, strict, uehAbort, uehIgnore, uehReplace, uehThrow, and WordsToBytes().

{
        TIntV srcBytes, expectedDestBytes;
        WordsToBytes(src, srcBytes); WordsToBytes(expectedDest, expectedDestBytes);
        TIntV dest;
        if (f) {
                fprintf(f, "Settings: %s  %s  %s  %s  %s replacementChar = %x  \n",
                        (errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"),
                        (strict ? "STRICT" : ""), (decode ? (skipBom ? "skipBom" : "") : (insertBom ? "insrtBom" : "")),
                        (bomHandling == bomAllowed ? "bomAllowed" : bomHandling == bomRequired ? "bomRequired" : "bomIgnored"),
                        (defaultByteOrder == boBigEndian ? "boBigEndian" : defaultByteOrder == boLittleEndian ? "boLittleEndian" : "boMachineEndian"),
                        uint(replacementChar));
                fprintf(f, "src: "); for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %04x" : " %x"), uint(src[i])); }
        for (int useBytes = 0; useBytes < 2; useBytes++)
        {
                const char *fmt = (useBytes ? " %02x" : " %04x");
                try
                {
                        dest.Clr();
                        size_t retVal;
                        if (! useBytes) {
                                if (decode) retVal = DecodeUtf16FromWords(src, 0, src.Len(), dest, true, bomHandling, defaultByteOrder);
                                else retVal = EncodeUtf16ToWords(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder); }
                        else {
                                if (decode) retVal = DecodeUtf16FromBytes(srcBytes, 0, srcBytes.Len(), dest, true, bomHandling, defaultByteOrder);
                                else retVal = EncodeUtf16ToBytes(src, 0, src.Len(), dest, true, insertBom, defaultByteOrder); }
                        const TIntV& ed = (useBytes && ! decode ? expectedDestBytes : expectedDest);
                        if (f) {
                                fprintf(f, "\n -> dest:    "); for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" :  fmt), uint(dest[i]));
                                fprintf(f, "\n    expDest  "); for (int i = 0; i < ed.Len(); i++) fprintf(f, (decode ? " %x" :  fmt), uint(ed[i]));
                                fprintf(f, "\n    retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal)); }
                        bool ok = true;
                        if (retVal != expectedRetVal) ok = false;
                        if (dest.Len() != ed.Len()) ok = false;
                        if (ok) for (int i = 0; i < dest.Len(); i++) if (dest[i] != ed[i]) ok = false;
                        if (! ok)
                        {
                                printf("!!!\n");
                        }
                        IAssert(retVal == expectedRetVal); IAssert(! expectedThrow);
                        IAssert(dest.Len() == ed.Len());
                        for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == ed[i]);
                }
                catch (TUnicodeException e)
                {
                        if (f) {
                                fprintf(f, "\n -> expDest  "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" : fmt), uint(expectedDest[i]));
                                fprintf(f, "\n    exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar)); }
                        IAssert(expectedThrow);
                }
        }
}

Here is the call graph for this function:

void TUniCodec::TestUtf16 ( )

Definition at line 412 of file unicode.cpp.

References TVec< TVal, TSizeTy >::Add(), boLittleEndian, boMachineEndian, bomAllowed, bomRequired, TVec< TVal, TSizeTy >::Clr(), errorHandling, TVec< TVal, TSizeTy >::Gen(), IsMachineLittleEndian(), TVec< TVal, TSizeTy >::Len(), TUInt::Mx, replacementChar, TVec< TVal, TSizeTy >::Reserve(), skipBom, strict, SwapBytes(), TestDecodeUtf16(), uehReplace, uehThrow, Utf16FirstSurrogate, and Utf16SecondSurrogate.

Referenced by TestDecodeUtf16().

{
        TIntV utf16ReplCh; utf16ReplCh.Add(replacementChar);
        for (int skipBom_ = 0; skipBom_ < 2; skipBom_++)
        for (int strict_ = 0; strict_ < 2; strict_++)
        for (int errMode_ = 0; errMode_ < 4; errMode_++)
        for (int bomHandling_ = 0; bomHandling_ < 3; bomHandling_++)
        for (int byteOrder_ = 0; byteOrder_ < 3; byteOrder_++)
        for (int insertBom_ = 0; insertBom_ < 2; insertBom_++)
        {
                strict = (strict_ == 1); errorHandling = TUnicodeErrorHandling(errMode_); skipBom = (skipBom_ == 1);
                bool insertBom = (insertBom_ == 1);
                TUniByteOrder byteOrder = (TUniByteOrder) byteOrder_;
                TUtf16BomHandling bomHandling = (TUtf16BomHandling) bomHandling_;
                TRnd rnd = TRnd(123);
                // Test DecodeUtf16 on various random UTF-16-encoded sequences.
                for (int i = 0; i < 10; i++)
                {
                        TestDecodeUtf16(rnd, "A", bomHandling, byteOrder, insertBom);
                        TestDecodeUtf16(rnd, "AAA", bomHandling, byteOrder, insertBom);
                        TestDecodeUtf16(rnd, "B", bomHandling, byteOrder, insertBom);
                        TestDecodeUtf16(rnd, "DDAADADAAADDDAA", bomHandling, byteOrder, insertBom);
                        TestDecodeUtf16(rnd, "DEEEDAAEEDADEEAAEEADEEDDAA", bomHandling, byteOrder, insertBom);
                        TestDecodeUtf16(rnd, "DEaEaEDAAEaEDADEaEAAEEADEEDDAA", bomHandling, byteOrder, insertBom);
                        TestDecodeUtf16(rnd, "CABDEBACCEaB", bomHandling, byteOrder, insertBom);
                        TestDecodeUtf16(rnd, "EaEEEEaBBACABXABYXXEaYDDXBDCEA", bomHandling, byteOrder, insertBom);
                        TestDecodeUtf16(rnd, "EaEEEEaBDCAAXADYXXEaYDDXDCEA", bomHandling, byteOrder, insertBom);
                }
                //continue;
                // Test both DecodeUtf16 and EncodeUtf16 systematically on various characters
                // close to powers of 2.
                TIntV src, expectedDest, src2;
                expectedDest.Gen(1); src.Reserve(6); src2.Gen(1);
                for (int pow = 8; pow <= 32; pow++)
                {
                        uint uFrom, uTo;
                        if (pow == 8) uFrom = 0, uTo = 1u << pow;
                        else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx;
                        else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8);
                        printf("%u..%u          \r", uFrom, uTo);
                        for (uint u = uFrom; ; u++)
                        {
                                int nWords = 0;
                                if (u < 0x10000) nWords = 1;
                                else nWords = 2;
                                bool isMachineLe = IsMachineLittleEndian(), isDestLe = (byteOrder == boLittleEndian || (byteOrder == boMachineEndian && isMachineLe));
                                bool swap = (isMachineLe != isDestLe);
                                bool err = (u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023) || (strict && Utf16SecondSurrogate <= u && u <= Utf16SecondSurrogate + 1023);
                                src.Gen(3, (err ? 0 : nWords) + (insertBom ? 1 : 0));
                                if (insertBom) src[0] = (swap ? 0xfffe : 0xfeff);
                                if (! ((u > 0x10ffff) || (Utf16FirstSurrogate <= u && u <= Utf16FirstSurrogate + 1023)))
                                {
                                        // Try to encode 'u' and see if it gets decoded correctly.
                                        if (nWords == 1) src[insertBom ? 1 : 0] = (swap ? SwapBytes(u) : u);
                                        else {
                                                int u1 = Utf16FirstSurrogate + (((u - 0x10000) >> 10) & 1023);
                                                int u2 = Utf16SecondSurrogate + ((u - 0x10000) & 1023);
                                                src[insertBom ? 1 : 0] = (swap ? SwapBytes(u1) : u1);
                                                src[insertBom ? 2 : 1] = (swap ? SwapBytes(u2) : u2); }
                                        if (! ((u == 0xfffe || u == 0xfeff) && bomHandling == bomAllowed && ! insertBom)) // this will just create a mess when decoding
                                        {
                                                expectedDest.Reserve(2, 0);
                                                if (insertBom && ! skipBom) expectedDest.Add(0xfeff);
                                                if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar);
                                                else if (! err) expectedDest.Add(u);
                                                int erv = (err ? 0 : expectedDest.Len());
                                                if (skipBom && (u == 0xfeff || u == 0xfffe) && ! insertBom) expectedDest.Clr(), erv = 0;
                                                bool errD = err;
                                                if (bomHandling == bomRequired && ! insertBom) {
                                                        expectedDest.Clr(false);
                                                        if (u == 0xfeff || u == 0xfffe) { erv = (skipBom ? 0 : 1); if (! skipBom) expectedDest.Add(0xfeff); }
                                                        else { erv = -1; errD = true;
                                                                /*if (errorHandling == uehReplace) expectedDest.Add(replacementChar);*/ }}
                                                TestUtf16(true, erv, (errD && errorHandling == uehThrow), src, expectedDest, bomHandling, byteOrder, insertBom, 0);
                                        }
                                }
                                // We can also test the UTF-16 encoder.
                                src2[0] = u;
                                if (err) {
                                        src.Clr(false); if (insertBom) src.Add(swap ? 0xfffe : 0xfeff);
                                        if (errorHandling == uehReplace) {
                                                src.Add(swap ? SwapBytes(replacementChar) : replacementChar);
                                                /*if (byteOrder == boBigEndian || (byteOrder == boMachineEndian && ! TUniCodec::IsMachineLittleEndian())) { src.Add((replacementChar >> 8) & 0xff); src.Add(replacementChar & 0xff); }
                                                else { src.Add(replacementChar & 0xff); src.Add((replacementChar >> 8) & 0xff); } */
                                        }}
                                TestUtf16(false, (err ? 0 : 1) + (insertBom ? 1 : 0), (err && errorHandling == uehThrow), src2, src, bomHandling, byteOrder, insertBom, 0);
                                //
                                if (u == uTo) break;
                        }
                }
        }
}

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniCodec::TestUtf8	(	bool	decode,
		size_t	expectedRetVal,
		bool	expectedThrow,
		const TIntV &	src,
		const TIntV &	expectedDest,
		FILE *	f
	)		`[protected]`

Definition at line 103 of file unicode.cpp.

References TStr::CStr(), DecodeUtf8(), EncodeUtf8(), errorHandling, IAssert, TVec< TVal, TSizeTy >::Len(), TUnicodeException::message, replacementChar, skipBom, TUnicodeException::srcChar, TUnicodeException::srcIdx, strict, uehAbort, uehIgnore, uehReplace, and uehThrow.

{
        TIntV dest;
        if (f) {
                fprintf(f, "Settings: %s  %s  %s   replacementChar = %x\n",
                        (errorHandling == uehAbort ? "abort" : errorHandling == uehThrow ? "throw" : errorHandling == uehIgnore ? "ignore" : errorHandling == uehReplace ? "replace" : "????"),
                        (strict ? "STRICT" : ""), (skipBom ? "skipBom" : ""), uint(replacementChar));
                fprintf(f, "src: "); for (int i = 0; i < src.Len(); i++) fprintf(f, (decode ? " %02x" : " %x"), uint(src[i])); }
        try
        {
                size_t retVal = (decode ? DecodeUtf8(src, 0, src.Len(), dest, true) : EncodeUtf8(src, 0, src.Len(), dest, true));
                if (f) {
                        fprintf(f, "\n -> dest:    "); for (int i = 0; i < dest.Len(); i++) fprintf(f, (decode ? " %x" :  " %02x"), uint(dest[i]));
                        fprintf(f, "\n    expDest  "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, (decode ? " %x" :  " %02x"), uint(expectedDest[i]));
                        fprintf(f, "\n    retVal = %llu (expected %llu)\n", static_cast<long long unsigned int> (retVal), static_cast<long long unsigned int> (expectedRetVal)); }
                if (retVal != expectedRetVal)
                        printf("!!!");
                IAssert(retVal == expectedRetVal); IAssert(! expectedThrow);
                if (dest.Len() != expectedDest.Len())
                        printf("!!!");
                IAssert(dest.Len() == expectedDest.Len());
                for (int i = 0; i < dest.Len(); i++) IAssert(dest[i] == expectedDest[i]);
        }
        catch (TUnicodeException e)
        {
                if (f) {
                        fprintf(f, "\n -> expDest  "); for (int i = 0; i < expectedDest.Len(); i++) fprintf(f, " %x", uint(expectedDest[i]));
                        fprintf(f, "\n    exception \"%s\" at %d (char 0x%02x)\n", e.message.CStr(), int(e.srcIdx), uint(e.srcChar)); }
                IAssert(expectedThrow);
        }
}

Here is the call graph for this function:

void TUniCodec::TestUtf8 ( )

Definition at line 198 of file unicode.cpp.

References TVec< TVal, TSizeTy >::Add(), TVec< TVal, TSizeTy >::Clr(), EncodeUtf8(), errorHandling, TVec< TVal, TSizeTy >::Gen(), TUInt::Mx, replacementChar, TVec< TVal, TSizeTy >::Reserve(), skipBom, strict, TestDecodeUtf8(), uehReplace, and uehThrow.

Referenced by TestDecodeUtf8().

{
        TIntV utf8ReplCh; EncodeUtf8((TVectorBuilder(), replacementChar).v, 0, 1, utf8ReplCh, true);
        for (int skipBom_ = 0; skipBom_ < 2; skipBom_++)
        for (int strict_ = 0; strict_ < 2; strict_++)
        for (int errMode_ = 0; errMode_ < 4; errMode_++)
        {
                strict = (strict_ == 1); errorHandling = TUnicodeErrorHandling(errMode_); skipBom = (skipBom_ == 1);
                TRnd rnd = TRnd(123);
                // Test DecodeUtf8 on various random UTF-8-encoded sequences.
                for (int i = 0; i < 10; i++)
                {
                        TestDecodeUtf8(rnd, "X3A1A2A3A4A5A6B2B3B4B5B6C3C4C5C6D4D5D6E5E6F6G6");
                        TestDecodeUtf8(rnd, "X3A5dA6d");
                        TestDecodeUtf8(rnd, "X3A1B2C3D4E4F5A1G6H6Y3X3A1");
                        TestDecodeUtf8(rnd, "X3A1B2C3D4E4F5A2G6H6Y3X3A1");
                        TestDecodeUtf8(rnd, "Y3A1B2C3D4E4F5A1G6H6Y3X3A1");
                        TestDecodeUtf8(rnd, "A1B2C3D4E4F5A1G6H6Y3X3A1");
                        TestDecodeUtf8(rnd, "G6A1A1D4E4A1B2");
                        TestDecodeUtf8(rnd, "D4A1A1C3A1B2A1B2");
                        TestDecodeUtf8(rnd, "D4A1A1C3A1B2A1B2D4a");
                        TestDecodeUtf8(rnd, "X3A1B2C3D5E4F5A1G6H6Y3X3A1");
                        TestDecodeUtf8(rnd, "X3A1B2C3D4E5F5A1G6H6Y3X3A1");
                        TestDecodeUtf8(rnd, "X3A1B2C3D4aE4F5A1G6H6Y3X3A1");
                        TestDecodeUtf8(rnd, "X3A1B2C3D4bE4F5A1G6H6Y3X3A1");
                        TestDecodeUtf8(rnd, "X3A2aA3aA4aA5aA6aB2aB3aB4aB5aB6aC3aC4aC5aC6aD4aD5aD6aE5aE6aF6aG6a");
                        TestDecodeUtf8(rnd, "X3A3bA4bA5bA6aB3bB4bB5bB6bC3bC4bC5bC6bD4bD5bD6bE5bE6bF6bG6b");
                        TestDecodeUtf8(rnd, "X3A4cA5cA6cB4cB5cB6cC4cC5cC6cD4cD5cD6cE5cE6cF6cG6c");
                        TestDecodeUtf8(rnd, "X3A5dA6dB5dB6dC5dC6dD5dD6dE5dE6dF6dG6d");
                        TestDecodeUtf8(rnd, "X3A6eB6eC6eD6eE6eF6eG6e");
                }
                // Test both DecodeUtf8 and EncodeUtf8 systematically on various characters
                // close to powers of 2.
                TIntV src, expectedDest, src2;
                expectedDest.Gen(1); src.Reserve(6); src2.Gen(1);
                for (int pow = 8; pow <= 32; pow++)
                {
                        uint uFrom, uTo;
                        if (pow == 8) uFrom = 0, uTo = 1u << pow;
                        else if (pow == 32) uFrom = TUInt::Mx - (1u << 8), uTo = TUInt::Mx;
                        else uFrom = (1u << pow) - (1u << 8), uTo = (1u << pow) + (1u << 8);
                        printf("%u..%u          \r", uFrom, uTo);
                        for (uint u = uFrom; ; u++)
                        {
                                int nBytes = 0;
                                if (u < (1u << 7)) nBytes = 1;
                                else if (u < (1u << 11)) nBytes = 2;
                                else if (u < (1u << 16)) nBytes = 3;
                                else if (u < (1u << 21)) nBytes = 4;
                                else if (u < (1u << 26)) nBytes = 5;
                                else nBytes = 6;
                                src.Gen(6, nBytes);
                                if (nBytes == 1) src[0] = u;
                                else {
                                        src[0] = (((1 << nBytes) - 1) << (8 - nBytes)) | (u >> (6 * (nBytes - 1)));
                                        for (int i = 1; i < nBytes; i++) src[i] = 0x80 | ((u >> (6 * (nBytes - i - 1))) & _0011_1111); }
                                bool err = (strict && u > 0x10ffff);
                                expectedDest.Reserve(1, 0);
                                if (err && errorHandling == uehReplace) expectedDest.Add(replacementChar);
                                else if (! err) expectedDest.Add(u);
                                int erv = (err ? 0 : 1);
                                if (skipBom && (u == 0xfeff || u == 0xfffe)) expectedDest.Clr(), erv = 0;
                                TestUtf8(true, erv, (err && errorHandling == uehThrow), src, expectedDest, 0);
                                // We can also test the UTF-8 encoder.
                                src2[0] = u;
                                if (err) {
                                        if (errorHandling == uehReplace) src = utf8ReplCh;
                                        else src.Clr(false); }
                                TestUtf8(false, (err ? 0 : 1), (err && errorHandling == uehThrow), src2, src, 0);
                                //
                                if (u == uTo) break;
                        }
                }
        }
}

Here is the call graph for this function:

Here is the caller graph for this function:

void TUniCodec::WordsToBytes	(	const TIntV &	src,
		TIntV &	dest
	)		`[protected]`

Definition at line 278 of file unicode.cpp.

References TVec< TVal, TSizeTy >::Add(), TVec< TVal, TSizeTy >::Clr(), IsMachineLittleEndian(), and TVec< TVal, TSizeTy >::Len().

Referenced by TestUtf16().

{
        dest.Clr();
        bool isLE = IsMachineLittleEndian();
        for (int i = 0; i < src.Len(); i++) {
                int c = src[i] & 0xffff;
                if (isLE) { dest.Add(c & 0xff); dest.Add((c >> 8) & 0xff); }
                else { dest.Add((c >> 8) & 0xff); dest.Add(c & 0xff); } }
}