diff -r 000000000000 -r 1fb32624e06b charconvfw/Charconv/ongoing/Source/utf/UTF.CPP --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/charconvfw/Charconv/ongoing/Source/utf/UTF.CPP Tue Feb 02 02:02:46 2010 +0200 @@ -0,0 +1,1198 @@ +/* +* Copyright (c) 1997-2004 Nokia Corporation and/or its subsidiary(-ies). +* All rights reserved. +* This component and the accompanying materials are made available +* under the terms of "Eclipse Public License v1.0" +* which accompanies this distribution, and is available +* at the URL "http://www.eclipse.org/legal/epl-v10.html". +* +* Initial Contributors: +* Nokia Corporation - initial contribution. +* +* Contributors: +* +* Description: +* +*/ + + + + + + + + +#include +#include +#include + +const TUint KNotInBase64Alphabet=KMaxTUint; + +enum TPanic + { + EPanicBad6BitNumber=1, + EPanicBadUtf7Pointers1, + EPanicBadUtf7Pointers2, + EPanicBadUtf7Pointers3, + EPanicBadUtf7Pointers4, + EPanicBadUtf7Pointers5, + EPanicBadUtf7Pointers6, + EPanicBadUtf7Pointers7, + EPanicBadUtf7Pointers8, + EPanicBadUtf7Pointers9, + EPanicBadUtf7Pointers10, + EPanicBadUtf7Pointers11, + EPanicNotInBase64Block, + EPanicBadUnicodePointers1, + EPanicBadUnicodePointers2, + EPanicBadUnicodePointers3, + EPanicBadUnicodePointers4, + EPanicBadUnicodePointers5, + EPanicBadUnicodePointers6, + EPanicBadUnicodePointers7, + EPanicBadUnicodePointers8, + EPanicBadUnicodePointers9, + EPanicBadUnicodePointers10, + EPanicBadBitBufferState1, + EPanicBadBitBufferState2, + EPanicBadBitBufferState3, + EPanicBadBitBufferState4, + EPanicBadBitBufferState5, + EPanicBadBitBufferState6, + EPanicBadBitBufferState7, + EPanicBadBitBufferState8, + EPanicBadBitBufferState9, + EPanicBadBitBufferState10, + EPanicBadBitBufferState11, + EPanicBadBitBufferState12, + EPanicBadBitBufferState13, + EPanicBadBitBufferState14, + EPanicBadBitBufferState15, + EPanicBadBitBufferState16, + EPanicBadBitBufferState17, + EPanicUnexpectedNumberOfLoopIterations, + EPanicInitialEscapeCharacterButNoBase64, 
+	EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary,
+	EPanicBadUtf8Pointers1,
+	EPanicBadUtf8Pointers2,
+	EPanicBadUtf8Pointers3,
+	EPanicBadUtf8Pointers4,
+	EPanicBadUtf8Pointers5,
+	EPanicBadUtf8Pointers6,
+	EPanicBadUtf8Pointers7,
+	EPanicOutOfSyncUtf7Byte1,
+	EPanicOutOfSyncUtf7Byte2,
+	EPanicOutOfSyncBase64Decoding
+	};
+
+// Panic category text handed to User::Panic() by Panic() below.
+_LIT(KLitPanicText, "CHARCONV-UTF");
+
+/** Raises a panic in the "CHARCONV-UTF" category.
+
+@param aPanic The TPanic code identifying which internal check failed. */
+LOCAL_C void Panic(TPanic aPanic)
+	{
+	User::Panic(KLitPanicText, aPanic);
+	}
+
+/** Returns the character that opens a base64 block.
+
+@param aIsImapUtf7 Selects the IMAP variant of UTF-7.
+@return '&' when aIsImapUtf7 is true, '+' otherwise. */
+inline TUint EscapeCharacterForStartingBase64Block(TBool aIsImapUtf7) {return aIsImapUtf7? '&': '+';}
+
+/** Maps one character of the base64 alphabet to its 6-bit value.
+
+'A'-'Z' give 0-25, 'a'-'z' give 26-51, '0'-'9' give 52-61 and '+' gives 62.
+Value 63 is ',' in the IMAP variant and '/' otherwise.
+
+@param aMemberOfBase64Alphabet The character to decode.
+@param aIsImapUtf7 Selects which character stands for value 63.
+@return The value 0-63, or KNotInBase64Alphabet if the character is not part
+of the selected alphabet. */
+LOCAL_C TUint Base64Decoding(TUint aMemberOfBase64Alphabet, TBool aIsImapUtf7)
+	{
+	if ((aMemberOfBase64Alphabet>='A') && (aMemberOfBase64Alphabet<='Z'))
+		{
+		return aMemberOfBase64Alphabet-'A';
+		}
+	if ((aMemberOfBase64Alphabet>='a') && (aMemberOfBase64Alphabet<='z'))
+		{
+		return aMemberOfBase64Alphabet-('a'-26); // i.e. (c-'a')+26
+		}
+	if ((aMemberOfBase64Alphabet>='0') && (aMemberOfBase64Alphabet<='9'))
+		{
+		return aMemberOfBase64Alphabet+((26*2)-'0'); // i.e. (c-'0')+52
+		}
+	if (aMemberOfBase64Alphabet=='+')
+		{
+		return 62;
+		}
+	if (aMemberOfBase64Alphabet==STATIC_CAST(TUint, aIsImapUtf7? ',': '/'))
+		{
+		return 63;
+		}
+	return KNotInBase64Alphabet;
+	}
+
+/** Maps a 6-bit value to its base64 character (the inverse of Base64Decoding).
+
+@param a6BitNumber The value to encode; checked to be below 64 with
+__ASSERT_DEBUG (panic EPanicBad6BitNumber).
+@param aIsImapUtf7 When true, value 63 is encoded as ',' instead of '/'.
+@return The base64 character for a6BitNumber. */
+LOCAL_C TUint Base64Encoding(TUint a6BitNumber, TBool aIsImapUtf7)
+	{
+	__ASSERT_DEBUG(a6BitNumber<64, Panic(EPanicBad6BitNumber));
+	// The table below holds the standard alphabet, so the one IMAP difference is handled first.
+	if ((a6BitNumber==63) && aIsImapUtf7)
+		{
+		return ',';
+		}
+	static const TUint8 base64Alphabet[64]={'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'};
+	return base64Alphabet[a6BitNumber];
+	}
+
+/** Searches backwards for the escape character that opened the base64 block
+containing a given byte.
+
+The walk starts at aPointerToUtf7Byte and moves towards lower addresses. It
+stops at the first byte that is neither the escape character nor a member of
+the base64 alphabet, or when it reaches aPointerToFirstUtf7Byte. Every escape
+character met on the way replaces the previous candidate, so the result is
+the escape character at the lowest address visited.
+
+@param aPointerToUtf7Byte Where the backward walk starts.
+@param aPointerToFirstUtf7Byte The walk never reads below this address.
+@param aIsImapUtf7 Selects the escape character and the base64 alphabet.
+@return Pointer to the escape character found, or NULL if none was met; the
+NULL case is asserted against with __ASSERT_DEBUG (EPanicNotInBase64Block). */
+LOCAL_C TUint8* PointerToEscapeCharacterStartingBase64Block(TUint8* aPointerToUtf7Byte, const TUint8* aPointerToFirstUtf7Byte, TBool aIsImapUtf7)
+	{
+	__ASSERT_DEBUG(aPointerToUtf7Byte>=aPointerToFirstUtf7Byte, Panic(EPanicBadUtf7Pointers1));
+	TUint8* pointerToCandidateEscapeCharacter=NULL;
+	FOREVER
+		{
+		const TUint utf7Byte=*aPointerToUtf7Byte;
+		// The escape test must come first: in non-IMAP UTF-7 the escape character '+'
+		// is also base64 value 62, so the alphabet test alone would not notice it.
+		if (utf7Byte==EscapeCharacterForStartingBase64Block(aIsImapUtf7))
+			{
+			pointerToCandidateEscapeCharacter=aPointerToUtf7Byte;
+			}
+		else if (Base64Decoding(utf7Byte, aIsImapUtf7)==KNotInBase64Alphabet)
+			{
+			break;
+			}
+		__ASSERT_DEBUG(aPointerToUtf7Byte>=aPointerToFirstUtf7Byte, Panic(EPanicBadUtf7Pointers2));
+		if (aPointerToUtf7Byte<=aPointerToFirstUtf7Byte)
+			{
+			break; // reached the start of the buffer
+			}
+		--aPointerToUtf7Byte;
+		}
+	__ASSERT_DEBUG(pointerToCandidateEscapeCharacter!=NULL, Panic(EPanicNotInBase64Block));
+	return pointerToCandidateEscapeCharacter;
+	}
+
+/** Decides whether a Unicode character is written to UTF-7 as itself rather
+than inside a base64 block.
+
+IMAP variant: every character from 0x0020 to 0x007e is direct.
+Otherwise, for characters from 0x0021 to 0x007d: when
+aEncodeOptionalDirectCharactersInBase64 is true only A-Z, a-z, 0x0027-0x0029,
+0x002b-0x003a and 0x003f are direct; when it is false everything in that
+range except 0x005c (backslash) is direct. Outside that range only space,
+tab, carriage return and line feed are direct.
+
+@param aUnicodeCharacter The character to classify.
+@param aIsImapUtf7 Selects the IMAP variant of UTF-7.
+@param aEncodeOptionalDirectCharactersInBase64 Ignored for the IMAP variant.
+@return True if the character is to be encoded directly. */
+LOCAL_C TBool EncodeInUtf7Directly(TUint aUnicodeCharacter, TBool aIsImapUtf7, TBool aEncodeOptionalDirectCharactersInBase64)
+	{
+	if (aIsImapUtf7)
+		{
+		return (aUnicodeCharacter>=0x0020) && (aUnicodeCharacter<=0x007e);
+		}
+	if ((aUnicodeCharacter>=0x0021) && (aUnicodeCharacter<=0x007d))
+		{
+		if (aEncodeOptionalDirectCharactersInBase64)
+			{
+			return (((aUnicodeCharacter>=0x0041) && (aUnicodeCharacter<=0x005a)) ||
+ 
((aUnicodeCharacter>=0x0061) && (aUnicodeCharacter<=0x007a)) || + ((aUnicodeCharacter>=0x0027) && (aUnicodeCharacter<=0x0029)) || + ((aUnicodeCharacter>=0x002b) && (aUnicodeCharacter<=0x003a)) || + (aUnicodeCharacter==0x003f)); + } + return aUnicodeCharacter!=0x005c; + } + return (aUnicodeCharacter==0x0020) || (aUnicodeCharacter==0x0009) || (aUnicodeCharacter==0x000d) || (aUnicodeCharacter==0x000a); + } + +inline TBool BitBufferContainsNonZeroBits(TUint aBitBuffer, TInt aNumberOfBitsInBuffer) + { + return (aBitBuffer&((1< buf; + HBufC8* hBuf8 = HBufC8::NewLC(length); + TPtr8 utf7 = hBuf8->Des(); + + FOREVER + { + TInt unconverted = ConvertFromUnicodeToUtf7(buf, unicode, aEncodeOptionalDirectCharactersInBase64); + if( unconverted == EErrorIllFormedInput || unconverted < 0) + User::Leave(KErrCorrupt); + + if (utf7.Length() + buf.Length() > utf7.MaxLength()) + { + // Reallocate the hBuf8 + hBuf8 = hBuf8->ReAllocL(utf7.Length() + buf.Length()); + CleanupStack::Pop(); + CleanupStack::PushL(hBuf8); + utf7.Set(hBuf8->Des()); + } + utf7.Append(buf); + if (unconverted ==0) + break; + unicode.Set(unicode.Right(unconverted)); + } + CleanupStack::Pop(); + return hBuf8; + + } + +/** Converts Unicode text into UTF-7 encoding. + +@param aUtf7 On return, contains the UTF-7 encoded output string. +@param aUnicode A UCS-2 encoded input string. +@param aEncodeOptionalDirectCharactersInBase64 If ETrue then characters from +UTF-7 set O (optional direct characters) are encoded in Modified Base64. If +EFalse the characters are encoded directly, as their ASCII equivalents. +@return The number of unconverted characters left at the end of the input +descriptor, or one of the error values defined in TError. 
*/ +EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf7( + TDes8& aUtf7, + const TDesC16& aUnicode, + TBool aEncodeOptionalDirectCharactersInBase64) + { + return ConvertFromUnicodeToUtf7(aUtf7, aUnicode, EFalse, aEncodeOptionalDirectCharactersInBase64); + } + +TInt CnvUtfConverter::ConvertFromUnicodeToUtf7(TDes8& aUtf7, + const TDesC16& aUnicode, + TBool aIsImapUtf7, + TBool aEncodeOptionalDirectCharactersInBase64) + { + if (aUnicode.Length()==0) + { + aUtf7.SetLength(0); + return 0; + } + if (aUtf7.MaxLength()==0) + { + return aUnicode.Length(); + } + const TUint escapeCharacterForStartingBase64Block=EscapeCharacterForStartingBase64Block(aIsImapUtf7); + TUint8* pointerToPreviousUtf7Byte=CONST_CAST(TUint8*, aUtf7.Ptr()-1); + const TUint8* const pointerToLastUtf7Byte=pointerToPreviousUtf7Byte+aUtf7.MaxLength(); + const TUint16* pointerToPreviousUnicodeCharacter=aUnicode.Ptr()-1; + const TUint16* const pointerToLastUnicodeCharacter=pointerToPreviousUnicodeCharacter+aUnicode.Length(); + const TUint KIsInBase64Block=0x80000000u; + TUint bitBuffer=0; + TInt numberOfBitsInBuffer=0; + FOREVER + { + __ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers3)); + __ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers1)); + TUint currentUnicodeCharacter=(pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter)? 
0: *(pointerToPreviousUnicodeCharacter+1); + if ((pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter) || EncodeInUtf7Directly(currentUnicodeCharacter, aIsImapUtf7, aEncodeOptionalDirectCharactersInBase64)) + { + __ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (numberOfBitsInBuffer==0), Panic(EPanicBadBitBufferState1)); + __ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState2)); + if (bitBuffer&KIsInBase64Block) + { + if (numberOfBitsInBuffer!=0) + { + if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<2) // make sure there is enough space for the trailing '-' as well as the remains of the bitBuffer as the KIsInBase64Block flag is about to turned off, thus the trailing '-' may never get written + { + break; + } + ++pointerToPreviousUtf7Byte; + *pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer<<(6-numberOfBitsInBuffer))&0x3f, aIsImapUtf7)); + } + else + { + if (pointerToPreviousUtf7Byte==pointerToLastUtf7Byte) + { + break; + } + } + ++pointerToPreviousUtf7Byte; + *pointerToPreviousUtf7Byte='-'; + bitBuffer=0; + numberOfBitsInBuffer=0; + } + __ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers2)); + if (pointerToPreviousUnicodeCharacter>=pointerToLastUnicodeCharacter) + { + break; + } + __ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers4)); + if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<((currentUnicodeCharacter==escapeCharacterForStartingBase64Block)? 
2: 1)) + { + break; + } + ++pointerToPreviousUtf7Byte; + *pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, currentUnicodeCharacter); + ++pointerToPreviousUnicodeCharacter; + if (currentUnicodeCharacter==escapeCharacterForStartingBase64Block) + { + ++pointerToPreviousUtf7Byte; + *pointerToPreviousUtf7Byte='-'; + } + } + else + { + { + TInt numberOfUtf7BytesRequired=(numberOfBitsInBuffer+16)/6; // "(numberOfBitsInBuffer+16)/6" is the number of iterations that will happen in the while loop below + if (~bitBuffer&KIsInBase64Block) + { + ++numberOfUtf7BytesRequired; // for the initial escapeCharacterForStartingBase64Block + } + if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte=6) + { + numberOfBitsInBuffer-=6; + __ASSERT_DEBUG(pointerToPreviousUtf7Byte>numberOfBitsInBuffer)&0x3f, aIsImapUtf7)); + } + bitBuffer&=((1<=((numberOfBitsInBuffer==0)? 1: 2)) // if there's room to finish off the base-64 sequence by (i) flushing the bit-buffer and (ii) appending the trailing '-' + { + if (numberOfBitsInBuffer!=0) + { + __ASSERT_DEBUG(pointerToPreviousUtf7Byte0, Panic(EPanicInitialEscapeCharacterButNoBase64)); + __ASSERT_DEBUG(((oldNumberOfBase64Characters*6)+numberOfBitsInBuffer)%16==0, Panic(EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary)); + pointerToPreviousUnicodeCharacter-=((oldNumberOfBase64Characters*6)+numberOfBitsInBuffer)/16; // move back pointerToPreviousUnicodeCharacter to before the equivalent of the base-64 sequence + pointerToPreviousUtf7Byte=pointerToEscapeCharacterStartingBase64Block; + __ASSERT_DEBUG(*pointerToPreviousUtf7Byte==escapeCharacterForStartingBase64Block, Panic(EPanicBadUtf7Pointers10)); + if (oldNumberOfBase64Characters<4) // if the new base-64 sequence will be so short that it won't even be able to contain the UTF-7 encoding of a single Unicode character + { + --pointerToPreviousUtf7Byte; // move back pointerToPreviousUtf7Byte to before the escapeCharacterForStartingBase64Block + break; + } + const TInt 
newNumberOfUnicodeCharacters=((oldNumberOfBase64Characters-1)*3)/8; + pointerToPreviousUnicodeCharacter+=newNumberOfUnicodeCharacters; + pointerToPreviousUtf7Byte+=((newNumberOfUnicodeCharacters*8)+2)/3; + const TInt numberOfBitsToBeZeroedInLastBase64Character=(newNumberOfUnicodeCharacters%3)*2; + if (numberOfBitsToBeZeroedInLastBase64Character!=0) + { + *pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding(Base64Decoding(*pointerToPreviousUtf7Byte, aIsImapUtf7)&0x3f&~((1< buf; + HBufC8* hBuf8 = HBufC8::NewLC(length); + TPtr8 utf8 = hBuf8->Des(); + + FOREVER + { + TInt unconverted = ConvertFromUnicodeToUtf8(buf, unicode); + if( unconverted == EErrorIllFormedInput || unconverted < 0) + User::Leave(KErrCorrupt); + + if (utf8.Length() + buf.Length() > utf8.MaxLength()) + { + // Reallocate the hBuf8 + hBuf8 = hBuf8->ReAllocL(utf8.Length() + buf.Length()); + CleanupStack::Pop(); + CleanupStack::PushL(hBuf8); + utf8.Set(hBuf8->Des()); + } + utf8.Append(buf); + if (unconverted ==0) + break; + unicode.Set(unicode.Right(unconverted)); + } + CleanupStack::Pop(); + return hBuf8; + } + +/** Converts Unicode text into UTF-8 encoding. + +The variant of UTF-8 used internally by Java differs slightly from standard +UTF-8. The TBool argument controls the UTF-8 variant generated by this function. + +@param aUtf8 On return, contains the UTF-8 encoded output string. +@param aUnicode A UCS-2 encoded input string. +@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java +UTF-8. The default is EFalse. +@return The number of unconverted characters left at the end of the input descriptor, +or one of the error values defined in TError. 
*/ +TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, + const TDesC16& aUnicode, + TBool aGenerateJavaConformantUtf8) + { + if (aUnicode.Length()==0) + { + aUtf8.SetLength(0); + return 0; + } + if (aUtf8.MaxLength()==0) + { + return aUnicode.Length(); + } + TUint8* pointerToCurrentUtf8Byte=CONST_CAST(TUint8*, aUtf8.Ptr()); + const TUint8* pointerToLastUtf8Byte=pointerToCurrentUtf8Byte+(aUtf8.MaxLength()-1); + const TUint16* pointerToCurrentUnicodeCharacter=aUnicode.Ptr(); + const TUint16* pointerToLastUnicodeCharacter=pointerToCurrentUnicodeCharacter+(aUnicode.Length()-1); + TBool inputIsTruncated=EFalse; + FOREVER + { + __ASSERT_DEBUG(pointerToCurrentUtf8Byte<=pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers1)); + __ASSERT_DEBUG(pointerToCurrentUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers3)); + TUint currentUnicodeCharacter=*pointerToCurrentUnicodeCharacter; + if (((currentUnicodeCharacter&0xff80)==0x0000) && ((currentUnicodeCharacter!=0x0000) || !aGenerateJavaConformantUtf8)) + { + *pointerToCurrentUtf8Byte=STATIC_CAST(TUint8, currentUnicodeCharacter); + } + else if ((currentUnicodeCharacter&0xf800)==0x0000) + { + if (pointerToCurrentUtf8Byte==pointerToLastUtf8Byte) + { + --pointerToCurrentUtf8Byte; + --pointerToCurrentUnicodeCharacter; + break; + } + *pointerToCurrentUtf8Byte=STATIC_CAST(TUint8, 0xc0|(currentUnicodeCharacter>>6)); + ++pointerToCurrentUtf8Byte; + *pointerToCurrentUtf8Byte=STATIC_CAST(TUint8, 0x80|(currentUnicodeCharacter&0x3f)); + } + else if (((currentUnicodeCharacter&0xfc00)==0xd800) && !aGenerateJavaConformantUtf8) + { + __ASSERT_DEBUG(pointerToCurrentUtf8Byte<=pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers2)); + if (pointerToLastUtf8Byte-pointerToCurrentUtf8Byte<3) + { + --pointerToCurrentUtf8Byte; + --pointerToCurrentUnicodeCharacter; + break; + } + __ASSERT_DEBUG(pointerToCurrentUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers4)); + if 
(pointerToCurrentUnicodeCharacter>=pointerToLastUnicodeCharacter) + { + --pointerToCurrentUtf8Byte; + --pointerToCurrentUnicodeCharacter; + inputIsTruncated=ETrue; + break; + } + currentUnicodeCharacter+=0x0040; + *pointerToCurrentUtf8Byte=STATIC_CAST(TUint8, 0xf0|((currentUnicodeCharacter>>8)&0x07)); + ++pointerToCurrentUtf8Byte; + *pointerToCurrentUtf8Byte=STATIC_CAST(TUint8, 0x80|((currentUnicodeCharacter>>2)&0x3f)); + { + TUint currentUtf8Byte=(0x80|((currentUnicodeCharacter&0x03)<<4)); + ++pointerToCurrentUnicodeCharacter; + currentUnicodeCharacter=*pointerToCurrentUnicodeCharacter; + if ((currentUnicodeCharacter&0xfc00)!=0xdc00) + { + return EErrorIllFormedInput; + } + currentUtf8Byte|=((currentUnicodeCharacter>>6)&0x0f); + ++pointerToCurrentUtf8Byte; + *pointerToCurrentUtf8Byte=STATIC_CAST(TUint8, currentUtf8Byte); + } + ++pointerToCurrentUtf8Byte; + *pointerToCurrentUtf8Byte=STATIC_CAST(TUint8, 0x80|(currentUnicodeCharacter&0x3f)); + } + else + { + if (pointerToLastUtf8Byte-pointerToCurrentUtf8Byte<2) + { + --pointerToCurrentUtf8Byte; + --pointerToCurrentUnicodeCharacter; + break; + } + *pointerToCurrentUtf8Byte=STATIC_CAST(TUint8, 0xe0|(currentUnicodeCharacter>>12)); + ++pointerToCurrentUtf8Byte; + *pointerToCurrentUtf8Byte=STATIC_CAST(TUint8, 0x80|((currentUnicodeCharacter>>6)&0x3f)); + ++pointerToCurrentUtf8Byte; + *pointerToCurrentUtf8Byte=STATIC_CAST(TUint8, 0x80|(currentUnicodeCharacter&0x3f)); + } + if ((pointerToCurrentUnicodeCharacter==pointerToLastUnicodeCharacter) || (pointerToCurrentUtf8Byte==pointerToLastUtf8Byte)) + { + break; + } + ++pointerToCurrentUtf8Byte; + ++pointerToCurrentUnicodeCharacter; + } + if ((pointerToCurrentUnicodeCharacter buf; + HBufC16* hBuf = HBufC16::NewLC(length); + TPtr unicode = hBuf->Des(); + + FOREVER + { + TInt unconverted = ConvertToUnicodeFromUtf7(buf, utf7, state); + if( unconverted == EErrorIllFormedInput || unconverted < 0) + User::Leave(KErrCorrupt); + + if (unicode.Length() + buf.Length() > 
unicode.MaxLength()) + { + // Reallocate hBuf + hBuf = hBuf->ReAllocL(unicode.Length() + buf.Length()); + CleanupStack::Pop(); + CleanupStack::PushL(hBuf); + unicode.Set(hBuf->Des()); + } + unicode.Append(buf); + if (unconverted ==0) + break; + utf7.Set(utf7.Right(unconverted)); + } + CleanupStack::Pop(); + return hBuf; + } + + + +/** Converts text encoded using the Unicode transformation format UTF-7 into the +Unicode UCS-2 character set. + +If the conversion is achieved using a series of calls to this function, where +each call starts off where the previous call reached in the input descriptor, +the state of the conversion is stored. The initial value of the state variable +should be set as KStateDefault when the conversion is started, and afterwards +simply passed unchanged into each function call. + +@param aUnicode On return, contains the Unicode encoded output string. +@param aUtf7 The UTF-7 encoded input string. +@param aState For the first call of the function set to KStateDefault. For +subsequent calls, pass in the variable unchanged. +@return The number of unconverted bytes left at the end of the input descriptor, +or one of the error values defined in TError. 
*/ +EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf7(TDes16& aUnicode, + const TDesC8& aUtf7, + TInt& aState) + { + return ConvertToUnicodeFromUtf7(aUnicode, aUtf7, EFalse, aState); + } + +TInt CnvUtfConverter::ConvertToUnicodeFromUtf7(TDes16& aUnicode, + const TDesC8& aUtf7, + TBool aIsImapUtf7, + TInt& aState) + { + if (aUtf7.Length()==0) + { + aUnicode.SetLength(0); + return 0; + } + if (aUnicode.MaxLength()==0) + { + return aUtf7.Length(); + } + const TUint escapeCharacterForStartingBase64Block=EscapeCharacterForStartingBase64Block(aIsImapUtf7); + TUint16* pointerToPreviousUnicodeCharacter=CONST_CAST(TUint16*, aUnicode.Ptr()-1); + const TUint16* pointerToLastUnicodeCharacter=pointerToPreviousUnicodeCharacter+aUnicode.MaxLength(); + const TUint8* pointerToCurrentUtf7Byte=aUtf7.Ptr(); + const TUint8* pointerToLastUtf7Byte=pointerToCurrentUtf7Byte+(aUtf7.Length()-1); + TUint currentUtf7Byte=*pointerToCurrentUtf7Byte; + const TUint KIsInBase64Block=0x80000000u; + TUint bitBuffer=STATIC_CAST(TUint, aState); + TInt numberOfBitsInBuffer=((bitBuffer&0xf0)>>4); + bitBuffer&=~0xf0; // turn off the bits that stored numberOfBitsInBuffer + if (bitBuffer&KIsInBase64Block) + { + __ASSERT_ALWAYS((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4) || ((numberOfBitsInBuffer<16) && (numberOfBitsInBuffer%2==0) && !BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer)), Panic(EPanicBadBitBufferState7)); + __ASSERT_ALWAYS((bitBuffer&~(KIsInBase64Block|0x0000000f))==0, Panic(EPanicBadBitBufferState8)); + } + else + { + __ASSERT_ALWAYS(bitBuffer==0, Panic(EPanicBadBitBufferState9)); + __ASSERT_ALWAYS(numberOfBitsInBuffer==0, Panic(EPanicBadBitBufferState10)); + } + aState=KStateDefault; + if (bitBuffer&KIsInBase64Block) + { + currentUtf7Byte=Base64Decoding(currentUtf7Byte, aIsImapUtf7); + } + TBool inputIsTruncated=EFalse; + FOREVER + { + __ASSERT_DEBUG(pointerToPreviousUnicodeCharacter=16+6) || ((numberOfBitsInBuffer>=16) && 
!BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16))) + { + numberOfBitsInBuffer-=16; + __ASSERT_DEBUG(pointerToPreviousUnicodeCharacter>numberOfBitsInBuffer); + bitBuffer&=((1<=6, Panic(EPanicBadBitBufferState14)); + pointerToCurrentUtf7Byte-=numberOfBitsInBuffer/6; + const TInt newNumberOfBitsInBuffer=numberOfBitsInBuffer%6; + bitBuffer&=~KIsInBase64Block; // temporarily turn off the KIsInBase64Block for the right-shift + bitBuffer>>=(numberOfBitsInBuffer-newNumberOfBitsInBuffer); + bitBuffer|=KIsInBase64Block; // must be turned back on again as the bit-buffer is packed into aState + numberOfBitsInBuffer=newNumberOfBitsInBuffer; + __ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState15)); + } + __ASSERT_DEBUG((numberOfBitsInBuffer<16) && (numberOfBitsInBuffer%2==0), Panic(EPanicBadBitBufferState16)); + aState=STATIC_CAST(TInt, bitBuffer); + aState|=(numberOfBitsInBuffer<<4); + __ASSERT_DEBUG(aState&KIsInBase64Block, Panic(EPanicBadBitBufferState17)); + bitBuffer=0; + numberOfBitsInBuffer=0; + } + if ((pointerToCurrentUtf7Byte buf; + HBufC16* hBuf = HBufC16::NewLC(length); + TPtr unicode = hBuf->Des(); + + FOREVER + { + TInt unconverted = ConvertToUnicodeFromUtf8(buf, utf8); + if( unconverted == EErrorIllFormedInput || unconverted < 0) + User::Leave(KErrCorrupt); + + if (unicode.Length() + buf.Length() > unicode.MaxLength()) + { + // Reallocate hBuf + hBuf = hBuf->ReAllocL(unicode.Length() + buf.Length()); + CleanupStack::Pop(); + CleanupStack::PushL(hBuf); + unicode.Set(hBuf->Des()); + } + unicode.Append(buf); + if (unconverted ==0) + break; + utf8.Set(utf8.Right(unconverted)); + } + CleanupStack::Pop(); + return hBuf; + } + +/** Converts text encoded using the Unicode transformation format UTF-8 into the +Unicode UCS-2 character set. + +@param aUnicode On return, contains the Unicode encoded output string. 
+@param aUtf8 The UTF-8 encoded input string
+@return The number of unconverted bytes left at the end of the input descriptor,
+or one of the error values defined in TError. */
+EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8)
+	{
+	// Two-argument form: always orthodox (non-Java) UTF-8.
+	return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, EFalse);
+	}
+
+/** Notes that one more unconvertible character has been met.
+
+While the running count is still zero or negative, i.e. for the first
+unconvertible character, the position of that character is recorded as well.
+
+@param aNumberOfUnconvertibleCharacters Running count; incremented by one.
+@param aIndexOfFirstByteOfFirstUnconvertibleCharacter Set to aIndex only when
+the running count is <= 0 on entry.
+@param aIndex Offset of the offending byte from the start of the input. This
+is a TInt: it used to be a TUint8, which silently wrapped every offset above
+255 before it was stored in the TInt index. */
+static void UpdateUnconvertibleInfo(TInt& aNumberOfUnconvertibleCharacters,
+		TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TInt aIndex)
+	{
+	if (aNumberOfUnconvertibleCharacters<=0)
+		{
+		aIndexOfFirstByteOfFirstUnconvertibleCharacter = aIndex;
+		}
+	++aNumberOfUnconvertibleCharacters;
+	}
+
+/** Converts text encoded using the Unicode transformation format UTF-8 into the
+Unicode UCS-2 character set.
+
+@param aUnicode On return, contains the Unicode encoded output string.
+@param aUtf8 The UTF-8 encoded input string
+@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java
+UTF-8.
+@return The number of unconverted bytes left at the end of the input descriptor,
+or one of the error values defined in TError. */
+TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8)
+	{
+	// Callers of this overload do not want the unconvertible-character statistics,
+	// but UpdateUnconvertibleInfo() reads and increments the counter, so the
+	// throw-away variables must start from defined values rather than stack garbage.
+	TInt dummyUnconverted=0;
+	TInt dummyUnconvertedIndex=-1;
+	return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, aGenerateJavaConformantUtf8, dummyUnconverted, dummyUnconvertedIndex);
+	}
+
+/** Converts text encoded using the Unicode transformation format UTF-8 into the
+Unicode UCS-2 character set. Surrogate pairs can be created when a valid 4 byte UTF-8 is input.
+
+The variant of UTF-8 used internally by Java differs slightly from standard
+UTF-8. The TBool argument controls the UTF-8 variant generated by this function.
+
+@param aUnicode On return, contains the Unicode encoded output string.
+@param aUtf8 The UTF-8 encoded input string
+@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java
+UTF-8. The default is EFalse.
+@param aNumberOfUnconvertibleCharacters On return, contains the number of bytes
+which were not converted.
+@param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index
+of the first byte of the first unconvertible character. For instance if the
+first character in the input descriptor (aUtf8) could not be converted,
+then this parameter is set to the index of the first byte of that character, i.e. zero.
+A negative value is returned if all the characters were converted.
+@return The number of unconverted bytes left at the end of the input descriptor,
+or one of the error values defined in TError. */
+
+/* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7
+ * Well formed UTF-8 Byte Sequences, full table.
+ * +----------------------------------------------------------------+
+ * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
+ * +--------------------+----------+----------+----------+----------+
+ * | U+0000..U+007F     | 00..7F   |          |          |          | 1 byte, ascii
+ * | U+0080..U+07FF     | C2..DF   | 80..BF   |          |          | 2 bytes, error if 1st < 0xC2
+ * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0
+ * | U+1000..U+CFFF     | E1..EC   | 80..BF   | 80..BF   |          | normal
+ * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          | 3 bytes, 1st == 0xED, error if 2nd > 0x9F
+ * | U+E000..U+FFFF     | EE..EF   | 80..BF   | 80..BF   |          | normal
+ * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   | 4 bytes, 1st == 0xf0, error if 2nd < 0x90
+ * | U+40000..U+FFFFF   | F1..F3   | 80..BF   | 80..BF   | 80..BF   | normal
+ * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F
+ * +--------------------+----------+----------+----------+----------+
+ *
+ * As a consequence of the well-formedness conditions specified in table 3-7,
+ * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
+ */ +TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8, + TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter) + { + aUnicode.SetLength(0); + if (aUtf8.Length()==0) + { + return 0; + } + if (aUnicode.MaxLength()==0) + { + return aUtf8.Length(); + } + + TUint16* pointerToCurrentUnicodeCharacter=CONST_CAST(TUint16*, aUnicode.Ptr()); + const TUint16* pointerToLastUnicodeCharacter=pointerToCurrentUnicodeCharacter+(aUnicode.MaxLength()-1); + const TUint8* pointerToCurrentUtf8Byte=aUtf8.Ptr(); + const TUint8* pointerToPendingUtf8Byte=aUtf8.Ptr(); + const TUint8* pointerToLastUtf8Byte=pointerToCurrentUtf8Byte+(aUtf8.Length()-1); + TUint16 replacementcharacter = 0xFFFD; + TUint8 currentUtf8Byte; + TUint currentUnicodeCharacter; + TInt sequenceLength; + + FOREVER + { + __ASSERT_DEBUG(pointerToCurrentUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers8)); + __ASSERT_DEBUG(pointerToCurrentUtf8Byte<=pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers3)); + currentUtf8Byte=*pointerToCurrentUtf8Byte; + pointerToPendingUtf8Byte = pointerToCurrentUtf8Byte; + sequenceLength=100; + + for(TInt i=0;i<7;i++) + { + if ((currentUtf8Byte&(0xf8<6) && sequenceLength!=0) + { + currentUnicodeCharacter=replacementcharacter; + UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, + aIndexOfFirstByteOfFirstUnconvertibleCharacter, pointerToCurrentUtf8Byte-aUtf8.Ptr()); + } + else + { + if ((pointerToLastUtf8Byte-pointerToCurrentUtf8Byte+1)>sequenceLength); + + for(TInt i=sequenceLength;i>1; i--) + { + currentUtf8Byte = *(++pointerToCurrentUtf8Byte); + if ((currentUtf8Byte&0xc0)==0x80) + { + currentUnicodeCharacter = (currentUnicodeCharacter<<6)|(currentUtf8Byte&0x3F); + } + else + { + currentUnicodeCharacter=replacementcharacter; + UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, + aIndexOfFirstByteOfFirstUnconvertibleCharacter, 
pointerToCurrentUtf8Byte-aUtf8.Ptr()); + --pointerToCurrentUtf8Byte; + } + } + } + + if (currentUnicodeCharacter > 0xFFFF) + { + if(pointerToCurrentUnicodeCharacter>=pointerToLastUnicodeCharacter) + { + pointerToCurrentUtf8Byte=pointerToPendingUtf8Byte; + break; + } + + TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0; + *pointerToCurrentUnicodeCharacter=STATIC_CAST(TUint16, surrogate); + ++pointerToCurrentUnicodeCharacter; + + surrogate = (currentUnicodeCharacter&0x3FF)+0xDC00; + *pointerToCurrentUnicodeCharacter=STATIC_CAST(TUint16, surrogate); + ++pointerToCurrentUnicodeCharacter; + ++pointerToCurrentUtf8Byte; + } + else + { + *pointerToCurrentUnicodeCharacter=STATIC_CAST(TUint16, currentUnicodeCharacter); + ++pointerToCurrentUnicodeCharacter; + ++pointerToCurrentUtf8Byte; + } + + if ((pointerToCurrentUtf8Byte>pointerToLastUtf8Byte) || (pointerToCurrentUnicodeCharacter>pointerToLastUnicodeCharacter)) + { + break; + } + } + + aUnicode.SetLength(pointerToCurrentUnicodeCharacter-aUnicode.Ptr()); + return pointerToLastUtf8Byte-pointerToCurrentUtf8Byte+1; + } + + +GLREF_C void IsCharacterSetUTF8 (TInt& aConfidenceLevel, const TDesC8& aSample) + { + + TInt sampleLength = aSample.Length(); + if (sampleLength == 0) + { + aConfidenceLevel = 89; + return; + } + aConfidenceLevel=sampleLength; + TInt bytesRemaining=0; + + const TUint8* buffer=&aSample[0]; + for(TInt index=0; index!=sampleLength; ++index) + { + if(bytesRemaining>0) + { + // bytesRemaining > 0, means that a byte representing the start of a + // multibyte sequence was encountered and the bytesRemaining is the + // number of bytes to follow. 
The remaining bytes have to conform to + // values within the range 0x80 and 0xbf + if((buffer[index]&0xc0)==0x80) // the value is within range + { + --bytesRemaining; + continue; + } + else + { + bytesRemaining=0; + aConfidenceLevel=0; + break; + } + } + if (bytesRemaining==0) + { + if((buffer[index]&0x80)==0x00) + { + // The value of aSample[index] is in the range 0x00-0x7f + //UTF8 maintains ASCII transparency. So it's a valid + //UTF8. Do nothing, check next value. + } + else if((buffer[index]&0xe0)==0xc0) + { + bytesRemaining=1; + } + else if((buffer[index]&0xf0)==0xe0) + { + bytesRemaining=2; + } + else if((buffer[index]&0xf8)==0xf0) + { + bytesRemaining=3; + } + else + { + // wasn't anything expected so must be an illegal/irregular UTF8 coded value + aConfidenceLevel=0; + break; + } + } + } // for + aConfidenceLevel = (aConfidenceLevel > 0)?100:0; + } + +GLREF_C void IsCharacterSetUTF7(TInt& aConfidenceLevel, const TDesC8& aSample) + { + TInt sampleLength = aSample.Length(); + aConfidenceLevel = 70; + for (TInt i=0; i= sampleLength) + break; + for (TInt j=0; j < 11; ++j) + { + if (aSample[increment1] == smsExtensionTable[j]) + { + aConfidenceLevel-=10; + } + } + } + // The UTF-7 escape char is 0x2b. The values that follow the escape sequence + // the values following the escape char value must belong to the modified base64 + // or '-' else it is an ill-formed sequence, so probably not UTF-7 + else if ( (aSample[i]==0x2b) && (i = 0x41) && (aSample[increment1] <= 0x5a)) || + ((aSample[increment1] >= 0x61) && (aSample[increment1] <= 0x7a))) + { + aConfidenceLevel+=5; + } + else + { + aConfidenceLevel-=15; + } + i++; // should this be here or up in the if loop ?? + } + } //for + aConfidenceLevel =(aConfidenceLevel >0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0; + }