|
1 /* |
|
2 * Copyright (c) 1997-2004 Nokia Corporation and/or its subsidiary(-ies). |
|
3 * All rights reserved. |
|
4 * This component and the accompanying materials are made available |
|
5 * under the terms of "Eclipse Public License v1.0" |
|
6 * which accompanies this distribution, and is available |
|
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 * |
|
9 * Initial Contributors: |
|
10 * Nokia Corporation - initial contribution. |
|
11 * |
|
12 * Contributors: |
|
13 * |
|
14 * Description: |
|
15 * |
|
16 */ |
|
17 |
|
18 |
|
19 |
|
20 |
|
21 |
|
22 |
|
23 |
|
24 |
|
25 #include <e32std.h> |
|
26 #include <e32base.h> |
|
27 #include <utf.h> |
|
28 |
|
// Sentinel value returned by Base64Decoding() when the byte passed in is not a member of the (modified) base-64 alphabet.
29 const TUint KNotInBase64Alphabet=KMaxTUint; |
|
30 |
|
/** Reason codes passed to Panic() by the converters in this file.

The numeric values are spelled out so that a panic number seen at run time can
be matched to its enumerator without counting. */
enum TPanic
	{
	// Base64Encoding() was handed a value that does not fit in 6 bits.
	EPanicBad6BitNumber = 1,

	// UTF-7 byte pointers.
	EPanicBadUtf7Pointers1 = 2,
	EPanicBadUtf7Pointers2 = 3,
	EPanicBadUtf7Pointers3 = 4,
	EPanicBadUtf7Pointers4 = 5,
	EPanicBadUtf7Pointers5 = 6,
	EPanicBadUtf7Pointers6 = 7,
	EPanicBadUtf7Pointers7 = 8,
	EPanicBadUtf7Pointers8 = 9,
	EPanicBadUtf7Pointers9 = 10,
	EPanicBadUtf7Pointers10 = 11,
	EPanicBadUtf7Pointers11 = 12,

	// No escape character was found when searching backwards through a base-64 block.
	EPanicNotInBase64Block = 13,

	// Unicode character pointers.
	EPanicBadUnicodePointers1 = 14,
	EPanicBadUnicodePointers2 = 15,
	EPanicBadUnicodePointers3 = 16,
	EPanicBadUnicodePointers4 = 17,
	EPanicBadUnicodePointers5 = 18,
	EPanicBadUnicodePointers6 = 19,
	EPanicBadUnicodePointers7 = 20,
	EPanicBadUnicodePointers8 = 21,
	EPanicBadUnicodePointers9 = 22,
	EPanicBadUnicodePointers10 = 23,

	// Bit-buffer state used while encoding/decoding base-64.
	EPanicBadBitBufferState1 = 24,
	EPanicBadBitBufferState2 = 25,
	EPanicBadBitBufferState3 = 26,
	EPanicBadBitBufferState4 = 27,
	EPanicBadBitBufferState5 = 28,
	EPanicBadBitBufferState6 = 29,
	EPanicBadBitBufferState7 = 30,
	EPanicBadBitBufferState8 = 31,
	EPanicBadBitBufferState9 = 32,
	EPanicBadBitBufferState10 = 33,
	EPanicBadBitBufferState11 = 34,
	EPanicBadBitBufferState12 = 35,
	EPanicBadBitBufferState13 = 36,
	EPanicBadBitBufferState14 = 37,
	EPanicBadBitBufferState15 = 38,
	EPanicBadBitBufferState16 = 39,
	EPanicBadBitBufferState17 = 40,

	// Consistency checks made while shortening an unterminated base-64 block.
	EPanicUnexpectedNumberOfLoopIterations = 41,
	EPanicInitialEscapeCharacterButNoBase64 = 42,
	EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary = 43,

	// UTF-8 byte pointers.
	EPanicBadUtf8Pointers1 = 44,
	EPanicBadUtf8Pointers2 = 45,
	EPanicBadUtf8Pointers3 = 46,
	EPanicBadUtf8Pointers4 = 47,
	EPanicBadUtf8Pointers5 = 48,
	EPanicBadUtf8Pointers6 = 49,
	EPanicBadUtf8Pointers7 = 50,

	// The cached UTF-7 byte no longer matches the byte under the read pointer.
	EPanicOutOfSyncUtf7Byte1 = 51,
	EPanicOutOfSyncUtf7Byte2 = 52,
	EPanicOutOfSyncBase64Decoding = 53
	};
|
87 |
|
88 _LIT(KLitPanicText, "CHARCONV-UTF"); |
|
89 |
|
90 LOCAL_C void Panic(TPanic aPanic) |
|
91 { |
|
92 User::Panic(KLitPanicText, aPanic); |
|
93 } |
|
94 |
|
95 inline TUint EscapeCharacterForStartingBase64Block(TBool aIsImapUtf7) {return aIsImapUtf7? '&': '+';} |
|
96 |
|
97 LOCAL_C TUint Base64Decoding(TUint aMemberOfBase64Alphabet, TBool aIsImapUtf7) |
|
98 { |
|
99 if ((aMemberOfBase64Alphabet>='A') && (aMemberOfBase64Alphabet<='Z')) |
|
100 { |
|
101 return aMemberOfBase64Alphabet-'A'; |
|
102 } |
|
103 if ((aMemberOfBase64Alphabet>='a') && (aMemberOfBase64Alphabet<='z')) |
|
104 { |
|
105 return aMemberOfBase64Alphabet-('a'-26); |
|
106 } |
|
107 if ((aMemberOfBase64Alphabet>='0') && (aMemberOfBase64Alphabet<='9')) |
|
108 { |
|
109 return aMemberOfBase64Alphabet+((26*2)-'0'); |
|
110 } |
|
111 if (aMemberOfBase64Alphabet=='+') |
|
112 { |
|
113 return 62; |
|
114 } |
|
115 if (aMemberOfBase64Alphabet==STATIC_CAST(TUint, aIsImapUtf7? ',': '/')) |
|
116 { |
|
117 return 63; |
|
118 } |
|
119 return KNotInBase64Alphabet; |
|
120 } |
|
121 |
|
122 LOCAL_C TUint Base64Encoding(TUint a6BitNumber, TBool aIsImapUtf7) |
|
123 { |
|
124 __ASSERT_DEBUG(a6BitNumber<64, Panic(EPanicBad6BitNumber)); |
|
125 if ((a6BitNumber==63) && aIsImapUtf7) |
|
126 { |
|
127 return ','; |
|
128 } |
|
129 static const TUint8 base64Alphabet[64]={'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'}; |
|
130 return base64Alphabet[a6BitNumber]; |
|
131 } |
|
132 |
|
133 LOCAL_C TUint8* PointerToEscapeCharacterStartingBase64Block(TUint8* aPointerToUtf7Byte, const TUint8* aPointerToFirstUtf7Byte, TBool aIsImapUtf7) |
|
134 { |
|
135 __ASSERT_DEBUG(aPointerToUtf7Byte>=aPointerToFirstUtf7Byte, Panic(EPanicBadUtf7Pointers1)); |
|
136 TUint8* pointerToCandidateEscapeCharacter=NULL; |
|
137 FOREVER |
|
138 { |
|
139 const TUint utf7Byte=*aPointerToUtf7Byte; |
|
140 if (utf7Byte==EscapeCharacterForStartingBase64Block(aIsImapUtf7)) |
|
141 { |
|
142 pointerToCandidateEscapeCharacter=aPointerToUtf7Byte; |
|
143 } |
|
144 else if (Base64Decoding(utf7Byte, aIsImapUtf7)==KNotInBase64Alphabet) |
|
145 { |
|
146 break; |
|
147 } |
|
148 __ASSERT_DEBUG(aPointerToUtf7Byte>=aPointerToFirstUtf7Byte, Panic(EPanicBadUtf7Pointers2)); |
|
149 if (aPointerToUtf7Byte<=aPointerToFirstUtf7Byte) |
|
150 { |
|
151 break; |
|
152 } |
|
153 --aPointerToUtf7Byte; |
|
154 } |
|
155 __ASSERT_DEBUG(pointerToCandidateEscapeCharacter!=NULL, Panic(EPanicNotInBase64Block)); |
|
156 return pointerToCandidateEscapeCharacter; |
|
157 } |
|
158 |
|
159 LOCAL_C TBool EncodeInUtf7Directly(TUint aUnicodeCharacter, TBool aIsImapUtf7, TBool aEncodeOptionalDirectCharactersInBase64) |
|
160 { |
|
161 if (aIsImapUtf7) |
|
162 { |
|
163 return (aUnicodeCharacter>=0x0020) && (aUnicodeCharacter<=0x007e); |
|
164 } |
|
165 if ((aUnicodeCharacter>=0x0021) && (aUnicodeCharacter<=0x007d)) |
|
166 { |
|
167 if (aEncodeOptionalDirectCharactersInBase64) |
|
168 { |
|
169 return (((aUnicodeCharacter>=0x0041) && (aUnicodeCharacter<=0x005a)) || |
|
170 ((aUnicodeCharacter>=0x0061) && (aUnicodeCharacter<=0x007a)) || |
|
171 ((aUnicodeCharacter>=0x0027) && (aUnicodeCharacter<=0x0029)) || |
|
172 ((aUnicodeCharacter>=0x002b) && (aUnicodeCharacter<=0x003a)) || |
|
173 (aUnicodeCharacter==0x003f)); |
|
174 } |
|
175 return aUnicodeCharacter!=0x005c; |
|
176 } |
|
177 return (aUnicodeCharacter==0x0020) || (aUnicodeCharacter==0x0009) || (aUnicodeCharacter==0x000d) || (aUnicodeCharacter==0x000a); |
|
178 } |
|
179 |
|
180 inline TBool BitBufferContainsNonZeroBits(TUint aBitBuffer, TInt aNumberOfBitsInBuffer) |
|
181 { |
|
182 return (aBitBuffer&((1<<aNumberOfBitsInBuffer)-1))!=0; |
|
183 } |
|
184 |
|
185 |
|
186 |
|
187 /** Converts Unicode text into UTF-7 encoding. The fucntion leaves with |
|
188 KErrCorrupt if the input string is corrupt. |
|
189 |
|
190 @param aUnicode A UCS-2 encoded input string. |
|
191 @param aEncodeOptionalDirectCharactersInBase64 If ETrue then |
|
192 characters from UTF-7 set O (optional direct characters) are encoded in |
|
193 Modified Base64. If EFalse the characters are encoded directly, |
|
194 as their ASCII equivalents. |
|
195 @return A descriptor containing the UTF-7 encoded output string. */ |
|
196 EXPORT_C HBufC8* CnvUtfConverter::ConvertFromUnicodeToUtf7L( |
|
197 const TDesC16& aUnicode, |
|
198 TBool aEncodeOptionalDirectCharactersInBase64) |
|
199 { |
|
200 // If aUnicode is Null string, return an empty HBufC |
|
201 if (aUnicode.Length() == 0) |
|
202 { |
|
203 HBufC8* hBuf8 = HBufC8::NewL(1); |
|
204 return hBuf8; |
|
205 } |
|
206 |
|
207 // Otherwise, convert and store result in a buffer, reallocating that buffer if needed. |
|
208 TInt length = aUnicode.Length(); |
|
209 const TInt bufsize = 100; |
|
210 |
|
211 TPtrC16 unicode (aUnicode); |
|
212 TBuf8<bufsize> buf; |
|
213 HBufC8* hBuf8 = HBufC8::NewLC(length); |
|
214 TPtr8 utf7 = hBuf8->Des(); |
|
215 |
|
216 FOREVER |
|
217 { |
|
218 TInt unconverted = ConvertFromUnicodeToUtf7(buf, unicode, aEncodeOptionalDirectCharactersInBase64); |
|
219 if( unconverted == EErrorIllFormedInput || unconverted < 0) |
|
220 User::Leave(KErrCorrupt); |
|
221 |
|
222 if (utf7.Length() + buf.Length() > utf7.MaxLength()) |
|
223 { |
|
224 // Reallocate the hBuf8 |
|
225 hBuf8 = hBuf8->ReAllocL(utf7.Length() + buf.Length()); |
|
226 CleanupStack::Pop(); |
|
227 CleanupStack::PushL(hBuf8); |
|
228 utf7.Set(hBuf8->Des()); |
|
229 } |
|
230 utf7.Append(buf); |
|
231 if (unconverted ==0) |
|
232 break; |
|
233 unicode.Set(unicode.Right(unconverted)); |
|
234 } |
|
235 CleanupStack::Pop(); |
|
236 return hBuf8; |
|
237 |
|
238 } |
|
239 |
|
240 /** Converts Unicode text into UTF-7 encoding. |
|
241 |
|
242 @param aUtf7 On return, contains the UTF-7 encoded output string. |
|
243 @param aUnicode A UCS-2 encoded input string. |
|
244 @param aEncodeOptionalDirectCharactersInBase64 If ETrue then characters from |
|
245 UTF-7 set O (optional direct characters) are encoded in Modified Base64. If |
|
246 EFalse the characters are encoded directly, as their ASCII equivalents. |
|
247 @return The number of unconverted characters left at the end of the input |
|
248 descriptor, or one of the error values defined in TError. */ |
|
249 EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf7( |
|
250 TDes8& aUtf7, |
|
251 const TDesC16& aUnicode, |
|
252 TBool aEncodeOptionalDirectCharactersInBase64) |
|
253 { |
|
254 return ConvertFromUnicodeToUtf7(aUtf7, aUnicode, EFalse, aEncodeOptionalDirectCharactersInBase64); |
|
255 } |
|
256 |
|
/** Converts Unicode text into UTF-7, or into the IMAP variant of UTF-7.

Characters for which EncodeInUtf7Directly() returns true are copied to the output
as single bytes (the escape character itself is written followed by '-'). Every
other character is added to a base-64 block that is opened with the escape
character and closed with '-'. Conversion stops early when aUtf7 has no room for
the next piece of output; a base-64 block that is still open at that point is
either closed in the remaining space or shortened until it can be closed.

@param aUtf7 On return, contains the encoded output string. It is left untouched
if its maximum length is zero and aUnicode is not empty.
@param aUnicode A UCS-2 encoded input string.
@param aIsImapUtf7 Selects the IMAP variant: the value is passed to
EscapeCharacterForStartingBase64Block(), Base64Encoding() and Base64Decoding().
@param aEncodeOptionalDirectCharactersInBase64 Passed to EncodeInUtf7Directly()
to decide which characters are written directly.
@return The number of unconverted characters left at the end of the input
descriptor. */
257 TInt CnvUtfConverter::ConvertFromUnicodeToUtf7(TDes8& aUtf7, |
|
258 const TDesC16& aUnicode, |
|
259 TBool aIsImapUtf7, |
|
260 TBool aEncodeOptionalDirectCharactersInBase64) |
|
261 { |
|
262 if (aUnicode.Length()==0) |
|
263 { |
|
264 aUtf7.SetLength(0); |
|
265 return 0; |
|
266 } |
|
267 if (aUtf7.MaxLength()==0) |
|
268 { |
|
269 return aUnicode.Length(); |
|
270 } |
|
271 const TUint escapeCharacterForStartingBase64Block=EscapeCharacterForStartingBase64Block(aIsImapUtf7); |
|
// Both buffers are tracked with "previous element" pointers: each starts one element before its buffer, and each "last" pointer addresses the final usable element.
272 TUint8* pointerToPreviousUtf7Byte=CONST_CAST(TUint8*, aUtf7.Ptr()-1); |
|
273 const TUint8* const pointerToLastUtf7Byte=pointerToPreviousUtf7Byte+aUtf7.MaxLength(); |
|
274 const TUint16* pointerToPreviousUnicodeCharacter=aUnicode.Ptr()-1; |
|
275 const TUint16* const pointerToLastUnicodeCharacter=pointerToPreviousUnicodeCharacter+aUnicode.Length(); |
|
// The top bit of bitBuffer flags that a base-64 block is currently open; the low bits hold input bits that have not been written out yet.
276 const TUint KIsInBase64Block=0x80000000u; |
|
277 TUint bitBuffer=0; |
|
278 TInt numberOfBitsInBuffer=0; |
|
279 FOREVER |
|
280 { |
|
281 __ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers3)); |
|
282 __ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers1)); |
|
// Once the input is exhausted a zero stands in for the current character; the branch below is then entered only to close an open base-64 block before leaving the loop.
283 TUint currentUnicodeCharacter=(pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter)? 0: *(pointerToPreviousUnicodeCharacter+1); |
|
284 if ((pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter) || EncodeInUtf7Directly(currentUnicodeCharacter, aIsImapUtf7, aEncodeOptionalDirectCharactersInBase64)) |
|
285 { |
|
286 __ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (numberOfBitsInBuffer==0), Panic(EPanicBadBitBufferState1)); |
|
287 __ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState2)); |
|
// Leaving a base-64 block: write any leftover bits as one final base-64 character (shifted up to fill its 6 bits), then the terminating '-'.
288 if (bitBuffer&KIsInBase64Block) |
|
289 { |
|
290 if (numberOfBitsInBuffer!=0) |
|
291 { |
|
292 if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<2) // make sure there is enough space for the trailing '-' as well as the remains of the bitBuffer as the KIsInBase64Block flag is about to turned off, thus the trailing '-' may never get written |
|
293 { |
|
294 break; |
|
295 } |
|
296 ++pointerToPreviousUtf7Byte; |
|
297 *pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer<<(6-numberOfBitsInBuffer))&0x3f, aIsImapUtf7)); |
|
298 } |
|
299 else |
|
300 { |
|
301 if (pointerToPreviousUtf7Byte==pointerToLastUtf7Byte) |
|
302 { |
|
303 break; |
|
304 } |
|
305 } |
|
306 ++pointerToPreviousUtf7Byte; |
|
307 *pointerToPreviousUtf7Byte='-'; |
|
308 bitBuffer=0; |
|
309 numberOfBitsInBuffer=0; |
|
310 } |
|
311 __ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers2)); |
|
312 if (pointerToPreviousUnicodeCharacter>=pointerToLastUnicodeCharacter) |
|
313 { |
|
314 break; |
|
315 } |
|
316 __ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers4)); |
|
// The escape character needs two bytes of room because it is written as itself followed by '-'.
317 if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<((currentUnicodeCharacter==escapeCharacterForStartingBase64Block)? 2: 1)) |
|
318 { |
|
319 break; |
|
320 } |
|
321 ++pointerToPreviousUtf7Byte; |
|
322 *pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, currentUnicodeCharacter); |
|
323 ++pointerToPreviousUnicodeCharacter; |
|
324 if (currentUnicodeCharacter==escapeCharacterForStartingBase64Block) |
|
325 { |
|
326 ++pointerToPreviousUtf7Byte; |
|
327 *pointerToPreviousUtf7Byte='-'; |
|
328 } |
|
329 } |
|
330 else |
|
331 { |
|
332 { |
|
333 TInt numberOfUtf7BytesRequired=(numberOfBitsInBuffer+16)/6; // "(numberOfBitsInBuffer+16)/6" is the number of iterations that will happen in the while loop below |
|
334 if (~bitBuffer&KIsInBase64Block) |
|
335 { |
|
336 ++numberOfUtf7BytesRequired; // for the initial escapeCharacterForStartingBase64Block |
|
337 } |
|
338 if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<numberOfUtf7BytesRequired) |
|
339 { |
|
340 break; |
|
341 } |
|
342 } |
|
343 if (~bitBuffer&KIsInBase64Block) |
|
344 { |
|
345 __ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers5)); |
|
346 ++pointerToPreviousUtf7Byte; |
|
347 *pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, escapeCharacterForStartingBase64Block); |
|
348 } |
|
// Append the 16 bits of the character to the bit buffer, then write out every complete group of 6 bits.
349 bitBuffer<<=16; |
|
350 bitBuffer|=currentUnicodeCharacter; |
|
351 numberOfBitsInBuffer+=16; |
|
352 ++pointerToPreviousUnicodeCharacter; |
|
353 __ASSERT_DEBUG(numberOfBitsInBuffer<=20, Panic(EPanicBadBitBufferState3)); |
|
354 while (numberOfBitsInBuffer>=6) |
|
355 { |
|
356 numberOfBitsInBuffer-=6; |
|
357 __ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers6)); |
|
358 ++pointerToPreviousUtf7Byte; |
|
359 *pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer>>numberOfBitsInBuffer)&0x3f, aIsImapUtf7)); |
|
360 } |
|
361 bitBuffer&=((1<<numberOfBitsInBuffer)-1); // zero all the consumed bits - not strictly necessary but it leaves the buffer in a cleaner state |
|
362 bitBuffer|=KIsInBase64Block; |
|
363 } |
|
364 } |
|
365 __ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (numberOfBitsInBuffer==0), Panic(EPanicBadBitBufferState4)); |
|
366 __ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState5)); |
|
// The main loop ended with a base-64 block still open: either close it in the space that remains, or shorten it until it can be closed.
367 if (bitBuffer&KIsInBase64Block) |
|
368 { |
|
369 #if defined(_DEBUG) |
|
370 TInt numberOfLoopIterations=1; |
|
371 #endif |
|
372 FOREVER // there should never be more than 2 iterations of this loop - the first "if" should always succeed the second time if it doesn't succeed the first time |
|
373 { |
|
374 __ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers7)); |
|
375 __ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState6)); |
|
376 __ASSERT_DEBUG(numberOfLoopIterations<=2, Panic(EPanicUnexpectedNumberOfLoopIterations)); |
|
377 #if defined(_DEBUG) |
|
378 ++numberOfLoopIterations; |
|
379 #endif |
|
380 if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte>=((numberOfBitsInBuffer==0)? 1: 2)) // if there's room to finish off the base-64 sequence by (i) flushing the bit-buffer and (ii) appending the trailing '-' |
|
381 { |
|
382 if (numberOfBitsInBuffer!=0) |
|
383 { |
|
384 __ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers8)); |
|
385 ++pointerToPreviousUtf7Byte; |
|
386 *pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer<<(6-numberOfBitsInBuffer))&0x3f, aIsImapUtf7)); |
|
387 } |
|
388 __ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers9)); |
|
389 ++pointerToPreviousUtf7Byte; |
|
390 *pointerToPreviousUtf7Byte='-'; |
|
391 break; |
|
392 } |
|
393 // it is now necessary to move back pointerToPreviousUtf7Byte so that the base-64 sequence can be terminated - note it must be terminated on a Unicode character boundary hence the reason why pointerToPreviousUnicodeCharacter may be moved back too |
|
394 TUint8* pointerToEscapeCharacterStartingBase64Block=PointerToEscapeCharacterStartingBase64Block(pointerToPreviousUtf7Byte, aUtf7.Ptr(), aIsImapUtf7); |
|
395 const TInt oldNumberOfBase64Characters=pointerToPreviousUtf7Byte-pointerToEscapeCharacterStartingBase64Block; |
|
396 __ASSERT_DEBUG(oldNumberOfBase64Characters>0, Panic(EPanicInitialEscapeCharacterButNoBase64)); |
|
397 __ASSERT_DEBUG(((oldNumberOfBase64Characters*6)+numberOfBitsInBuffer)%16==0, Panic(EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary)); |
|
398 pointerToPreviousUnicodeCharacter-=((oldNumberOfBase64Characters*6)+numberOfBitsInBuffer)/16; // move back pointerToPreviousUnicodeCharacter to before the equivalent of the base-64 sequence |
|
399 pointerToPreviousUtf7Byte=pointerToEscapeCharacterStartingBase64Block; |
|
400 __ASSERT_DEBUG(*pointerToPreviousUtf7Byte==escapeCharacterForStartingBase64Block, Panic(EPanicBadUtf7Pointers10)); |
|
401 if (oldNumberOfBase64Characters<4) // if the new base-64 sequence will be so short that it won't even be able to contain the UTF-7 encoding of a single Unicode character |
|
402 { |
|
403 --pointerToPreviousUtf7Byte; // move back pointerToPreviousUtf7Byte to before the escapeCharacterForStartingBase64Block |
|
404 break; |
|
405 } |
|
// A Unicode character occupies 16 bits, i.e. 8/3 base-64 characters; the "-1" presumably holds one byte back for the terminating '-' (NOTE(review): confirm).
406 const TInt newNumberOfUnicodeCharacters=((oldNumberOfBase64Characters-1)*3)/8; |
|
407 pointerToPreviousUnicodeCharacter+=newNumberOfUnicodeCharacters; |
|
408 pointerToPreviousUtf7Byte+=((newNumberOfUnicodeCharacters*8)+2)/3; |
|
// The last base-64 character that is kept may carry low-order bits of the Unicode character that has just been dropped; those bits are cleared here.
409 const TInt numberOfBitsToBeZeroedInLastBase64Character=(newNumberOfUnicodeCharacters%3)*2; |
|
410 if (numberOfBitsToBeZeroedInLastBase64Character!=0) |
|
411 { |
|
412 *pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding(Base64Decoding(*pointerToPreviousUtf7Byte, aIsImapUtf7)&0x3f&~((1<<numberOfBitsToBeZeroedInLastBase64Character)-1), aIsImapUtf7)); |
|
413 } |
|
414 bitBuffer=KIsInBase64Block; |
|
415 numberOfBitsInBuffer=0; |
|
416 } |
|
417 } |
|
// pointerToPreviousUtf7Byte addresses the last byte written (or one before the buffer if nothing was written), hence the "+1".
418 aUtf7.SetLength((pointerToPreviousUtf7Byte-aUtf7.Ptr())+1); |
|
419 return pointerToLastUnicodeCharacter-pointerToPreviousUnicodeCharacter; |
|
420 } |
|
421 |
|
422 |
|
423 |
|
424 /** Converts Unicode text into UTF-8 encoding. |
|
425 |
|
426 @param aUtf8 On return, contains the UTF-8 encoded output string. |
|
427 @param aUnicode The Unicode-encoded input string. |
|
428 @return The number of unconverted characters left at the end of the input |
|
429 descriptor, or one of the error values defined in TError. */ |
|
430 EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, const TDesC16& aUnicode) |
|
431 { |
|
432 return ConvertFromUnicodeToUtf8(aUtf8, aUnicode, EFalse); |
|
433 } |
|
434 |
|
435 |
|
436 /** Converts Unicode text into UTF-8 encoding. |
|
437 |
|
438 The variant of UTF-8 used internally by Java differs slightly from |
|
439 standard UTF-8. The TBool argument controls the UTF-8 |
|
440 variant generated by this function. This function leaves with a |
|
441 KErrCorrupt if the input string is corrupt. |
|
442 |
|
443 @param aUnicode A UCS-2 encoded input string. |
|
444 @return A pointer to an HBufC8 containing the converted UTF8. */ |
|
445 EXPORT_C HBufC8* CnvUtfConverter::ConvertFromUnicodeToUtf8L(const TDesC16& aUnicode) |
|
446 { |
|
447 // If aUnicode is Null string, return an empty HBufC |
|
448 if (aUnicode.Length() == 0) |
|
449 { |
|
450 HBufC8* hBuf8 = HBufC8::NewL(1); |
|
451 return hBuf8; |
|
452 } |
|
453 |
|
454 // Otherwise, convert and store result in a buffer, reallocating that buffer if needed. |
|
455 const TInt length = aUnicode.Length(); |
|
456 const TInt bufsize = 100; |
|
457 |
|
458 TPtrC16 unicode (aUnicode); |
|
459 TBuf8<bufsize> buf; |
|
460 HBufC8* hBuf8 = HBufC8::NewLC(length); |
|
461 TPtr8 utf8 = hBuf8->Des(); |
|
462 |
|
463 FOREVER |
|
464 { |
|
465 TInt unconverted = ConvertFromUnicodeToUtf8(buf, unicode); |
|
466 if( unconverted == EErrorIllFormedInput || unconverted < 0) |
|
467 User::Leave(KErrCorrupt); |
|
468 |
|
469 if (utf8.Length() + buf.Length() > utf8.MaxLength()) |
|
470 { |
|
471 // Reallocate the hBuf8 |
|
472 hBuf8 = hBuf8->ReAllocL(utf8.Length() + buf.Length()); |
|
473 CleanupStack::Pop(); |
|
474 CleanupStack::PushL(hBuf8); |
|
475 utf8.Set(hBuf8->Des()); |
|
476 } |
|
477 utf8.Append(buf); |
|
478 if (unconverted ==0) |
|
479 break; |
|
480 unicode.Set(unicode.Right(unconverted)); |
|
481 } |
|
482 CleanupStack::Pop(); |
|
483 return hBuf8; |
|
484 } |
|
485 |
|
486 /** Converts Unicode text into UTF-8 encoding. |
|
487 |
|
488 The variant of UTF-8 used internally by Java differs slightly from standard |
|
489 UTF-8. The TBool argument controls the UTF-8 variant generated by this function. |
|
490 |
|
491 @param aUtf8 On return, contains the UTF-8 encoded output string. |
|
492 @param aUnicode A UCS-2 encoded input string. |
|
493 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java |
|
494 UTF-8. The default is EFalse. |
|
495 @return The number of unconverted characters left at the end of the input descriptor, |
|
496 or one of the error values defined in TError. */ |
|
497 TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, |
|
498 const TDesC16& aUnicode, |
|
499 TBool aGenerateJavaConformantUtf8) |
|
500 { |
|
501 if (aUnicode.Length()==0) |
|
502 { |
|
503 aUtf8.SetLength(0); |
|
504 return 0; |
|
505 } |
|
506 if (aUtf8.MaxLength()==0) |
|
507 { |
|
508 return aUnicode.Length(); |
|
509 } |
|
510 TUint8* pointerToCurrentUtf8Byte=CONST_CAST(TUint8*, aUtf8.Ptr()); |
|
511 const TUint8* pointerToLastUtf8Byte=pointerToCurrentUtf8Byte+(aUtf8.MaxLength()-1); |
|
512 const TUint16* pointerToCurrentUnicodeCharacter=aUnicode.Ptr(); |
|
513 const TUint16* pointerToLastUnicodeCharacter=pointerToCurrentUnicodeCharacter+(aUnicode.Length()-1); |
|
514 TBool inputIsTruncated=EFalse; |
|
515 FOREVER |
|
516 { |
|
517 __ASSERT_DEBUG(pointerToCurrentUtf8Byte<=pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers1)); |
|
518 __ASSERT_DEBUG(pointerToCurrentUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers3)); |
|
519 TUint currentUnicodeCharacter=*pointerToCurrentUnicodeCharacter; |
|
520 if (((currentUnicodeCharacter&0xff80)==0x0000) && ((currentUnicodeCharacter!=0x0000) || !aGenerateJavaConformantUtf8)) |
|
521 { |
|
522 *pointerToCurrentUtf8Byte=STATIC_CAST(TUint8, currentUnicodeCharacter); |
|
523 } |
|
524 else if ((currentUnicodeCharacter&0xf800)==0x0000) |
|
525 { |
|
526 if (pointerToCurrentUtf8Byte==pointerToLastUtf8Byte) |
|
527 { |
|
528 --pointerToCurrentUtf8Byte; |
|
529 --pointerToCurrentUnicodeCharacter; |
|
530 break; |
|
531 } |
|
532 *pointerToCurrentUtf8Byte=STATIC_CAST(TUint8, 0xc0|(currentUnicodeCharacter>>6)); |
|
533 ++pointerToCurrentUtf8Byte; |
|
534 *pointerToCurrentUtf8Byte=STATIC_CAST(TUint8, 0x80|(currentUnicodeCharacter&0x3f)); |
|
535 } |
|
536 else if (((currentUnicodeCharacter&0xfc00)==0xd800) && !aGenerateJavaConformantUtf8) |
|
537 { |
|
538 __ASSERT_DEBUG(pointerToCurrentUtf8Byte<=pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers2)); |
|
539 if (pointerToLastUtf8Byte-pointerToCurrentUtf8Byte<3) |
|
540 { |
|
541 --pointerToCurrentUtf8Byte; |
|
542 --pointerToCurrentUnicodeCharacter; |
|
543 break; |
|
544 } |
|
545 __ASSERT_DEBUG(pointerToCurrentUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers4)); |
|
546 if (pointerToCurrentUnicodeCharacter>=pointerToLastUnicodeCharacter) |
|
547 { |
|
548 --pointerToCurrentUtf8Byte; |
|
549 --pointerToCurrentUnicodeCharacter; |
|
550 inputIsTruncated=ETrue; |
|
551 break; |
|
552 } |
|
553 currentUnicodeCharacter+=0x0040; |
|
554 *pointerToCurrentUtf8Byte=STATIC_CAST(TUint8, 0xf0|((currentUnicodeCharacter>>8)&0x07)); |
|
555 ++pointerToCurrentUtf8Byte; |
|
556 *pointerToCurrentUtf8Byte=STATIC_CAST(TUint8, 0x80|((currentUnicodeCharacter>>2)&0x3f)); |
|
557 { |
|
558 TUint currentUtf8Byte=(0x80|((currentUnicodeCharacter&0x03)<<4)); |
|
559 ++pointerToCurrentUnicodeCharacter; |
|
560 currentUnicodeCharacter=*pointerToCurrentUnicodeCharacter; |
|
561 if ((currentUnicodeCharacter&0xfc00)!=0xdc00) |
|
562 { |
|
563 return EErrorIllFormedInput; |
|
564 } |
|
565 currentUtf8Byte|=((currentUnicodeCharacter>>6)&0x0f); |
|
566 ++pointerToCurrentUtf8Byte; |
|
567 *pointerToCurrentUtf8Byte=STATIC_CAST(TUint8, currentUtf8Byte); |
|
568 } |
|
569 ++pointerToCurrentUtf8Byte; |
|
570 *pointerToCurrentUtf8Byte=STATIC_CAST(TUint8, 0x80|(currentUnicodeCharacter&0x3f)); |
|
571 } |
|
572 else |
|
573 { |
|
574 if (pointerToLastUtf8Byte-pointerToCurrentUtf8Byte<2) |
|
575 { |
|
576 --pointerToCurrentUtf8Byte; |
|
577 --pointerToCurrentUnicodeCharacter; |
|
578 break; |
|
579 } |
|
580 *pointerToCurrentUtf8Byte=STATIC_CAST(TUint8, 0xe0|(currentUnicodeCharacter>>12)); |
|
581 ++pointerToCurrentUtf8Byte; |
|
582 *pointerToCurrentUtf8Byte=STATIC_CAST(TUint8, 0x80|((currentUnicodeCharacter>>6)&0x3f)); |
|
583 ++pointerToCurrentUtf8Byte; |
|
584 *pointerToCurrentUtf8Byte=STATIC_CAST(TUint8, 0x80|(currentUnicodeCharacter&0x3f)); |
|
585 } |
|
586 if ((pointerToCurrentUnicodeCharacter==pointerToLastUnicodeCharacter) || (pointerToCurrentUtf8Byte==pointerToLastUtf8Byte)) |
|
587 { |
|
588 break; |
|
589 } |
|
590 ++pointerToCurrentUtf8Byte; |
|
591 ++pointerToCurrentUnicodeCharacter; |
|
592 } |
|
593 if ((pointerToCurrentUnicodeCharacter<aUnicode.Ptr()) && inputIsTruncated) |
|
594 { |
|
595 return EErrorIllFormedInput; |
|
596 } |
|
597 aUtf8.SetLength((pointerToCurrentUtf8Byte-aUtf8.Ptr())+1); |
|
598 return pointerToLastUnicodeCharacter-pointerToCurrentUnicodeCharacter; |
|
599 } |
|
600 |
|
601 |
|
602 |
|
603 /** Converts text encoded using the Unicode transformation format UTF-7 |
|
604 into the Unicode UCS-2 character set. |
|
605 |
|
606 @param aUtf7 The UTF-7 encoded input string. |
|
607 @return A pointer to an HBufC16 containing the converted Unicode string */ |
|
608 EXPORT_C HBufC16* CnvUtfConverter::ConvertToUnicodeFromUtf7L(const TDesC8& aUtf7) |
|
609 { |
|
610 // If aUtf8 is an empty string return |
|
611 if (aUtf7.Length()==0) |
|
612 { |
|
613 HBufC16* hBuf = HBufC16::NewL(1); |
|
614 return hBuf; |
|
615 } |
|
616 |
|
617 // else convert aUtf8 to Unicode storing the result in a buffer, reallocating |
|
618 // it when needed. |
|
619 TInt length = aUtf7.Length(); |
|
620 const TInt bufsize = 100; |
|
621 TInt state = KStateDefault; |
|
622 |
|
623 TPtrC8 utf7 (aUtf7); |
|
624 TBuf<bufsize> buf; |
|
625 HBufC16* hBuf = HBufC16::NewLC(length); |
|
626 TPtr unicode = hBuf->Des(); |
|
627 |
|
628 FOREVER |
|
629 { |
|
630 TInt unconverted = ConvertToUnicodeFromUtf7(buf, utf7, state); |
|
631 if( unconverted == EErrorIllFormedInput || unconverted < 0) |
|
632 User::Leave(KErrCorrupt); |
|
633 |
|
634 if (unicode.Length() + buf.Length() > unicode.MaxLength()) |
|
635 { |
|
636 // Reallocate hBuf |
|
637 hBuf = hBuf->ReAllocL(unicode.Length() + buf.Length()); |
|
638 CleanupStack::Pop(); |
|
639 CleanupStack::PushL(hBuf); |
|
640 unicode.Set(hBuf->Des()); |
|
641 } |
|
642 unicode.Append(buf); |
|
643 if (unconverted ==0) |
|
644 break; |
|
645 utf7.Set(utf7.Right(unconverted)); |
|
646 } |
|
647 CleanupStack::Pop(); |
|
648 return hBuf; |
|
649 } |
|
650 |
|
651 |
|
652 |
|
653 /** Converts text encoded using the Unicode transformation format UTF-7 into the |
|
654 Unicode UCS-2 character set. |
|
655 |
|
656 If the conversion is achieved using a series of calls to this function, where |
|
657 each call starts off where the previous call reached in the input descriptor, |
|
658 the state of the conversion is stored. The initial value of the state variable |
|
659 should be set as KStateDefault when the conversion is started, and afterwards |
|
660 simply passed unchanged into each function call. |
|
661 |
|
662 @param aUnicode On return, contains the Unicode encoded output string. |
|
663 @param aUtf7 The UTF-7 encoded input string. |
|
664 @param aState For the first call of the function set to KStateDefault. For |
|
665 subsequent calls, pass in the variable unchanged. |
|
666 @return The number of unconverted bytes left at the end of the input descriptor, |
|
667 or one of the error values defined in TError. */ |
|
668 EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf7(TDes16& aUnicode, |
|
669 const TDesC8& aUtf7, |
|
670 TInt& aState) |
|
671 { |
|
672 return ConvertToUnicodeFromUtf7(aUnicode, aUtf7, EFalse, aState); |
|
673 } |
|
674 |
|
675 TInt CnvUtfConverter::ConvertToUnicodeFromUtf7(TDes16& aUnicode, |
|
676 const TDesC8& aUtf7, |
|
677 TBool aIsImapUtf7, |
|
678 TInt& aState) |
|
679 { |
|
680 if (aUtf7.Length()==0) |
|
681 { |
|
682 aUnicode.SetLength(0); |
|
683 return 0; |
|
684 } |
|
685 if (aUnicode.MaxLength()==0) |
|
686 { |
|
687 return aUtf7.Length(); |
|
688 } |
|
689 const TUint escapeCharacterForStartingBase64Block=EscapeCharacterForStartingBase64Block(aIsImapUtf7); |
|
690 TUint16* pointerToPreviousUnicodeCharacter=CONST_CAST(TUint16*, aUnicode.Ptr()-1); |
|
691 const TUint16* pointerToLastUnicodeCharacter=pointerToPreviousUnicodeCharacter+aUnicode.MaxLength(); |
|
692 const TUint8* pointerToCurrentUtf7Byte=aUtf7.Ptr(); |
|
693 const TUint8* pointerToLastUtf7Byte=pointerToCurrentUtf7Byte+(aUtf7.Length()-1); |
|
694 TUint currentUtf7Byte=*pointerToCurrentUtf7Byte; |
|
695 const TUint KIsInBase64Block=0x80000000u; |
|
696 TUint bitBuffer=STATIC_CAST(TUint, aState); |
|
697 TInt numberOfBitsInBuffer=((bitBuffer&0xf0)>>4); |
|
698 bitBuffer&=~0xf0; // turn off the bits that stored numberOfBitsInBuffer |
|
699 if (bitBuffer&KIsInBase64Block) |
|
700 { |
|
701 __ASSERT_ALWAYS((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4) || ((numberOfBitsInBuffer<16) && (numberOfBitsInBuffer%2==0) && !BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer)), Panic(EPanicBadBitBufferState7)); |
|
702 __ASSERT_ALWAYS((bitBuffer&~(KIsInBase64Block|0x0000000f))==0, Panic(EPanicBadBitBufferState8)); |
|
703 } |
|
704 else |
|
705 { |
|
706 __ASSERT_ALWAYS(bitBuffer==0, Panic(EPanicBadBitBufferState9)); |
|
707 __ASSERT_ALWAYS(numberOfBitsInBuffer==0, Panic(EPanicBadBitBufferState10)); |
|
708 } |
|
709 aState=KStateDefault; |
|
710 if (bitBuffer&KIsInBase64Block) |
|
711 { |
|
712 currentUtf7Byte=Base64Decoding(currentUtf7Byte, aIsImapUtf7); |
|
713 } |
|
714 TBool inputIsTruncated=EFalse; |
|
715 FOREVER |
|
716 { |
|
717 __ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers5)); |
|
718 __ASSERT_DEBUG(pointerToCurrentUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers11)); |
|
719 __ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (currentUtf7Byte==*pointerToCurrentUtf7Byte), Panic(EPanicOutOfSyncUtf7Byte1)); |
|
720 __ASSERT_DEBUG((~bitBuffer&KIsInBase64Block) || (currentUtf7Byte==Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7)), Panic(EPanicOutOfSyncUtf7Byte2)); |
|
721 __ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || ((bitBuffer==0) && (numberOfBitsInBuffer==0)), Panic(EPanicBadBitBufferState11)); |
|
722 if ((~bitBuffer&KIsInBase64Block) && (currentUtf7Byte==escapeCharacterForStartingBase64Block)) |
|
723 { |
|
724 if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte) |
|
725 { |
|
726 --pointerToCurrentUtf7Byte; |
|
727 inputIsTruncated=ETrue; |
|
728 goto end; |
|
729 } |
|
730 ++pointerToCurrentUtf7Byte; |
|
731 currentUtf7Byte=*pointerToCurrentUtf7Byte; |
|
732 if (currentUtf7Byte=='-') |
|
733 { |
|
734 currentUtf7Byte=escapeCharacterForStartingBase64Block; |
|
735 } |
|
736 else |
|
737 { |
|
738 currentUtf7Byte=Base64Decoding(currentUtf7Byte, aIsImapUtf7); |
|
739 if (currentUtf7Byte==KNotInBase64Alphabet) |
|
740 { |
|
741 return EErrorIllFormedInput; |
|
742 } |
|
743 bitBuffer=KIsInBase64Block; |
|
744 } |
|
745 } |
|
746 if (bitBuffer&KIsInBase64Block) |
|
747 { |
|
748 FOREVER |
|
749 { |
|
750 __ASSERT_DEBUG(currentUtf7Byte==Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7), Panic(EPanicOutOfSyncBase64Decoding)); |
|
751 __ASSERT_DEBUG((numberOfBitsInBuffer<16) || (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16) && (numberOfBitsInBuffer<16+6)), Panic(EPanicBadBitBufferState12)); |
|
752 if (currentUtf7Byte==KNotInBase64Alphabet) |
|
753 { |
|
754 if (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer)) |
|
755 { |
|
756 return EErrorIllFormedInput; |
|
757 } |
|
758 bitBuffer=0; |
|
759 numberOfBitsInBuffer=0; |
|
760 currentUtf7Byte=*pointerToCurrentUtf7Byte; |
|
761 if (currentUtf7Byte=='-') |
|
762 { |
|
763 if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte) |
|
764 { |
|
765 goto end; |
|
766 } |
|
767 ++pointerToCurrentUtf7Byte; |
|
768 currentUtf7Byte=*pointerToCurrentUtf7Byte; |
|
769 } |
|
770 break; |
|
771 } |
|
772 bitBuffer<<=6; |
|
773 bitBuffer|=currentUtf7Byte; |
|
774 bitBuffer|=KIsInBase64Block; |
|
775 numberOfBitsInBuffer+=6; |
|
776 // only flush the buffer if it contains a whole Unicode character and the remainder is either all zero-bits (hence would be a legal point to end the base-64 sequence) or at least 6 bits long (therefore would leave at least one UTF-7 byte unconverted at the end of the input descriptor) |
|
777 if ((numberOfBitsInBuffer>=16+6) || ((numberOfBitsInBuffer>=16) && !BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16))) |
|
778 { |
|
779 numberOfBitsInBuffer-=16; |
|
780 __ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers6)); |
|
781 ++pointerToPreviousUnicodeCharacter; |
|
782 *pointerToPreviousUnicodeCharacter=STATIC_CAST(TUint16, bitBuffer>>numberOfBitsInBuffer); |
|
783 bitBuffer&=((1<<numberOfBitsInBuffer)-1); // zero all the consumed bits - must be done as bitBuffer is stored along with numberOfBitsInBuffer in aState if the output descriptor runs out of space or if the input descriptor was truncated |
|
784 bitBuffer|=KIsInBase64Block; // turn it back on as the line above turned it off |
|
785 if (pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter) |
|
786 { |
|
787 goto end; |
|
788 } |
|
789 } |
|
790 if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte) |
|
791 { |
|
792 inputIsTruncated=ETrue; |
|
793 goto end; |
|
794 } |
|
795 ++pointerToCurrentUtf7Byte; |
|
796 currentUtf7Byte=Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7); |
|
797 } |
|
798 } |
|
799 else |
|
800 { |
|
801 __ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers7)); |
|
802 ++pointerToPreviousUnicodeCharacter; |
|
803 *pointerToPreviousUnicodeCharacter=STATIC_CAST(TUint16, currentUtf7Byte); |
|
804 if ((pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter) || (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte)) |
|
805 { |
|
806 goto end; |
|
807 } |
|
808 ++pointerToCurrentUtf7Byte; |
|
809 currentUtf7Byte=*pointerToCurrentUtf7Byte; |
|
810 } |
|
811 } |
|
812 end: |
|
813 if (bitBuffer&KIsInBase64Block) |
|
814 { |
|
815 __ASSERT_DEBUG((numberOfBitsInBuffer<16) || (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16) && (numberOfBitsInBuffer<16+6)), Panic(EPanicBadBitBufferState13)); |
|
816 if (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer)) |
|
817 { |
|
818 // rewind how far we've got in the UTF-7 descriptor to indicate to the user (by returning a value greater than zero) that not all of the input could be converted as it ended with a truncated base-64 sequence |
|
819 __ASSERT_DEBUG(numberOfBitsInBuffer>=6, Panic(EPanicBadBitBufferState14)); |
|
820 pointerToCurrentUtf7Byte-=numberOfBitsInBuffer/6; |
|
821 const TInt newNumberOfBitsInBuffer=numberOfBitsInBuffer%6; |
|
822 bitBuffer&=~KIsInBase64Block; // temporarily turn off the KIsInBase64Block for the right-shift |
|
823 bitBuffer>>=(numberOfBitsInBuffer-newNumberOfBitsInBuffer); |
|
824 bitBuffer|=KIsInBase64Block; // must be turned back on again as the bit-buffer is packed into aState |
|
825 numberOfBitsInBuffer=newNumberOfBitsInBuffer; |
|
826 __ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState15)); |
|
827 } |
|
828 __ASSERT_DEBUG((numberOfBitsInBuffer<16) && (numberOfBitsInBuffer%2==0), Panic(EPanicBadBitBufferState16)); |
|
829 aState=STATIC_CAST(TInt, bitBuffer); |
|
830 aState|=(numberOfBitsInBuffer<<4); |
|
831 __ASSERT_DEBUG(aState&KIsInBase64Block, Panic(EPanicBadBitBufferState17)); |
|
832 bitBuffer=0; |
|
833 numberOfBitsInBuffer=0; |
|
834 } |
|
835 if ((pointerToCurrentUtf7Byte<aUtf7.Ptr()) && inputIsTruncated) |
|
836 { |
|
837 return EErrorIllFormedInput; |
|
838 } |
|
839 aUnicode.SetLength((pointerToPreviousUnicodeCharacter+1)-aUnicode.Ptr()); |
|
840 return pointerToLastUtf7Byte-pointerToCurrentUtf7Byte; |
|
841 } |
|
842 |
|
843 |
|
844 |
|
845 /** Converts text encoded using the Unicode transformation format UTF-8 |
|
846 into the Unicode UCS-2 character set. This function leaves with an |
|
847 error code of the input string is corrupted. |
|
848 |
|
849 @param aUtf8 The UTF-8 encoded input string |
|
850 @return A pointer to an HBufC16 with the converted Unicode string. */ |
|
851 EXPORT_C HBufC16* CnvUtfConverter::ConvertToUnicodeFromUtf8L(const TDesC8& aUtf8) |
|
852 { |
|
853 // If aUtf8 is an empty string return |
|
854 if (aUtf8.Length()==0) |
|
855 { |
|
856 HBufC16* hBuf = HBufC16::NewL(1); |
|
857 return hBuf; |
|
858 } |
|
859 |
|
860 // else convert aUtf8 to Unicode storing the result in a buffer, reallocating |
|
861 // it when needed. |
|
862 TInt length = aUtf8.Length(); |
|
863 const TInt bufsize = 100; |
|
864 |
|
865 TPtrC8 utf8 (aUtf8); |
|
866 TBuf<bufsize> buf; |
|
867 HBufC16* hBuf = HBufC16::NewLC(length); |
|
868 TPtr unicode = hBuf->Des(); |
|
869 |
|
870 FOREVER |
|
871 { |
|
872 TInt unconverted = ConvertToUnicodeFromUtf8(buf, utf8); |
|
873 if( unconverted == EErrorIllFormedInput || unconverted < 0) |
|
874 User::Leave(KErrCorrupt); |
|
875 |
|
876 if (unicode.Length() + buf.Length() > unicode.MaxLength()) |
|
877 { |
|
878 // Reallocate hBuf |
|
879 hBuf = hBuf->ReAllocL(unicode.Length() + buf.Length()); |
|
880 CleanupStack::Pop(); |
|
881 CleanupStack::PushL(hBuf); |
|
882 unicode.Set(hBuf->Des()); |
|
883 } |
|
884 unicode.Append(buf); |
|
885 if (unconverted ==0) |
|
886 break; |
|
887 utf8.Set(utf8.Right(unconverted)); |
|
888 } |
|
889 CleanupStack::Pop(); |
|
890 return hBuf; |
|
891 } |
|
892 |
|
893 /** Converts text encoded using the Unicode transformation format UTF-8 into the |
|
894 Unicode UCS-2 character set. |
|
895 |
|
896 @param aUnicode On return, contains the Unicode encoded output string. |
|
897 @param aUtf8 The UTF-8 encoded input string |
|
898 @return The number of unconverted bytes left at the end of the input descriptor, |
|
899 or one of the error values defined in TError. */ |
|
900 EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8) |
|
901 { |
|
902 return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, EFalse); |
|
903 } |
|
904 |
|
905 static void UpdateUnconvertibleInfo(TInt& aNumberOfUnconvertibleCharacters, |
|
906 TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TUint8 aIndex) |
|
907 { |
|
908 if (aNumberOfUnconvertibleCharacters<=0) |
|
909 { |
|
910 aIndexOfFirstByteOfFirstUnconvertibleCharacter = aIndex; |
|
911 } |
|
912 ++aNumberOfUnconvertibleCharacters; |
|
913 } |
|
914 |
|
915 /** Converts text encoded using the Unicode transformation format UTF-8 into the |
|
916 Unicode UCS-2 character set. |
|
917 |
|
918 @param aUnicode On return, contains the Unicode encoded output string. |
|
919 @param aUtf8 The UTF-8 encoded input string |
|
920 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java |
|
921 @return The number of unconverted bytes left at the end of the input descriptor, |
|
922 or one of the error values defined in TError. */ |
|
923 TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8) |
|
924 { |
|
925 TInt dummyUnconverted, dummyUnconvertedIndex; |
|
926 return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, aGenerateJavaConformantUtf8, dummyUnconverted, dummyUnconvertedIndex); |
|
927 } |
|
928 |
|
929 /** Converts text encoded using the Unicode transformation format UTF-8 into the |
|
930 Unicode UCS-2 character set. Surrogate pairs can be created when a valid 4 byte UTF-8 is input. |
|
931 |
|
932 The variant of UTF-8 used internally by Java differs slightly from standard |
|
933 UTF-8. The TBool argument controls the UTF-8 variant generated by this function. |
|
934 |
|
935 @param aUnicode On return, contains the Unicode encoded output string. |
|
936 @param aUtf8 The UTF-8 encoded input string |
|
937 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java |
|
938 UTF-8. The default is EFalse. |
|
939 @param aNumberOfUnconvertibleCharacters On return, contains the number of bytes |
|
940 which were not converted. |
|
941 @param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index |
|
942 of the first byte of the first unconvertible character. For instance if the |
|
943 first character in the input descriptor (aForeign) could not be converted, |
|
944 then this parameter is set to the first byte of that character, i.e. zero. |
|
945 A negative value is returned if all the characters were converted. |
|
946 @return The number of unconverted bytes left at the end of the input descriptor, |
|
947 or one of the error values defined in TError. */ |
|
948 |
|
949 /* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7 |
|
950 * Well formed UTF-8 Byte Sequences, full table. |
|
951 * +----------------------------------------------------------------+ |
|
952 * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte | |
|
953 * +--------------------+----------+----------+----------+----------+ |
|
954 * | U+0000..U+007F | 00..7D | | | | 1 byte, ascii |
|
955 * | U+0080..U+07FF | C2..DF | 80..BF | | | 2 bytes, error if 1st < 0xC2 |
|
956 * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0 |
|
957 * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | normal |
|
958 * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, error if 2nd > 0x9F |
|
959 * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | normal |
|
960 * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, error if 2nd < 0x90 |
|
961 * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | normal |
|
962 * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F |
|
963 * +--------------------+----------+----------+----------+----------+ |
|
964 * |
|
965 * As a consequence of the well-formedness conditions specified in table 3-7, |
|
966 * the following byte values are disallowed in UTF-8: C0-C1, F5-FF. |
|
967 */ |
|
968 TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8, |
|
969 TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter) |
|
970 { |
|
971 aUnicode.SetLength(0); |
|
972 if (aUtf8.Length()==0) |
|
973 { |
|
974 return 0; |
|
975 } |
|
976 if (aUnicode.MaxLength()==0) |
|
977 { |
|
978 return aUtf8.Length(); |
|
979 } |
|
980 |
|
981 TUint16* pointerToCurrentUnicodeCharacter=CONST_CAST(TUint16*, aUnicode.Ptr()); |
|
982 const TUint16* pointerToLastUnicodeCharacter=pointerToCurrentUnicodeCharacter+(aUnicode.MaxLength()-1); |
|
983 const TUint8* pointerToCurrentUtf8Byte=aUtf8.Ptr(); |
|
984 const TUint8* pointerToPendingUtf8Byte=aUtf8.Ptr(); |
|
985 const TUint8* pointerToLastUtf8Byte=pointerToCurrentUtf8Byte+(aUtf8.Length()-1); |
|
986 TUint16 replacementcharacter = 0xFFFD; |
|
987 TUint8 currentUtf8Byte; |
|
988 TUint currentUnicodeCharacter; |
|
989 TInt sequenceLength; |
|
990 |
|
991 FOREVER |
|
992 { |
|
993 __ASSERT_DEBUG(pointerToCurrentUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers8)); |
|
994 __ASSERT_DEBUG(pointerToCurrentUtf8Byte<=pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers3)); |
|
995 currentUtf8Byte=*pointerToCurrentUtf8Byte; |
|
996 pointerToPendingUtf8Byte = pointerToCurrentUtf8Byte; |
|
997 sequenceLength=100; |
|
998 |
|
999 for(TInt i=0;i<7;i++) |
|
1000 { |
|
1001 if ((currentUtf8Byte&(0xf8<<i))==(STATIC_CAST(TUint8,(0xF0<<i)))) |
|
1002 { |
|
1003 sequenceLength = 4-i; |
|
1004 break; |
|
1005 } |
|
1006 } |
|
1007 |
|
1008 if ((sequenceLength<2 || sequenceLength>6) && sequenceLength!=0) |
|
1009 { |
|
1010 currentUnicodeCharacter=replacementcharacter; |
|
1011 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
1012 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pointerToCurrentUtf8Byte-aUtf8.Ptr()); |
|
1013 } |
|
1014 else |
|
1015 { |
|
1016 if ((pointerToLastUtf8Byte-pointerToCurrentUtf8Byte+1)<sequenceLength) |
|
1017 { |
|
1018 if((pointerToCurrentUnicodeCharacter-aUnicode.Ptr())==0) |
|
1019 return EErrorIllFormedInput; |
|
1020 |
|
1021 break; |
|
1022 } |
|
1023 |
|
1024 currentUnicodeCharacter = currentUtf8Byte&(0x7F>>sequenceLength); |
|
1025 |
|
1026 for(TInt i=sequenceLength;i>1; i--) |
|
1027 { |
|
1028 currentUtf8Byte = *(++pointerToCurrentUtf8Byte); |
|
1029 if ((currentUtf8Byte&0xc0)==0x80) |
|
1030 { |
|
1031 currentUnicodeCharacter = (currentUnicodeCharacter<<6)|(currentUtf8Byte&0x3F); |
|
1032 } |
|
1033 else |
|
1034 { |
|
1035 currentUnicodeCharacter=replacementcharacter; |
|
1036 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
1037 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pointerToCurrentUtf8Byte-aUtf8.Ptr()); |
|
1038 --pointerToCurrentUtf8Byte; |
|
1039 } |
|
1040 } |
|
1041 } |
|
1042 |
|
1043 if (currentUnicodeCharacter > 0xFFFF) |
|
1044 { |
|
1045 if(pointerToCurrentUnicodeCharacter>=pointerToLastUnicodeCharacter) |
|
1046 { |
|
1047 pointerToCurrentUtf8Byte=pointerToPendingUtf8Byte; |
|
1048 break; |
|
1049 } |
|
1050 |
|
1051 TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0; |
|
1052 *pointerToCurrentUnicodeCharacter=STATIC_CAST(TUint16, surrogate); |
|
1053 ++pointerToCurrentUnicodeCharacter; |
|
1054 |
|
1055 surrogate = (currentUnicodeCharacter&0x3FF)+0xDC00; |
|
1056 *pointerToCurrentUnicodeCharacter=STATIC_CAST(TUint16, surrogate); |
|
1057 ++pointerToCurrentUnicodeCharacter; |
|
1058 ++pointerToCurrentUtf8Byte; |
|
1059 } |
|
1060 else |
|
1061 { |
|
1062 *pointerToCurrentUnicodeCharacter=STATIC_CAST(TUint16, currentUnicodeCharacter); |
|
1063 ++pointerToCurrentUnicodeCharacter; |
|
1064 ++pointerToCurrentUtf8Byte; |
|
1065 } |
|
1066 |
|
1067 if ((pointerToCurrentUtf8Byte>pointerToLastUtf8Byte) || (pointerToCurrentUnicodeCharacter>pointerToLastUnicodeCharacter)) |
|
1068 { |
|
1069 break; |
|
1070 } |
|
1071 } |
|
1072 |
|
1073 aUnicode.SetLength(pointerToCurrentUnicodeCharacter-aUnicode.Ptr()); |
|
1074 return pointerToLastUtf8Byte-pointerToCurrentUtf8Byte+1; |
|
1075 } |
|
1076 |
|
1077 |
|
1078 GLREF_C void IsCharacterSetUTF8 (TInt& aConfidenceLevel, const TDesC8& aSample) |
|
1079 { |
|
1080 |
|
1081 TInt sampleLength = aSample.Length(); |
|
1082 if (sampleLength == 0) |
|
1083 { |
|
1084 aConfidenceLevel = 89; |
|
1085 return; |
|
1086 } |
|
1087 aConfidenceLevel=sampleLength; |
|
1088 TInt bytesRemaining=0; |
|
1089 |
|
1090 const TUint8* buffer=&aSample[0]; |
|
1091 for(TInt index=0; index!=sampleLength; ++index) |
|
1092 { |
|
1093 if(bytesRemaining>0) |
|
1094 { |
|
1095 // bytesRemaining > 0, means that a byte representing the start of a |
|
1096 // multibyte sequence was encountered and the bytesRemaining is the |
|
1097 // number of bytes to follow. The remaining bytes have to conform to |
|
1098 // values within the range 0x80 and 0xbf |
|
1099 if((buffer[index]&0xc0)==0x80) // the value is within range |
|
1100 { |
|
1101 --bytesRemaining; |
|
1102 continue; |
|
1103 } |
|
1104 else |
|
1105 { |
|
1106 bytesRemaining=0; |
|
1107 aConfidenceLevel=0; |
|
1108 break; |
|
1109 } |
|
1110 } |
|
1111 if (bytesRemaining==0) |
|
1112 { |
|
1113 if((buffer[index]&0x80)==0x00) |
|
1114 { |
|
1115 // The value of aSample[index] is in the range 0x00-0x7f |
|
1116 //UTF8 maintains ASCII transparency. So it's a valid |
|
1117 //UTF8. Do nothing, check next value. |
|
1118 } |
|
1119 else if((buffer[index]&0xe0)==0xc0) |
|
1120 { |
|
1121 bytesRemaining=1; |
|
1122 } |
|
1123 else if((buffer[index]&0xf0)==0xe0) |
|
1124 { |
|
1125 bytesRemaining=2; |
|
1126 } |
|
1127 else if((buffer[index]&0xf8)==0xf0) |
|
1128 { |
|
1129 bytesRemaining=3; |
|
1130 } |
|
1131 else |
|
1132 { |
|
1133 // wasn't anything expected so must be an illegal/irregular UTF8 coded value |
|
1134 aConfidenceLevel=0; |
|
1135 break; |
|
1136 } |
|
1137 } |
|
1138 } // for |
|
1139 aConfidenceLevel = (aConfidenceLevel > 0)?100:0; |
|
1140 } |
|
1141 |
|
1142 GLREF_C void IsCharacterSetUTF7(TInt& aConfidenceLevel, const TDesC8& aSample) |
|
1143 { |
|
1144 TInt sampleLength = aSample.Length(); |
|
1145 aConfidenceLevel = 70; |
|
1146 for (TInt i=0; i<sampleLength; ++i) |
|
1147 { |
|
1148 // UTF-7 value ranges only 7 bits |
|
1149 if((aSample[i]&0x80)!=0x00) |
|
1150 { |
|
1151 aConfidenceLevel= 0; |
|
1152 break; |
|
1153 } |
|
1154 |
|
1155 // there is no "~" in UTF-7 encoding. So if find either, it's not UTF-7 |
|
1156 else if (char(aSample[i])=='~') |
|
1157 { |
|
1158 aConfidenceLevel = 0; |
|
1159 break; |
|
1160 } |
|
1161 |
|
1162 // The SMS7Bit escape char value is 0x1b. Reduce confidence if it follows the following format |
|
1163 else if ( (aSample[i]==0x1b) && (i <sampleLength-1) ) |
|
1164 { |
|
1165 static const TInt smsExtensionTable[11] = |
|
1166 {0x0a, 0x14, 0x1b, 0x28, 0x29, 0x2f, 0x3c, 0x3d, 0x3e, 0x40, 0x65}; |
|
1167 TInt increment1 = i+1; |
|
1168 if (increment1>= sampleLength) |
|
1169 break; |
|
1170 for (TInt j=0; j < 11; ++j) |
|
1171 { |
|
1172 if (aSample[increment1] == smsExtensionTable[j]) |
|
1173 { |
|
1174 aConfidenceLevel-=10; |
|
1175 } |
|
1176 } |
|
1177 } |
|
1178 // The UTF-7 escape char is 0x2b. The values that follow the escape sequence |
|
1179 // the values following the escape char value must belong to the modified base64 |
|
1180 // or '-' else it is an ill-formed sequence, so probably not UTF-7 |
|
1181 else if ( (aSample[i]==0x2b) && (i <sampleLength-1) ) |
|
1182 { |
|
1183 TInt increment1 = i+1; |
|
1184 if ((aSample[increment1] == 0x2b) || (aSample[increment1] == 0x2d) || (aSample[increment1] == 0x2f) || |
|
1185 ((aSample[increment1] >= 0x41) && (aSample[increment1] <= 0x5a)) || |
|
1186 ((aSample[increment1] >= 0x61) && (aSample[increment1] <= 0x7a))) |
|
1187 { |
|
1188 aConfidenceLevel+=5; |
|
1189 } |
|
1190 else |
|
1191 { |
|
1192 aConfidenceLevel-=15; |
|
1193 } |
|
1194 i++; // should this be here or up in the if loop ?? |
|
1195 } |
|
1196 } //for |
|
1197 aConfidenceLevel =(aConfidenceLevel >0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0; |
|
1198 } |