|
1 /* |
|
2 * Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies). |
|
3 * All rights reserved. |
|
4 * This component and the accompanying materials are made available |
|
5 * under the terms of the License "Eclipse Public License v1.0" |
|
6 * which accompanies this distribution, and is available |
|
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 * |
|
9 * Initial Contributors: |
|
10 * Nokia Corporation - initial contribution. |
|
11 * |
|
12 * Contributors: |
|
13 * |
|
14 * Description: |
|
15 * |
|
16 */ |
|
17 |
|
18 |
|
19 #include <e32std.h> |
|
20 #include <e32base.h> |
|
21 #include <utf.h> |
|
22 |
|
// Short spellings for the C++ cast operators and for an endless loop, as used
// throughout this file. The replacement lists are deliberately left exactly as
// they are so that they stay compatible with any identical definitions pulled
// in by the included system headers.
#define STATIC_CAST(t,v) static_cast<t>(v)
#define CONST_CAST(t,v) const_cast<t>(v)
#define FOREVER for(;;)

// Largest TUint value, used as a marker value.
// NOTE(review): presumably the "this character is not in the Base64 alphabet"
// result of a lookup routine that lies outside this part of the file - confirm.
const TUint KNotInBase64Alphabet=KMaxTUint;
|
28 |
|
/** Panic reasons raised by this file.

Numbering starts at 1 and continues sequentially, so the position of an
enumerator in this list determines the panic number; do not reorder or insert
in the middle. */
enum TPanic
    {
    // Base64 helper
    EPanicBad6BitNumber = 1,

    // UTF-7 byte pointer invariants
    EPanicBadUtf7Pointers1,  EPanicBadUtf7Pointers2,  EPanicBadUtf7Pointers3,
    EPanicBadUtf7Pointers4,  EPanicBadUtf7Pointers5,  EPanicBadUtf7Pointers6,
    EPanicBadUtf7Pointers7,  EPanicBadUtf7Pointers8,  EPanicBadUtf7Pointers9,
    EPanicBadUtf7Pointers10, EPanicBadUtf7Pointers11,

    EPanicNotInBase64Block,

    // Unicode (16-bit) pointer invariants
    EPanicBadUnicodePointers1, EPanicBadUnicodePointers2, EPanicBadUnicodePointers3,
    EPanicBadUnicodePointers4, EPanicBadUnicodePointers5, EPanicBadUnicodePointers6,
    EPanicBadUnicodePointers7, EPanicBadUnicodePointers8, EPanicBadUnicodePointers9,
    EPanicBadUnicodePointers10,

    // Bit-buffer state invariants
    EPanicBadBitBufferState1,  EPanicBadBitBufferState2,  EPanicBadBitBufferState3,
    EPanicBadBitBufferState4,  EPanicBadBitBufferState5,  EPanicBadBitBufferState6,
    EPanicBadBitBufferState7,  EPanicBadBitBufferState8,  EPanicBadBitBufferState9,
    EPanicBadBitBufferState10, EPanicBadBitBufferState11, EPanicBadBitBufferState12,
    EPanicBadBitBufferState13, EPanicBadBitBufferState14, EPanicBadBitBufferState15,
    EPanicBadBitBufferState16, EPanicBadBitBufferState17,

    EPanicUnexpectedNumberOfLoopIterations,
    EPanicInitialEscapeCharacterButNoBase64,
    EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary,

    // UTF-8 byte pointer invariants
    EPanicBadUtf8Pointers1, EPanicBadUtf8Pointers2, EPanicBadUtf8Pointers3,
    EPanicBadUtf8Pointers4, EPanicBadUtf8Pointers5, EPanicBadUtf8Pointers6,
    EPanicBadUtf8Pointers7,

    // Synchronisation failures
    EPanicOutOfSyncUtf7Byte1,
    EPanicOutOfSyncUtf7Byte2,
    EPanicOutOfSyncBase64Decoding
    };
|
85 |
|
86 _LIT(KLitPanicText, "CHARCONV-UTF"); |
|
87 |
|
88 LOCAL_C void Panic(TPanic aPanic) |
|
89 { |
|
90 User::Panic(KLitPanicText, aPanic); |
|
91 } |
|
92 |
|
93 inline TUint EscapeCharacterForStartingBase64Block(TBool aIsImapUtf7) {return aIsImapUtf7? '&': '+';} |
|
94 |
|
95 inline TBool BitBufferContainsNonZeroBits(TUint aBitBuffer, TInt aNumberOfBitsInBuffer) |
|
96 { |
|
97 return (aBitBuffer&((1<<aNumberOfBitsInBuffer)-1))!=0; |
|
98 } |
|
99 |
|
100 |
|
101 |
|
102 |
|
103 |
|
104 |
|
105 |
|
106 |
|
107 /** Converts Unicode text into UTF-8 encoding. |
|
108 |
|
109 @param aUtf8 On return, contains the UTF-8 encoded output string. |
|
110 @param aUnicode The Unicode-encoded input string. |
|
111 @return The number of unconverted characters left at the end of the input |
|
112 descriptor, or one of the error values defined in TError. */ |
|
113 EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, const TDesC16& aUnicode) |
|
114 { |
|
115 return ConvertFromUnicodeToUtf8(aUtf8, aUnicode, EFalse); |
|
116 } |
|
117 |
|
118 |
|
119 |
|
120 /** Converts Unicode text into UTF-8 encoding. |
|
121 |
|
122 Surrogate pairs can be input which will result in a valid 4 byte UTF-8 value. |
|
123 |
|
124 The variant of UTF-8 used internally by Java differs slightly from standard |
|
125 UTF-8. The TBool argument controls the UTF-8 variant generated by this function. |
|
126 |
|
127 @param aUtf8 On return, contains the UTF-8 encoded output string. |
|
128 @param aUnicode A UCS-2 encoded input string. |
|
129 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java |
|
130 UTF-8. The default is EFalse. |
|
131 @return The number of unconverted characters left at the end of the input descriptor, |
|
132 or one of the error values defined in TError. */ |
|
133 TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, |
|
134 const TDesC16& aUnicode, |
|
135 TBool aGenerateJavaConformantUtf8) |
|
136 { |
|
137 if (aUnicode.Length() == 0) |
|
138 { |
|
139 aUtf8.SetLength(0); |
|
140 return 0; |
|
141 } |
|
142 if (aUtf8.MaxLength() == 0) |
|
143 { |
|
144 return aUnicode.Length(); |
|
145 } |
|
146 |
|
147 TUint8* pUtf8 = CONST_CAST(TUint8*, aUtf8.Ptr()); |
|
148 const TUint8* pointerToLastUtf8Byte = pUtf8 + (aUtf8.MaxLength() - 1); |
|
149 TBool inputIsTruncated = EFalse; |
|
150 const TUint16* pUnicode = aUnicode.Ptr(); |
|
151 const TUint16* pointerToLastUnicodeCharacter = pUnicode + (aUnicode.Length() - 1); |
|
152 |
|
153 FOREVER |
|
154 { |
|
155 __ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers1)); |
|
156 __ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers3)); |
|
157 |
|
158 if (pUnicode[0] < 0x80) |
|
159 { |
|
160 // ascii - 1 byte |
|
161 |
|
162 // internally java is different since the \x0000 character is |
|
163 // translated into \xC0 \x80. |
|
164 |
|
165 if ((aGenerateJavaConformantUtf8) && (pUnicode[0] == 0x0000)) |
|
166 { |
|
167 if (pUtf8 == pointerToLastUtf8Byte) |
|
168 { |
|
169 pUtf8--; |
|
170 pUnicode--; |
|
171 break; |
|
172 } |
|
173 *pUtf8++ = STATIC_CAST(TUint8, 0xc0); |
|
174 *pUtf8 = STATIC_CAST(TUint8, 0x80); |
|
175 } |
|
176 else |
|
177 { |
|
178 *pUtf8 = STATIC_CAST(TUint8, pUnicode[0]); |
|
179 } |
|
180 } |
|
181 else if (pUnicode[0] < 0x800) |
|
182 { |
|
183 // U+0080..U+07FF - 2 bytes |
|
184 |
|
185 if (pUtf8 == pointerToLastUtf8Byte) |
|
186 { |
|
187 pUtf8--; |
|
188 pUnicode--; |
|
189 break; |
|
190 } |
|
191 |
|
192 *pUtf8++ = STATIC_CAST(TUint8, 0xc0|(pUnicode[0]>>6)); |
|
193 *pUtf8 = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f)); |
|
194 |
|
195 } |
|
196 |
|
197 // check to see if we have a surrogate in the stream, surrogates encode code points outside |
|
198 // the BMP and are 4 utf-8 chars, otherwise what we have here is 3 utf-8 chars. |
|
199 |
|
200 else if (((pUnicode[0] & 0xfc00) == 0xd800) && !aGenerateJavaConformantUtf8) |
|
201 { |
|
202 // surrogate pair - 4 bytes in utf-8 |
|
203 // U+10000..U+10FFFF |
|
204 |
|
205 __ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers2)); |
|
206 // is there enough space to hold the character |
|
207 if ((pointerToLastUtf8Byte - pUtf8) < 3) |
|
208 { |
|
209 pUtf8--; |
|
210 pUnicode--; |
|
211 break; // no go to the exit condition |
|
212 } |
|
213 |
|
214 __ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers4)); |
|
215 if (pUnicode >= pointerToLastUnicodeCharacter) |
|
216 { |
|
217 pUtf8--; |
|
218 pUnicode--; |
|
219 inputIsTruncated = ETrue; |
|
220 break; // middle of a surrogate pair. go to end condition |
|
221 } |
|
222 |
|
223 if ((pUnicode[1] & 0xfc00) != 0xdc00) |
|
224 { |
|
225 return EErrorIllFormedInput; |
|
226 } |
|
227 |
|
228 // convert utf-16 surrogate to utf-32 |
|
229 TUint ch = ((pUnicode[0] - 0xD800) << 10 | (pUnicode[1] - 0xDC00)) + 0x10000; |
|
230 |
|
231 // convert utf-32 to utf-8 |
|
232 *pUtf8++ = STATIC_CAST(TUint8,0xf0 | (ch >> 18)); |
|
233 *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 12) & 0x3f)); |
|
234 *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 6) & 0x3f)); |
|
235 *pUtf8 = STATIC_CAST(TUint8,0x80 | (ch & 0x3f)); |
|
236 |
|
237 // we consumed 2 utf-16 values, move this pointer |
|
238 pUnicode++; |
|
239 } |
|
240 else |
|
241 { |
|
242 // 3 byte - utf-8, U+800..U+FFFF rest of BMP. |
|
243 |
|
244 if (pointerToLastUtf8Byte - pUtf8 < 2) |
|
245 { |
|
246 pUtf8--; |
|
247 pUnicode--; |
|
248 break; |
|
249 } |
|
250 *pUtf8++ = STATIC_CAST(TUint8, 0xe0|(pUnicode[0]>>12)); |
|
251 *pUtf8++ = STATIC_CAST(TUint8, 0x80|((pUnicode[0]>>6)&0x3f)); |
|
252 *pUtf8 = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f)); |
|
253 } |
|
254 |
|
255 if ((pUnicode == pointerToLastUnicodeCharacter) || (pUtf8 == pointerToLastUtf8Byte)) |
|
256 { |
|
257 break; |
|
258 } |
|
259 |
|
260 pUtf8++; |
|
261 pUnicode++; |
|
262 |
|
263 } |
|
264 |
|
265 if ((pUnicode < aUnicode.Ptr()) && inputIsTruncated) |
|
266 { |
|
267 return EErrorIllFormedInput; |
|
268 } |
|
269 |
|
270 aUtf8.SetLength((pUtf8 - aUtf8.Ptr())+1); |
|
271 return pointerToLastUnicodeCharacter-pUnicode; |
|
272 } |
|
273 |
|
274 |
|
275 |
|
276 |
|
277 |
|
278 |
|
279 |
|
280 |
|
281 |
|
282 |
|
283 |
|
284 /** Converts text encoded using the Unicode transformation format UTF-8 into the |
|
285 Unicode UCS-2 character set. |
|
286 |
|
287 @param aUnicode On return, contains the Unicode encoded output string. |
|
288 @param aUtf8 The UTF-8 encoded input string |
|
289 @return The number of unconverted bytes left at the end of the input descriptor, |
|
290 or one of the error values defined in TError. */ |
|
291 EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8) |
|
292 { |
|
293 return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, EFalse); |
|
294 } |
|
295 |
|
296 static void UpdateUnconvertibleInfo(TInt& aNumberOfUnconvertibleCharacters, |
|
297 TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TUint8 aIndex) |
|
298 { |
|
299 if (aNumberOfUnconvertibleCharacters<=0) |
|
300 { |
|
301 aIndexOfFirstByteOfFirstUnconvertibleCharacter = aIndex; |
|
302 } |
|
303 ++aNumberOfUnconvertibleCharacters; |
|
304 } |
|
305 |
|
306 /** Converts text encoded using the Unicode transformation format UTF-8 into the |
|
307 Unicode UCS-2 character set. |
|
308 |
|
309 @param aUnicode On return, contains the Unicode encoded output string. |
|
310 @param aUtf8 The UTF-8 encoded input string |
|
311 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java |
|
312 @return The number of unconverted bytes left at the end of the input descriptor, |
|
313 or one of the error values defined in TError. */ |
|
314 TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8) |
|
315 { |
|
316 TInt dummyUnconverted, dummyUnconvertedIndex; |
|
317 return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, aGenerateJavaConformantUtf8, dummyUnconverted, dummyUnconvertedIndex); |
|
318 } |
|
319 |
|
320 /** Converts text encoded using the Unicode transformation format UTF-8 into the |
|
321 Unicode UCS-2 character set. Surrogate pairs can be created when a valid 4 byte UTF-8 is input. |
|
322 |
|
323 The variant of UTF-8 used internally by Java differs slightly from standard |
|
324 UTF-8. The TBool argument controls the UTF-8 variant generated by this function. |
|
325 |
|
326 @param aUnicode On return, contains the Unicode encoded output string. |
|
327 @param aUtf8 The UTF-8 encoded input string |
|
328 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java |
|
329 UTF-8. The default is EFalse. |
|
330 @param aNumberOfUnconvertibleCharacters On return, contains the number of bytes |
|
331 which were not converted. |
|
332 @param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index |
|
333 of the first byte of the first unconvertible character. For instance if the |
|
334 first character in the input descriptor (aForeign) could not be converted, |
|
335 then this parameter is set to the first byte of that character, i.e. zero. |
|
336 A negative value is returned if all the characters were converted. |
|
337 @return The number of unconverted bytes left at the end of the input descriptor, |
|
338 or one of the error values defined in TError. */ |
|
339 |
|
340 /* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7 |
|
341 * Well formed UTF-8 Byte Sequences, full table. |
|
342 * +----------------------------------------------------------------+ |
|
343 * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte | |
|
344 * +--------------------+----------+----------+----------+----------+ |
|
345 * | U+0000..U+007F | 00..7D | | | | 1 byte, ascii |
|
346 * | U+0080..U+07FF | C2..DF | 80..BF | | | 2 bytes, error if 1st < 0xC2 |
|
347 * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0 |
|
348 * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | normal |
|
349 * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, error if 2nd > 0x9F |
|
350 * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | normal |
|
351 * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, error if 2nd < 0x90 |
|
352 * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | normal |
|
353 * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F |
|
354 * +--------------------+----------+----------+----------+----------+ |
|
355 * |
|
356 * As a consequence of the well-formedness conditions specified in table 3-7, |
|
357 * the following byte values are disallowed in UTF-8: C0-C1, F5-FF. |
|
358 */ |
|
359 TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8, |
|
360 TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter) |
|
361 { |
|
362 aUnicode.SetLength(0); |
|
363 |
|
364 if ((aUtf8.Length() == 0) || (aUnicode.MaxLength() == 0)) |
|
365 { |
|
366 return aUtf8.Length(); |
|
367 } |
|
368 |
|
369 TUint16* pUnicode = CONST_CAST(TUint16*, aUnicode.Ptr()); |
|
370 const TUint16* pLastUnicode = pUnicode + (aUnicode.MaxLength() - 1); |
|
371 const TUint8* pUtf8 = aUtf8.Ptr(); |
|
372 const TUint8* pLastUtf8 = pUtf8 + (aUtf8.Length() - 1); |
|
373 const TUint16 replacementcharacter = 0xFFFD; |
|
374 TUint currentUnicodeCharacter; |
|
375 TUint sequenceLength; |
|
376 |
|
377 |
|
378 FOREVER |
|
379 { |
|
380 TBool illFormed=EFalse; |
|
381 |
|
382 __ASSERT_DEBUG(pUnicode <= pLastUnicode, Panic(EPanicBadUnicodePointers8)); |
|
383 __ASSERT_DEBUG(pUtf8 <= pLastUtf8, Panic(EPanicBadUtf8Pointers3)); |
|
384 |
|
385 sequenceLength = 1; |
|
386 |
|
387 // ascii - optimisation (i.e. it isn't a sequence) |
|
388 if (pUtf8[0] < 0x80) |
|
389 { |
|
390 currentUnicodeCharacter = pUtf8[0]; |
|
391 } |
|
392 else |
|
393 { |
|
394 // see if well formed utf-8, use table above for reference |
|
395 if ((pUtf8[0] >= 0xc2) && (pUtf8[0] <= 0xdf)) |
|
396 { |
|
397 // 0xc1-0xc2 are not valid bytes |
|
398 sequenceLength = 2; |
|
399 } |
|
400 else if ((pUtf8[0] & 0xf0) == 0xe0) |
|
401 { |
|
402 sequenceLength = 3; |
|
403 } |
|
404 else if ((pUtf8[0] >= 0xf0) && (pUtf8[0] < 0xf5)) |
|
405 { |
|
406 // 0xf5-0xff, are not valid bytes |
|
407 sequenceLength = 4; |
|
408 } |
|
409 else if ((pUtf8[0] == 0xc0) && aGenerateJavaConformantUtf8) |
|
410 { |
|
411 if ((pUtf8 == pLastUtf8) || (pUtf8[1] == 0x80)) |
|
412 { |
|
413 // either we've split the 0xc0 0x80 (i.e. 0xc0 is |
|
414 // the last character in the string) or we've |
|
415 // discovered a valid 0xc0 0x80 sequence. |
|
416 sequenceLength = 2; |
|
417 } |
|
418 } |
|
419 |
|
420 /* checking to see if we got a valid sequence */ |
|
421 if (sequenceLength == 1) |
|
422 { |
|
423 // bad value in the leading byte, 0xc0-0xc1,0x5f-0xff for example |
|
424 currentUnicodeCharacter = replacementcharacter; |
|
425 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
426 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); |
|
427 } |
|
428 else |
|
429 { |
|
430 // this is a check to see if the sequence goes beyond the input |
|
431 // stream. if its not the first and only character in the input |
|
432 // stream this isn't an error, otherwise it is. |
|
433 if ((pUtf8 + sequenceLength - 1) > pLastUtf8) |
|
434 { |
|
435 // check to see if this sequence was the first character |
|
436 if ((pUnicode - aUnicode.Ptr()) == 0) |
|
437 { |
|
438 return EErrorIllFormedInput; |
|
439 } |
|
440 break; |
|
441 } |
|
442 |
|
443 currentUnicodeCharacter = pUtf8[0] & (0x7F>>sequenceLength); |
|
444 |
|
445 /* check the trailing bytes, they should begin with 10 */ |
|
446 TUint i = 1; |
|
447 |
|
448 do |
|
449 { |
|
450 if ((pUtf8[i] & 0xc0) == 0x80) |
|
451 { |
|
452 // add the trailing 6 bits to the current unicode char |
|
453 currentUnicodeCharacter = (currentUnicodeCharacter <<6 ) | (pUtf8[i] & 0x3F); |
|
454 } |
|
455 else |
|
456 { |
|
457 // ill formed character (doesn't have a lead 10) |
|
458 currentUnicodeCharacter = replacementcharacter; |
|
459 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
460 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); |
|
461 illFormed=ETrue; |
|
462 break; |
|
463 } |
|
464 i++; |
|
465 } |
|
466 while (i < sequenceLength); |
|
467 } |
|
468 |
|
469 /* conformance check. bits of above table for reference. |
|
470 * +----------------------------------------------------------------+ |
|
471 * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte | |
|
472 * +--------------------+----------+----------+----------+----------+ |
|
473 * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, 2nd < 0xA0 |
|
474 * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, 2nd > 0x9F |
|
475 * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, 2nd < 0x90 |
|
476 * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, 2nd > 0x8F |
|
477 * +--------------------+----------+----------+----------+----------+ |
|
478 */ |
|
479 |
|
480 if (currentUnicodeCharacter != replacementcharacter) |
|
481 { |
|
482 if (sequenceLength == 3) |
|
483 { |
|
484 if ((pUtf8[0] == 0xE0) && (pUtf8[1] < 0xA0)) |
|
485 { |
|
486 currentUnicodeCharacter = replacementcharacter; |
|
487 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
488 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); |
|
489 illFormed=ETrue; |
|
490 } |
|
491 else if ((pUtf8[0] == 0xED) && (pUtf8[1] > 0x9F)) |
|
492 { |
|
493 currentUnicodeCharacter = replacementcharacter; |
|
494 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
495 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); |
|
496 illFormed=ETrue; |
|
497 } |
|
498 } |
|
499 else if (sequenceLength == 4) |
|
500 { |
|
501 if ((pUtf8[0] == 0xF0) && (pUtf8[1] < 0x90)) |
|
502 { |
|
503 currentUnicodeCharacter = replacementcharacter; |
|
504 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
505 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); |
|
506 illFormed=ETrue; |
|
507 } |
|
508 else if ((pUtf8[0] == 0xF4) && (pUtf8[1] > 0x8F)) |
|
509 { |
|
510 currentUnicodeCharacter = replacementcharacter; |
|
511 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
512 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); |
|
513 illFormed=ETrue; |
|
514 } |
|
515 } |
|
516 |
|
517 |
|
518 /* last conformance check - Unicode 5.0 section 3.9 D92 Because surrogate code points |
|
519 * are not Unicode scalar values, any UTF-8 byte sequence that would map to code |
|
520 * points D800..DFFF is ill formed */ |
|
521 |
|
522 if ((currentUnicodeCharacter >= 0xD800) && (currentUnicodeCharacter <= 0xDFFF)) |
|
523 { |
|
524 currentUnicodeCharacter = replacementcharacter; |
|
525 UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, |
|
526 aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); |
|
527 illFormed=ETrue; |
|
528 } |
|
529 } |
|
530 // end conformance check |
|
531 } |
|
532 |
|
533 // would this character generate a surrogate pair in UTF-16? |
|
534 if (currentUnicodeCharacter > 0xFFFF) |
|
535 { |
|
536 // is there enough space to hold a surrogate pair in the output? |
|
537 if (pUnicode >= pLastUnicode) |
|
538 { |
|
539 break; // no, end processing. |
|
540 } |
|
541 |
|
542 TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0; |
|
543 *pUnicode++ = STATIC_CAST(TUint16, surrogate); |
|
544 |
|
545 surrogate = (currentUnicodeCharacter & 0x3FF) + 0xDC00; |
|
546 *pUnicode++ = STATIC_CAST(TUint16, surrogate); |
|
547 } |
|
548 else |
|
549 { |
|
550 *pUnicode++ = STATIC_CAST(TUint16, currentUnicodeCharacter); |
|
551 } |
|
552 |
|
553 // move the input pointer |
|
554 if (currentUnicodeCharacter != replacementcharacter) |
|
555 { |
|
556 pUtf8 += sequenceLength; |
|
557 } |
|
558 else if(illFormed == EFalse) |
|
559 { |
|
560 pUtf8 += (sequenceLength); |
|
561 } |
|
562 else |
|
563 { |
|
564 // we had a character we didn't recognize (i.e. it was invalid) |
|
565 // so move to the next character in the input |
|
566 pUtf8++; |
|
567 } |
|
568 |
|
569 if ((pUtf8 > pLastUtf8) || (pUnicode > pLastUnicode)) |
|
570 { |
|
571 break; // we've either reached the end of the input or the end of output |
|
572 } |
|
573 } |
|
574 |
|
575 aUnicode.SetLength(pUnicode - aUnicode.Ptr()); |
|
576 return (pLastUtf8 - pUtf8 + 1); |
|
577 } |
|
578 |
|
579 /** Given a sample text this function attempts to determine whether or not |
|
580 * the same text is encoded using the UTF-8 standard encoding scheme. |
|
581 |
|
582 @param TInt a confidence level, given at certain value. if the given sample |
|
583 is UTF-8 this value will not be changed (unless > 100) then its |
|
584 set to 100. Otherwise if the same isn't UTF-8, its set to 0. |
|
585 @param TDesC8 sample text. |
|
586 UTF-8. The default is EFalse. |
|
587 @return void |
|
588 */ |
|
589 |
|
590 /* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7 |
|
591 * Well formed UTF-8 Byte Sequences, full table. |
|
592 * +----------------------------------------------------------------+ |
|
593 * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte | |
|
594 * +--------------------+----------+----------+----------+----------+ |
|
595 * | U+0000..U+007F | 00..7D | | | | 1 byte, ascii |
|
596 * | U+0080..U+07FF | C2..DF | 80..BF | | | 2 bytes, error if 1st < 0xC2 |
|
597 * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0 |
|
598 * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | normal |
|
599 * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, error if 2nd > 0x9F |
|
600 * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | normal |
|
601 * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, error if 2nd < 0x90 |
|
602 * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | normal |
|
603 * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F |
|
604 * +--------------------+----------+----------+----------+----------+ |
|
605 * |
|
606 * As a consequence of the well-formedness conditions specified in table 3-7, |
|
607 * the following byte values are disallowed in UTF-8: C0-C1, F5-FF. |
|
608 * |
|
609 * Code Rules: |
|
610 * R1: If the string contains any non-UTF-8 characters the returned confidence |
|
611 * is 0. Valid UTF-8 combinations are listed in the above table. |
|
612 * R2: Otherwise if the string starts with a UTF-8 BOM (byte order mark) in |
|
613 * the (see ) the returned confidence is 95. |
|
614 * R3: Otherwise the confidence returned is based upon the sample string |
|
615 * length. |
|
616 * R4: If the sample string is under 75 characters, the confidence is set to |
|
617 * 75. |
|
618 */ |
|
619 void IsCharacterSetUTF8(TInt& aConfidenceLevel, const TDesC8& aSample) |
|
620 { |
|
621 |
|
622 TInt sampleLength = aSample.Length(); |
|
623 |
|
624 if (sampleLength == 0) |
|
625 { |
|
626 aConfidenceLevel = 89; |
|
627 return; |
|
628 } |
|
629 TInt bytesRemaining = 0; |
|
630 TUint sequenceLength = 0; |
|
631 |
|
632 aConfidenceLevel = sampleLength; |
|
633 |
|
634 const TUint8* buffer = &aSample[0]; |
|
635 |
|
636 if (sampleLength < 95) |
|
637 { |
|
638 // check for the BOM |
|
639 if ((sampleLength >= 3) && |
|
640 ((buffer[0] == 0xEF) && |
|
641 (buffer[1] == 0xBB) && |
|
642 (buffer[2] == 0xBF)) |
|
643 ) |
|
644 { |
|
645 aConfidenceLevel = 95; |
|
646 } |
|
647 else if (sampleLength < 75) |
|
648 { |
|
649 aConfidenceLevel = 75; |
|
650 } |
|
651 } |
|
652 |
|
653 for (TInt index = 0;index != sampleLength;index++) |
|
654 { |
|
655 |
|
656 if (bytesRemaining > 0) |
|
657 { |
|
658 // bytesRemaining > 0, means that a byte representing the start of a |
|
659 // multibyte sequence was encountered and the bytesRemaining is the |
|
660 // number of bytes to follow. |
|
661 |
|
662 if ((buffer[index] & 0xc0) == 0x80) |
|
663 { |
|
664 // need to check for ill-formed sequences -- all are in the 2nd byte |
|
665 |
|
666 if ((sequenceLength == 3) && (bytesRemaining == 2)) |
|
667 { |
|
668 if ((buffer[index - 1] == 0xe0) && (buffer[index] < 0xa0)) |
|
669 { |
|
670 aConfidenceLevel = 0; |
|
671 break; |
|
672 } |
|
673 else if ((buffer[index - 1] == 0xed) && (buffer[index] > 0x9f)) |
|
674 { |
|
675 aConfidenceLevel = 0; |
|
676 break; |
|
677 } |
|
678 } |
|
679 else if ((sequenceLength == 4) && (bytesRemaining == 3)) |
|
680 { |
|
681 if ((buffer[index - 1] == 0xf0) && (buffer[index] < 0x90)) |
|
682 { |
|
683 aConfidenceLevel = 0; |
|
684 break; |
|
685 } |
|
686 else if ((buffer[index - 1] == 0xf4) && (buffer[index] > 0x8f)) |
|
687 { |
|
688 aConfidenceLevel = 0; |
|
689 break; |
|
690 } |
|
691 } |
|
692 |
|
693 --bytesRemaining; |
|
694 continue; |
|
695 } |
|
696 else |
|
697 { |
|
698 aConfidenceLevel = 0; |
|
699 break; |
|
700 } |
|
701 } |
|
702 |
|
703 if (bytesRemaining == 0) |
|
704 { |
|
705 if (buffer[index] < 0x80) |
|
706 { |
|
707 // The value of aSample[index] is in the range 0x00-0x7f |
|
708 //UTF8 maintains ASCII transparency. So it's a valid |
|
709 //UTF8. Do nothing, check next value. |
|
710 continue; |
|
711 } |
|
712 else if ((buffer[index] >= 0xc2) && (buffer[index] < 0xe0)) |
|
713 { |
|
714 // valid start of a 2 byte sequence (see conformance note) |
|
715 sequenceLength = 2; |
|
716 bytesRemaining = 1; |
|
717 } |
|
718 else if ((buffer[index] & 0xf0) == 0xe0) |
|
719 { |
|
720 // valid start of a 3 byte sequence |
|
721 sequenceLength = 3; |
|
722 bytesRemaining = 2; |
|
723 } |
|
724 else if ((buffer[index] >= 0xf0) && (buffer[index] < 0xf5)) |
|
725 { |
|
726 // valid start of a 4 byte sequence (see conformance note) |
|
727 sequenceLength = 4; |
|
728 bytesRemaining = 3; |
|
729 } |
|
730 else |
|
731 { |
|
732 // wasn't anything expected so must be an illegal/irregular UTF8 coded value |
|
733 aConfidenceLevel = 0; |
|
734 break; |
|
735 } |
|
736 } |
|
737 } // for |
|
738 |
|
739 aConfidenceLevel = (aConfidenceLevel > 0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0; |
|
740 } |
|
741 |
|
742 // End of file |