|
1 /* |
|
2 * Copyright (C) 2007 Apple Inc. All rights reserved. |
|
3 * |
|
4 * Redistribution and use in source and binary forms, with or without |
|
5 * modification, are permitted provided that the following conditions |
|
6 * are met: |
|
7 * 1. Redistributions of source code must retain the above copyright |
|
8 * notice, this list of conditions and the following disclaimer. |
|
9 * 2. Redistributions in binary form must reproduce the above copyright |
|
10 * notice, this list of conditions and the following disclaimer in the |
|
11 * documentation and/or other materials provided with the distribution. |
|
12 * |
|
13 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY |
|
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
|
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR |
|
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
|
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
|
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
|
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
|
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
24 */ |
|
25 |
|
26 #include "config.h" |
|
27 #include "UTF8.h" |
|
28 |
|
29 namespace WTF { |
|
30 namespace Unicode { |
|
31 |
|
// Returns the total length in bytes (2, 3 or 4) of the UTF-8 sequence
// introduced by the non-ASCII lead byte b0, or 0 when b0 cannot start a
// sequence (a continuation byte, or a lead byte for a 5+ byte form).
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
    // Work on the low eight bits only so the sign of char is irrelevant.
    const int lead = b0 & 0xFF;

    // 110xxxxx introduces a two-byte sequence.
    if ((lead & 0xE0) == 0xC0)
        return 2;
    // 1110xxxx introduces a three-byte sequence.
    if ((lead & 0xF0) == 0xE0)
        return 3;
    // 11110xxx introduces a four-byte sequence.
    if ((lead & 0xF8) == 0xF0)
        return 4;

    // Everything else (10xxxxxx, 11111xxx, or an ASCII byte) is not a
    // multi-byte lead byte.
    return 0;
}
|
44 |
|
45 inline int inlineUTF8SequenceLength(char b0) |
|
46 { |
|
47 return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0); |
|
48 } |
|
49 |
|
50 int UTF8SequenceLength(char b0) |
|
51 { |
|
52 return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0); |
|
53 } |
|
54 |
|
55 int decodeUTF8Sequence(const char* sequence) |
|
56 { |
|
57 // Handle 0-byte sequences (never valid). |
|
58 const unsigned char b0 = sequence[0]; |
|
59 const int length = inlineUTF8SequenceLength(b0); |
|
60 if (length == 0) |
|
61 return -1; |
|
62 |
|
63 // Handle 1-byte sequences (plain ASCII). |
|
64 const unsigned char b1 = sequence[1]; |
|
65 if (length == 1) { |
|
66 if (b1) |
|
67 return -1; |
|
68 return b0; |
|
69 } |
|
70 |
|
71 // Handle 2-byte sequences. |
|
72 if ((b1 & 0xC0) != 0x80) |
|
73 return -1; |
|
74 const unsigned char b2 = sequence[2]; |
|
75 if (length == 2) { |
|
76 if (b2) |
|
77 return -1; |
|
78 const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F); |
|
79 if (c < 0x80) |
|
80 return -1; |
|
81 return c; |
|
82 } |
|
83 |
|
84 // Handle 3-byte sequences. |
|
85 if ((b2 & 0xC0) != 0x80) |
|
86 return -1; |
|
87 const unsigned char b3 = sequence[3]; |
|
88 if (length == 3) { |
|
89 if (b3) |
|
90 return -1; |
|
91 const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F); |
|
92 if (c < 0x800) |
|
93 return -1; |
|
94 // UTF-16 surrogates should never appear in UTF-8 data. |
|
95 if (c >= 0xD800 && c <= 0xDFFF) |
|
96 return -1; |
|
97 return c; |
|
98 } |
|
99 |
|
100 // Handle 4-byte sequences. |
|
101 if ((b3 & 0xC0) != 0x80) |
|
102 return -1; |
|
103 const unsigned char b4 = sequence[4]; |
|
104 if (length == 4) { |
|
105 if (b4) |
|
106 return -1; |
|
107 const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F); |
|
108 if (c < 0x10000 || c > 0x10FFFF) |
|
109 return -1; |
|
110 return c; |
|
111 } |
|
112 |
|
113 return -1; |
|
114 } |
|
115 |
|
// Marker bits OR-ed into the first byte of an encoded UTF-8 sequence, indexed
// by the total number of bytes in the sequence. Index 0 is unused and index 1
// (ASCII) needs no marker. The table has an entry for every UTF-8 sequence
// type (one-byte sequence, two-byte, and so on), although sequences for
// *legal* UTF-8 are 4 or fewer bytes in total.
static const unsigned char firstByteMark[7] = {
    0x00, // 0: unused
    0x00, // 1 byte:  0xxxxxxx
    0xC0, // 2 bytes: 110xxxxx
    0xE0, // 3 bytes: 1110xxxx
    0xF0, // 4 bytes: 11110xxx
    0xF8, // 5 bytes: 111110xx (not legal UTF-8)
    0xFC  // 6 bytes: 1111110x (not legal UTF-8)
};
|
122 |
|
123 ConversionResult convertUTF16ToUTF8( |
|
124 const UChar** sourceStart, const UChar* sourceEnd, |
|
125 char** targetStart, char* targetEnd, bool strict) |
|
126 { |
|
127 ConversionResult result = conversionOK; |
|
128 const UChar* source = *sourceStart; |
|
129 char* target = *targetStart; |
|
130 while (source < sourceEnd) { |
|
131 UChar32 ch; |
|
132 unsigned short bytesToWrite = 0; |
|
133 const UChar32 byteMask = 0xBF; |
|
134 const UChar32 byteMark = 0x80; |
|
135 const UChar* oldSource = source; // In case we have to back up because of target overflow. |
|
136 ch = static_cast<unsigned short>(*source++); |
|
137 // If we have a surrogate pair, convert to UChar32 first. |
|
138 if (ch >= 0xD800 && ch <= 0xDBFF) { |
|
139 // If the 16 bits following the high surrogate are in the source buffer... |
|
140 if (source < sourceEnd) { |
|
141 UChar32 ch2 = static_cast<unsigned short>(*source); |
|
142 // If it's a low surrogate, convert to UChar32. |
|
143 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { |
|
144 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; |
|
145 ++source; |
|
146 } else if (strict) { // it's an unpaired high surrogate |
|
147 --source; // return to the illegal value itself |
|
148 result = sourceIllegal; |
|
149 break; |
|
150 } |
|
151 } else { // We don't have the 16 bits following the high surrogate. |
|
152 --source; // return to the high surrogate |
|
153 result = sourceExhausted; |
|
154 break; |
|
155 } |
|
156 } else if (strict) { |
|
157 // UTF-16 surrogate values are illegal in UTF-32 |
|
158 if (ch >= 0xDC00 && ch <= 0xDFFF) { |
|
159 --source; // return to the illegal value itself |
|
160 result = sourceIllegal; |
|
161 break; |
|
162 } |
|
163 } |
|
164 // Figure out how many bytes the result will require |
|
165 if (ch < (UChar32)0x80) { |
|
166 bytesToWrite = 1; |
|
167 } else if (ch < (UChar32)0x800) { |
|
168 bytesToWrite = 2; |
|
169 } else if (ch < (UChar32)0x10000) { |
|
170 bytesToWrite = 3; |
|
171 } else if (ch < (UChar32)0x110000) { |
|
172 bytesToWrite = 4; |
|
173 } else { |
|
174 bytesToWrite = 3; |
|
175 ch = 0xFFFD; |
|
176 } |
|
177 |
|
178 target += bytesToWrite; |
|
179 if (target > targetEnd) { |
|
180 source = oldSource; // Back up source pointer! |
|
181 target -= bytesToWrite; |
|
182 result = targetExhausted; |
|
183 break; |
|
184 } |
|
185 switch (bytesToWrite) { // note: everything falls through. |
|
186 case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; |
|
187 case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; |
|
188 case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; |
|
189 case 1: *--target = (char)(ch | firstByteMark[bytesToWrite]); |
|
190 } |
|
191 target += bytesToWrite; |
|
192 } |
|
193 *sourceStart = source; |
|
194 *targetStart = target; |
|
195 return result; |
|
196 } |
|
197 |
|
// Returns true when the `length` bytes starting at `source` have the shape
// of one well-formed UTF-8 sequence.
//
// This must be called with the length pre-determined by the first byte; the
// function does not verify that the lead byte announces `length` bytes.
// If presented with a length outside 1..4, this returns false. The Unicode
// definition of UTF-8 goes up to 4-byte sequences.
//
// Rejected forms: continuation bytes outside 0x80..0xBF, overlong encodings
// (lead 0xC0/0xC1, 0xE0 followed by < 0xA0, 0xF0 followed by < 0x90),
// UTF-16 surrogates (0xED followed by > 0x9F), values above U+10FFFF
// (0xF4 followed by > 0x8F, or lead > 0xF4), and a continuation byte used
// as the lead byte.
static bool isLegalUTF8(const unsigned char* source, int length)
{
    if (length < 1 || length > 4)
        return false;

    // The third and fourth bytes, when present, are plain continuation bytes.
    for (int i = length - 1; i >= 2; --i) {
        if (source[i] < 0x80 || source[i] > 0xBF)
            return false;
    }

    const unsigned char lead = source[0];

    if (length >= 2) {
        // The second byte is a continuation byte whose permitted range is
        // narrowed for a few lead bytes. Both bounds are always enforced, so
        // a lead of 0xED or 0xF4 followed by an ASCII byte is rejected too.
        unsigned char lowest = 0x80;
        unsigned char highest = 0xBF;
        switch (lead) {
        case 0xE0:
            lowest = 0xA0; // below this the 3-byte form would be overlong
            break;
        case 0xED:
            highest = 0x9F; // above this lies the surrogate range
            break;
        case 0xF0:
            lowest = 0x90; // below this the 4-byte form would be overlong
            break;
        case 0xF4:
            highest = 0x8F; // above this the value exceeds U+10FFFF
            break;
        default:
            break;
        }
        const unsigned char second = source[1];
        if (second < lowest || second > highest)
            return false;
    }

    // 0x80..0xBF are continuation bytes and 0xC0/0xC1 could only start
    // overlong 2-byte sequences; neither may appear as the first byte.
    if (lead >= 0x80 && lead < 0xC2)
        return false;

    // Lead bytes above 0xF4 would encode values beyond U+10FFFF.
    return lead <= 0xF4;
}
|
227 |
|
228 // Magic values subtracted from a buffer value during UTF8 conversion. |
|
229 // This table contains as many values as there might be trailing bytes |
|
230 // in a UTF-8 sequence. |
|
231 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, |
|
232 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; |
|
233 |
|
234 ConversionResult convertUTF8ToUTF16( |
|
235 const char** sourceStart, const char* sourceEnd, |
|
236 UChar** targetStart, UChar* targetEnd, bool strict) |
|
237 { |
|
238 ConversionResult result = conversionOK; |
|
239 const char* source = *sourceStart; |
|
240 UChar* target = *targetStart; |
|
241 while (source < sourceEnd) { |
|
242 UChar32 ch = 0; |
|
243 int extraBytesToRead = UTF8SequenceLength(*source) - 1; |
|
244 if (source + extraBytesToRead >= sourceEnd) { |
|
245 result = sourceExhausted; |
|
246 break; |
|
247 } |
|
248 // Do this check whether lenient or strict |
|
249 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), extraBytesToRead + 1)) { |
|
250 result = sourceIllegal; |
|
251 break; |
|
252 } |
|
253 // The cases all fall through. |
|
254 switch (extraBytesToRead) { |
|
255 case 5: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8 |
|
256 case 4: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8 |
|
257 case 3: ch += static_cast<unsigned char>(*source++); ch <<= 6; |
|
258 case 2: ch += static_cast<unsigned char>(*source++); ch <<= 6; |
|
259 case 1: ch += static_cast<unsigned char>(*source++); ch <<= 6; |
|
260 case 0: ch += static_cast<unsigned char>(*source++); |
|
261 } |
|
262 ch -= offsetsFromUTF8[extraBytesToRead]; |
|
263 |
|
264 if (target >= targetEnd) { |
|
265 source -= (extraBytesToRead + 1); // Back up source pointer! |
|
266 result = targetExhausted; break; |
|
267 } |
|
268 if (ch <= 0xFFFF) { |
|
269 // UTF-16 surrogate values are illegal in UTF-32 |
|
270 if (ch >= 0xD800 && ch <= 0xDFFF) { |
|
271 if (strict) { |
|
272 source -= (extraBytesToRead + 1); // return to the illegal value itself |
|
273 result = sourceIllegal; |
|
274 break; |
|
275 } else |
|
276 *target++ = 0xFFFD; |
|
277 } else |
|
278 *target++ = (UChar)ch; // normal case |
|
279 } else if (ch > 0x10FFFF) { |
|
280 if (strict) { |
|
281 result = sourceIllegal; |
|
282 source -= (extraBytesToRead + 1); // return to the start |
|
283 break; // Bail out; shouldn't continue |
|
284 } else |
|
285 *target++ = 0xFFFD; |
|
286 } else { |
|
287 // target is a character in range 0xFFFF - 0x10FFFF |
|
288 if (target + 1 >= targetEnd) { |
|
289 source -= (extraBytesToRead + 1); // Back up source pointer! |
|
290 result = targetExhausted; |
|
291 break; |
|
292 } |
|
293 ch -= 0x0010000UL; |
|
294 *target++ = (UChar)((ch >> 10) + 0xD800); |
|
295 *target++ = (UChar)((ch & 0x03FF) + 0xDC00); |
|
296 } |
|
297 } |
|
298 *sourceStart = source; |
|
299 *targetStart = target; |
|
300 return result; |
|
301 } |
|
302 |
|
303 } |
|
304 } |