JavaScriptCore/wtf/unicode/UTF8.cpp
changeset 0 4f2f89ce4247
equal deleted inserted replaced
-1:000000000000 0:4f2f89ce4247
       
     1 /*
       
     2  * Copyright (C) 2007 Apple Inc.  All rights reserved.
       
     3  *
       
     4  * Redistribution and use in source and binary forms, with or without
       
     5  * modification, are permitted provided that the following conditions
       
     6  * are met:
       
     7  * 1. Redistributions of source code must retain the above copyright
       
     8  *    notice, this list of conditions and the following disclaimer.
       
     9  * 2. Redistributions in binary form must reproduce the above copyright
       
    10  *    notice, this list of conditions and the following disclaimer in the
       
    11  *    documentation and/or other materials provided with the distribution.
       
    12  *
       
    13  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
       
    14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       
    15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       
    16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
       
    17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
       
    18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
       
    19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
       
    20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
       
    21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
       
    22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
       
    23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
       
    24  */
       
    25 
       
    26 #include "config.h"
       
    27 #include "UTF8.h"
       
    28 
       
    29 namespace WTF {
       
    30 namespace Unicode {
       
    31 
       
    32 inline int inlineUTF8SequenceLengthNonASCII(char b0)
       
    33 {
       
    34     if ((b0 & 0xC0) != 0xC0)
       
    35         return 0;
       
    36     if ((b0 & 0xE0) == 0xC0)
       
    37         return 2;
       
    38     if ((b0 & 0xF0) == 0xE0)
       
    39         return 3;
       
    40     if ((b0 & 0xF8) == 0xF0)
       
    41         return 4;
       
    42     return 0;
       
    43 }
       
    44 
       
    45 inline int inlineUTF8SequenceLength(char b0)
       
    46 {
       
    47     return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
       
    48 }
       
    49 
       
    50 int UTF8SequenceLength(char b0)
       
    51 {
       
    52     return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
       
    53 }
       
    54 
       
    55 int decodeUTF8Sequence(const char* sequence)
       
    56 {
       
    57     // Handle 0-byte sequences (never valid).
       
    58     const unsigned char b0 = sequence[0];
       
    59     const int length = inlineUTF8SequenceLength(b0);
       
    60     if (length == 0)
       
    61         return -1;
       
    62 
       
    63     // Handle 1-byte sequences (plain ASCII).
       
    64     const unsigned char b1 = sequence[1];
       
    65     if (length == 1) {
       
    66         if (b1)
       
    67             return -1;
       
    68         return b0;
       
    69     }
       
    70 
       
    71     // Handle 2-byte sequences.
       
    72     if ((b1 & 0xC0) != 0x80)
       
    73         return -1;
       
    74     const unsigned char b2 = sequence[2];
       
    75     if (length == 2) {
       
    76         if (b2)
       
    77             return -1;
       
    78         const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
       
    79         if (c < 0x80)
       
    80             return -1;
       
    81         return c;
       
    82     }
       
    83 
       
    84     // Handle 3-byte sequences.
       
    85     if ((b2 & 0xC0) != 0x80)
       
    86         return -1;
       
    87     const unsigned char b3 = sequence[3];
       
    88     if (length == 3) {
       
    89         if (b3)
       
    90             return -1;
       
    91         const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
       
    92         if (c < 0x800)
       
    93             return -1;
       
    94         // UTF-16 surrogates should never appear in UTF-8 data.
       
    95         if (c >= 0xD800 && c <= 0xDFFF)
       
    96             return -1;
       
    97         return c;
       
    98     }
       
    99 
       
   100     // Handle 4-byte sequences.
       
   101     if ((b3 & 0xC0) != 0x80)
       
   102         return -1;
       
   103     const unsigned char b4 = sequence[4];
       
   104     if (length == 4) {
       
   105         if (b4)
       
   106             return -1;
       
   107         const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
       
   108         if (c < 0x10000 || c > 0x10FFFF)
       
   109             return -1;
       
   110         return c;
       
   111     }
       
   112 
       
   113     return -1;
       
   114 }
       
   115 
       
   116 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
       
   117 // into the first byte, depending on how many bytes follow.  There are
       
   118 // as many entries in this table as there are UTF-8 sequence types.
       
   119 // (I.e., one byte sequence, two byte... etc.). Remember that sequencs
       
   120 // for *legal* UTF-8 will be 4 or fewer bytes total.
       
   121 static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
       
   122 
       
   123 ConversionResult convertUTF16ToUTF8(
       
   124     const UChar** sourceStart, const UChar* sourceEnd, 
       
   125     char** targetStart, char* targetEnd, bool strict)
       
   126 {
       
   127     ConversionResult result = conversionOK;
       
   128     const UChar* source = *sourceStart;
       
   129     char* target = *targetStart;
       
   130     while (source < sourceEnd) {
       
   131         UChar32 ch;
       
   132         unsigned short bytesToWrite = 0;
       
   133         const UChar32 byteMask = 0xBF;
       
   134         const UChar32 byteMark = 0x80; 
       
   135         const UChar* oldSource = source; // In case we have to back up because of target overflow.
       
   136         ch = static_cast<unsigned short>(*source++);
       
   137         // If we have a surrogate pair, convert to UChar32 first.
       
   138         if (ch >= 0xD800 && ch <= 0xDBFF) {
       
   139             // If the 16 bits following the high surrogate are in the source buffer...
       
   140             if (source < sourceEnd) {
       
   141                 UChar32 ch2 = static_cast<unsigned short>(*source);
       
   142                 // If it's a low surrogate, convert to UChar32.
       
   143                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
       
   144                     ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
       
   145                     ++source;
       
   146                 } else if (strict) { // it's an unpaired high surrogate
       
   147                     --source; // return to the illegal value itself
       
   148                     result = sourceIllegal;
       
   149                     break;
       
   150                 }
       
   151             } else { // We don't have the 16 bits following the high surrogate.
       
   152                 --source; // return to the high surrogate
       
   153                 result = sourceExhausted;
       
   154                 break;
       
   155             }
       
   156         } else if (strict) {
       
   157             // UTF-16 surrogate values are illegal in UTF-32
       
   158             if (ch >= 0xDC00 && ch <= 0xDFFF) {
       
   159                 --source; // return to the illegal value itself
       
   160                 result = sourceIllegal;
       
   161                 break;
       
   162             }
       
   163         }
       
   164         // Figure out how many bytes the result will require
       
   165         if (ch < (UChar32)0x80) {
       
   166             bytesToWrite = 1;
       
   167         } else if (ch < (UChar32)0x800) {
       
   168             bytesToWrite = 2;
       
   169         } else if (ch < (UChar32)0x10000) {
       
   170             bytesToWrite = 3;
       
   171         } else if (ch < (UChar32)0x110000) {
       
   172             bytesToWrite = 4;
       
   173         } else {
       
   174             bytesToWrite = 3;
       
   175             ch = 0xFFFD;
       
   176         }
       
   177 
       
   178         target += bytesToWrite;
       
   179         if (target > targetEnd) {
       
   180             source = oldSource; // Back up source pointer!
       
   181             target -= bytesToWrite;
       
   182             result = targetExhausted;
       
   183             break;
       
   184         }
       
   185         switch (bytesToWrite) { // note: everything falls through.
       
   186             case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
       
   187             case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
       
   188             case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
       
   189             case 1: *--target =  (char)(ch | firstByteMark[bytesToWrite]);
       
   190         }
       
   191         target += bytesToWrite;
       
   192     }
       
   193     *sourceStart = source;
       
   194     *targetStart = target;
       
   195     return result;
       
   196 }
       
   197 
       
   198 // This must be called with the length pre-determined by the first byte.
       
   199 // If presented with a length > 4, this returns false.  The Unicode
       
   200 // definition of UTF-8 goes up to 4-byte sequences.
       
   201 static bool isLegalUTF8(const unsigned char* source, int length)
       
   202 {
       
   203     unsigned char a;
       
   204     const unsigned char* srcptr = source + length;
       
   205     switch (length) {
       
   206         default: return false;
       
   207         // Everything else falls through when "true"...
       
   208         case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
       
   209         case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
       
   210         case 2: if ((a = (*--srcptr)) > 0xBF) return false;
       
   211 
       
   212         switch (*source) {
       
   213             // no fall-through in this inner switch
       
   214             case 0xE0: if (a < 0xA0) return false; break;
       
   215             case 0xED: if (a > 0x9F) return false; break;
       
   216             case 0xF0: if (a < 0x90) return false; break;
       
   217             case 0xF4: if (a > 0x8F) return false; break;
       
   218             default:   if (a < 0x80) return false;
       
   219         }
       
   220 
       
   221         case 1: if (*source >= 0x80 && *source < 0xC2) return false;
       
   222     }
       
   223     if (*source > 0xF4)
       
   224         return false;
       
   225     return true;
       
   226 }
       
   227 
       
   228 // Magic values subtracted from a buffer value during UTF8 conversion.
       
   229 // This table contains as many values as there might be trailing bytes
       
   230 // in a UTF-8 sequence.
       
   231 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 
       
   232             0x03C82080UL, 0xFA082080UL, 0x82082080UL };
       
   233 
       
   234 ConversionResult convertUTF8ToUTF16(
       
   235     const char** sourceStart, const char* sourceEnd, 
       
   236     UChar** targetStart, UChar* targetEnd, bool strict)
       
   237 {
       
   238     ConversionResult result = conversionOK;
       
   239     const char* source = *sourceStart;
       
   240     UChar* target = *targetStart;
       
   241     while (source < sourceEnd) {
       
   242         UChar32 ch = 0;
       
   243         int extraBytesToRead = UTF8SequenceLength(*source) - 1;
       
   244         if (source + extraBytesToRead >= sourceEnd) {
       
   245             result = sourceExhausted;
       
   246             break;
       
   247         }
       
   248         // Do this check whether lenient or strict
       
   249         if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), extraBytesToRead + 1)) {
       
   250             result = sourceIllegal;
       
   251             break;
       
   252         }
       
   253         // The cases all fall through.
       
   254         switch (extraBytesToRead) {
       
   255             case 5: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8
       
   256             case 4: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8
       
   257             case 3: ch += static_cast<unsigned char>(*source++); ch <<= 6;
       
   258             case 2: ch += static_cast<unsigned char>(*source++); ch <<= 6;
       
   259             case 1: ch += static_cast<unsigned char>(*source++); ch <<= 6;
       
   260             case 0: ch += static_cast<unsigned char>(*source++);
       
   261         }
       
   262         ch -= offsetsFromUTF8[extraBytesToRead];
       
   263 
       
   264         if (target >= targetEnd) {
       
   265             source -= (extraBytesToRead + 1); // Back up source pointer!
       
   266             result = targetExhausted; break;
       
   267         }
       
   268         if (ch <= 0xFFFF) {
       
   269             // UTF-16 surrogate values are illegal in UTF-32
       
   270             if (ch >= 0xD800 && ch <= 0xDFFF) {
       
   271                 if (strict) {
       
   272                     source -= (extraBytesToRead + 1); // return to the illegal value itself
       
   273                     result = sourceIllegal;
       
   274                     break;
       
   275                 } else
       
   276                     *target++ = 0xFFFD;
       
   277             } else
       
   278                 *target++ = (UChar)ch; // normal case
       
   279         } else if (ch > 0x10FFFF) {
       
   280             if (strict) {
       
   281                 result = sourceIllegal;
       
   282                 source -= (extraBytesToRead + 1); // return to the start
       
   283                 break; // Bail out; shouldn't continue
       
   284             } else
       
   285                 *target++ = 0xFFFD;
       
   286         } else {
       
   287             // target is a character in range 0xFFFF - 0x10FFFF
       
   288             if (target + 1 >= targetEnd) {
       
   289                 source -= (extraBytesToRead + 1); // Back up source pointer!
       
   290                 result = targetExhausted;
       
   291                 break;
       
   292             }
       
   293             ch -= 0x0010000UL;
       
   294             *target++ = (UChar)((ch >> 10) + 0xD800);
       
   295             *target++ = (UChar)((ch & 0x03FF) + 0xDC00);
       
   296         }
       
   297     }
       
   298     *sourceStart = source;
       
   299     *targetStart = target;
       
   300     return result;
       
   301 }
       
   302 
       
   303 }
       
   304 }