JavaScriptCore/pcre/pcre_compile.cpp
changeset 0 4f2f89ce4247
equal deleted inserted replaced
-1:000000000000 0:4f2f89ce4247
       
     1 /* This is JavaScriptCore's variant of the PCRE library. While this library
       
     2 started out as a copy of PCRE, many of the features of PCRE have been
       
     3 removed. This library now supports only the regular expression features
       
     4 required by the JavaScript language specification, and has only the functions
       
     5 needed by JavaScriptCore and the rest of WebKit.
       
     6 
       
     7                  Originally written by Philip Hazel
       
     8            Copyright (c) 1997-2006 University of Cambridge
       
     9     Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
       
    10     Copyright (C) 2007 Eric Seidel <eric@webkit.org>
       
    11 
       
    12 -----------------------------------------------------------------------------
       
    13 Redistribution and use in source and binary forms, with or without
       
    14 modification, are permitted provided that the following conditions are met:
       
    15 
       
    16     * Redistributions of source code must retain the above copyright notice,
       
    17       this list of conditions and the following disclaimer.
       
    18 
       
    19     * Redistributions in binary form must reproduce the above copyright
       
    20       notice, this list of conditions and the following disclaimer in the
       
    21       documentation and/or other materials provided with the distribution.
       
    22 
       
    23     * Neither the name of the University of Cambridge nor the names of its
       
    24       contributors may be used to endorse or promote products derived from
       
    25       this software without specific prior written permission.
       
    26 
       
    27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
       
    28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       
    29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       
    30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
       
    31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       
    32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       
    33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       
    34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       
    35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       
    36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       
    37 POSSIBILITY OF SUCH DAMAGE.
       
    38 -----------------------------------------------------------------------------
       
    39 */
       
    40 
       
    41 /* This module contains the external function jsRegExpExecute(), along with
       
    42 supporting internal functions that are not used by other modules. */
       
    43 
       
    44 #include "config.h"
       
    45 
       
    46 #include "pcre_internal.h"
       
    47 
       
    48 #include <string.h>
       
    49 #include <wtf/ASCIICType.h>
       
    50 #include <wtf/FastMalloc.h>
       
    51 #include <wtf/FixedArray.h>
       
    52 
       
    53 using namespace WTF;
       
    54 
       
    55 /* Negative values for the firstchar and reqchar variables */
       
    56 
       
    57 #define REQ_UNSET (-2)
       
    58 #define REQ_NONE  (-1)
       
    59 
       
    60 /*************************************************
       
    61 *      Code parameters and static tables         *
       
    62 *************************************************/
       
    63 
       
    64 /* Maximum number of items on the nested bracket stacks at compile time. This
       
    65 applies to the nesting of all kinds of parentheses. It does not limit
       
    66 un-nested, non-capturing parentheses. This number can be made bigger if
       
    67 necessary - it is used to dimension one int and one unsigned char vector at
       
    68 compile time. */
       
    69 
       
    70 #define BRASTACK_SIZE 200
       
    71 
       
    72 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
       
    73 are simple data values; negative values are for special things like \d and so
       
    74 on. Zero means further processing is needed (for things like \x), or the escape
       
    75 is invalid. */
       
    76 
       
    77 static const short escapes[] = {
       
    78      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
       
    79      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
       
    80    '@',      0, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */
       
    81      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
       
    82      0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
       
    83      0,      0,      0,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
       
    84    '`',      7, -ESC_b,      0, -ESC_d,      0,   '\f',      0,   /* ` - g */
       
    85      0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */
       
    86      0,      0,    '\r', -ESC_s,   '\t',      0,  '\v', -ESC_w,   /* p - w */
       
    87      0,      0,      0                                            /* x - z */
       
    88 };
       
    89 
       
    90 /* Error code numbers. They are given names so that they can more easily be
       
    91 tracked. */
       
    92 
       
    93 enum ErrorCode {
       
    94     ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
       
    95     ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17
       
    96 };
       
    97 
       
    98 /* The texts of compile-time error messages. These are "char *" because they
       
    99 are passed to the outside world. */
       
   100 
       
   101 static const char* errorText(ErrorCode code)
       
   102 {
       
   103     static const char errorTexts[] =
       
   104       /* 1 */
       
   105       "\\ at end of pattern\0"
       
   106       "\\c at end of pattern\0"
       
   107       "character value in \\x{...} sequence is too large\0"
       
   108       "numbers out of order in {} quantifier\0"
       
   109       /* 5 */
       
   110       "number too big in {} quantifier\0"
       
   111       "missing terminating ] for character class\0"
       
   112       "internal error: code overflow\0"
       
   113       "range out of order in character class\0"
       
   114       "nothing to repeat\0"
       
   115       /* 10 */
       
   116       "unmatched parentheses\0"
       
   117       "internal error: unexpected repeat\0"
       
   118       "unrecognized character after (?\0"
       
   119       "failed to get memory\0"
       
   120       "missing )\0"
       
   121       /* 15 */
       
   122       "reference to non-existent subpattern\0"
       
   123       "regular expression too large\0"
       
   124       "parentheses nested too deeply"
       
   125     ;
       
   126 
       
   127     int i = code;
       
   128     const char* text = errorTexts;
       
   129     while (i > 1)
       
   130         i -= !*text++;
       
   131     return text;
       
   132 }
       
   133 
       
   134 /* Structure for passing "static" information around between the functions
       
   135 doing the compiling. */
       
   136 
       
   137 struct CompileData {
       
   138     CompileData() {
       
   139         topBackref = 0;
       
   140         backrefMap = 0;
       
   141         reqVaryOpt = 0;
       
   142         needOuterBracket = false;
       
   143         numCapturingBrackets = 0;
       
   144     }
       
   145     int topBackref;            /* Maximum back reference */
       
   146     unsigned backrefMap;       /* Bitmap of low back refs */
       
   147     int reqVaryOpt;            /* "After variable item" flag for reqByte */
       
   148     bool needOuterBracket;
       
   149     int numCapturingBrackets;
       
   150 };
       
   151 
       
   152 /* Definitions to allow mutual recursion */
       
   153 
       
   154 static bool compileBracket(int, int*, unsigned char**, const UChar**, const UChar*, ErrorCode*, int, int*, int*, CompileData&);
       
   155 static bool bracketIsAnchored(const unsigned char* code);
       
   156 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap);
       
   157 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert);
       
   158 
       
   159 /*************************************************
       
   160 *            Handle escapes                      *
       
   161 *************************************************/
       
   162 
       
   163 /* This function is called when a \ has been encountered. It either returns a
       
   164 positive value for a simple escape such as \n, or a negative value which
       
   165 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
       
   166 a positive value greater than 255 may be returned. On entry, ptr is pointing at
       
   167 the \. On exit, it is on the final character of the escape sequence.
       
   168 
       
   169 Arguments:
       
   170   ptrPtr         points to the pattern position pointer
       
   171   errorCodePtr   points to the errorcode variable
       
   172   bracount       number of previous extracting brackets
       
   173   options        the options bits
       
   174   isClass        true if inside a character class
       
   175 
       
   176 Returns:         zero or positive => a data character
       
   177                  negative => a special escape sequence
       
   178                  on error, errorPtr is set
       
   179 */
       
   180 
       
   181 static int checkEscape(const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int bracount, bool isClass)
       
   182 {
       
   183     const UChar* ptr = *ptrPtr + 1;
       
   184 
       
   185     /* If backslash is at the end of the pattern, it's an error. */
       
   186     if (ptr == patternEnd) {
       
   187         *errorCodePtr = ERR1;
       
   188         *ptrPtr = ptr;
       
   189         return 0;
       
   190     }
       
   191     
       
   192     int c = *ptr;
       
   193     
       
   194     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
       
   195      a table. A non-zero result is something that can be returned immediately.
       
   196      Otherwise further processing may be required. */
       
   197     
       
   198     if (c < '0' || c > 'z') { /* Not alphameric */
       
   199     } else if (int escapeValue = escapes[c - '0']) {
       
   200         c = escapeValue;
       
   201         if (isClass) {
       
   202             if (-c == ESC_b)
       
   203                 c = '\b'; /* \b is backslash in a class */
       
   204             else if (-c == ESC_B)
       
   205                 c = 'B'; /* and \B is a capital B in a class (in browsers event though ECMAScript 15.10.2.19 says it raises an error) */
       
   206         }
       
   207     /* Escapes that need further processing, or are illegal. */
       
   208     
       
   209     } else {
       
   210         switch (c) {
       
   211             case '1':
       
   212             case '2':
       
   213             case '3':
       
   214             case '4':
       
   215             case '5':
       
   216             case '6':
       
   217             case '7':
       
   218             case '8':
       
   219             case '9':
       
   220                 /* Escape sequences starting with a non-zero digit are backreferences,
       
   221                  unless there are insufficient brackets, in which case they are octal
       
   222                  escape sequences. Those sequences end on the first non-octal character
       
   223                  or when we overflow 0-255, whichever comes first. */
       
   224                 
       
   225                 if (!isClass) {
       
   226                     const UChar* oldptr = ptr;
       
   227                     c -= '0';
       
   228                     while ((ptr + 1 < patternEnd) && isASCIIDigit(ptr[1]) && c <= bracount)
       
   229                         c = c * 10 + *(++ptr) - '0';
       
   230                     if (c <= bracount) {
       
   231                         c = -(ESC_REF + c);
       
   232                         break;
       
   233                     }
       
   234                     ptr = oldptr;      /* Put the pointer back and fall through */
       
   235                 }
       
   236                 
       
   237                 /* Handle an octal number following \. If the first digit is 8 or 9,
       
   238                  this is not octal. */
       
   239                 
       
   240                 if ((c = *ptr) >= '8') {
       
   241                     c = '\\';
       
   242                     ptr -= 1;
       
   243                     break;
       
   244                 }
       
   245 
       
   246             /* \0 always starts an octal number, but we may drop through to here with a
       
   247              larger first octal digit. */
       
   248 
       
   249             case '0': {
       
   250                 c -= '0';
       
   251                 int i;
       
   252                 for (i = 1; i <= 2; ++i) {
       
   253                     if (ptr + i >= patternEnd || ptr[i] < '0' || ptr[i] > '7')
       
   254                         break;
       
   255                     int cc = c * 8 + ptr[i] - '0';
       
   256                     if (cc > 255)
       
   257                         break;
       
   258                     c = cc;
       
   259                 }
       
   260                 ptr += i - 1;
       
   261                 break;
       
   262             }
       
   263 
       
   264             case 'x': {
       
   265                 c = 0;
       
   266                 int i;
       
   267                 for (i = 1; i <= 2; ++i) {
       
   268                     if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) {
       
   269                         c = 'x';
       
   270                         i = 1;
       
   271                         break;
       
   272                     }
       
   273                     int cc = ptr[i];
       
   274                     if (cc >= 'a')
       
   275                         cc -= 32;             /* Convert to upper case */
       
   276                     c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
       
   277                 }
       
   278                 ptr += i - 1;
       
   279                 break;
       
   280             }
       
   281 
       
   282             case 'u': {
       
   283                 c = 0;
       
   284                 int i;
       
   285                 for (i = 1; i <= 4; ++i) {
       
   286                     if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) {
       
   287                         c = 'u';
       
   288                         i = 1;
       
   289                         break;
       
   290                     }
       
   291                     int cc = ptr[i];
       
   292                     if (cc >= 'a')
       
   293                         cc -= 32;             /* Convert to upper case */
       
   294                     c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
       
   295                 }
       
   296                 ptr += i - 1;
       
   297                 break;
       
   298             }
       
   299 
       
   300             case 'c':
       
   301                 if (++ptr == patternEnd) {
       
   302                     *errorCodePtr = ERR2;
       
   303                     return 0;
       
   304                 }
       
   305                 
       
   306                 c = *ptr;
       
   307 
       
   308                 /* To match Firefox, inside a character class, we also accept
       
   309                    numbers and '_' as control characters */
       
   310                 if ((!isClass && !isASCIIAlpha(c)) || (!isASCIIAlphanumeric(c) && c != '_')) {
       
   311                     c = '\\';
       
   312                     ptr -= 2;
       
   313                     break;
       
   314                 }
       
   315 
       
   316                 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
       
   317                  is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
       
   318                 c = toASCIIUpper(c) ^ 0x40;
       
   319                 break;
       
   320             }
       
   321     }
       
   322     
       
   323     *ptrPtr = ptr;
       
   324     return c;
       
   325 }
       
   326 
       
   327 /*************************************************
       
   328 *            Check for counted repeat            *
       
   329 *************************************************/
       
   330 
       
   331 /* This function is called when a '{' is encountered in a place where it might
       
   332 start a quantifier. It looks ahead to see if it really is a quantifier or not.
       
   333 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
       
   334 where the ddds are digits.
       
   335 
       
   336 Arguments:
       
   337   p         pointer to the first char after '{'
       
   338 
       
   339 Returns:    true or false
       
   340 */
       
   341 
       
   342 static bool isCountedRepeat(const UChar* p, const UChar* patternEnd)
       
   343 {
       
   344     if (p >= patternEnd || !isASCIIDigit(*p))
       
   345         return false;
       
   346     p++;
       
   347     while (p < patternEnd && isASCIIDigit(*p))
       
   348         p++;
       
   349     if (p < patternEnd && *p == '}')
       
   350         return true;
       
   351     
       
   352     if (p >= patternEnd || *p++ != ',')
       
   353         return false;
       
   354     if (p < patternEnd && *p == '}')
       
   355         return true;
       
   356     
       
   357     if (p >= patternEnd || !isASCIIDigit(*p))
       
   358         return false;
       
   359     p++;
       
   360     while (p < patternEnd && isASCIIDigit(*p))
       
   361         p++;
       
   362     
       
   363     return (p < patternEnd && *p == '}');
       
   364 }
       
   365 
       
   366 /*************************************************
       
   367 *         Read repeat counts                     *
       
   368 *************************************************/
       
   369 
       
   370 /* Read an item of the form {n,m} and return the values. This is called only
       
   371 after isCountedRepeat() has confirmed that a repeat-count quantifier exists,
       
   372 so the syntax is guaranteed to be correct, but we need to check the values.
       
   373 
       
   374 Arguments:
       
   375   p              pointer to first char after '{'
       
   376   minp           pointer to int for min
       
   377   maxp           pointer to int for max
       
   378                  returned as -1 if no max
       
   379   errorCodePtr   points to error code variable
       
   380 
       
   381 Returns:         pointer to '}' on success;
       
   382                  current ptr on error, with errorCodePtr set non-zero
       
   383 */
       
   384 
       
   385 static const UChar* readRepeatCounts(const UChar* p, int* minp, int* maxp, ErrorCode* errorCodePtr)
       
   386 {
       
   387     int min = 0;
       
   388     int max = -1;
       
   389     
       
   390     /* Read the minimum value and do a paranoid check: a negative value indicates
       
   391      an integer overflow. */
       
   392     
       
   393     while (isASCIIDigit(*p))
       
   394         min = min * 10 + *p++ - '0';
       
   395     if (min < 0 || min > 65535) {
       
   396         *errorCodePtr = ERR5;
       
   397         return p;
       
   398     }
       
   399     
       
   400     /* Read the maximum value if there is one, and again do a paranoid on its size.
       
   401      Also, max must not be less than min. */
       
   402     
       
   403     if (*p == '}')
       
   404         max = min;
       
   405     else {
       
   406         if (*(++p) != '}') {
       
   407             max = 0;
       
   408             while (isASCIIDigit(*p))
       
   409                 max = max * 10 + *p++ - '0';
       
   410             if (max < 0 || max > 65535) {
       
   411                 *errorCodePtr = ERR5;
       
   412                 return p;
       
   413             }
       
   414             if (max < min) {
       
   415                 *errorCodePtr = ERR4;
       
   416                 return p;
       
   417             }
       
   418         }
       
   419     }
       
   420     
       
   421     /* Fill in the required variables, and pass back the pointer to the terminating
       
   422      '}'. */
       
   423     
       
   424     *minp = min;
       
   425     *maxp = max;
       
   426     return p;
       
   427 }
       
   428 
       
   429 /*************************************************
       
   430 *      Find first significant op code            *
       
   431 *************************************************/
       
   432 
       
   433 /* This is called by several functions that scan a compiled expression looking
       
   434 for a fixed first character, or an anchoring op code etc. It skips over things
       
   435 that do not influence this.
       
   436 
       
   437 Arguments:
       
   438   code         pointer to the start of the group
       
   439 Returns:       pointer to the first significant opcode
       
   440 */
       
   441 
       
   442 static const unsigned char* firstSignificantOpcode(const unsigned char* code)
       
   443 {
       
   444     while (*code == OP_BRANUMBER)
       
   445         code += 3;
       
   446     return code;
       
   447 }
       
   448 
       
   449 static const unsigned char* firstSignificantOpcodeSkippingAssertions(const unsigned char* code)
       
   450 {
       
   451     while (true) {
       
   452         switch (*code) {
       
   453             case OP_ASSERT_NOT:
       
   454                 advanceToEndOfBracket(code);
       
   455                 code += 1 + LINK_SIZE;
       
   456                 break;
       
   457             case OP_WORD_BOUNDARY:
       
   458             case OP_NOT_WORD_BOUNDARY:
       
   459                 ++code;
       
   460                 break;
       
   461             case OP_BRANUMBER:
       
   462                 code += 3;
       
   463                 break;
       
   464             default:
       
   465                 return code;
       
   466         }
       
   467     }
       
   468 }
       
   469 
       
   470 /*************************************************
       
   471 *           Get othercase range                  *
       
   472 *************************************************/
       
   473 
       
   474 /* This function is passed the start and end of a class range, in UTF-8 mode
       
   475 with UCP support. It searches up the characters, looking for internal ranges of
       
   476 characters in the "other" case. Each call returns the next one, updating the
       
   477 start address.
       
   478 
       
   479 Arguments:
       
   480   cptr        points to starting character value; updated
       
   481   d           end value
       
   482   ocptr       where to put start of othercase range
       
   483   odptr       where to put end of othercase range
       
   484 
       
   485 Yield:        true when range returned; false when no more
       
   486 */
       
   487 
       
   488 static bool getOthercaseRange(int* cptr, int d, int* ocptr, int* odptr)
       
   489 {
       
   490     int c, othercase = 0;
       
   491     
       
   492     for (c = *cptr; c <= d; c++) {
       
   493         if ((othercase = jsc_pcre_ucp_othercase(c)) >= 0)
       
   494             break;
       
   495     }
       
   496     
       
   497     if (c > d)
       
   498         return false;
       
   499     
       
   500     *ocptr = othercase;
       
   501     int next = othercase + 1;
       
   502     
       
   503     for (++c; c <= d; c++) {
       
   504         if (jsc_pcre_ucp_othercase(c) != next)
       
   505             break;
       
   506         next++;
       
   507     }
       
   508     
       
   509     *odptr = next - 1;
       
   510     *cptr = c;
       
   511     
       
   512     return true;
       
   513 }
       
   514 
       
   515 /*************************************************
       
   516  *       Convert character value to UTF-8         *
       
   517  *************************************************/
       
   518 
       
   519 /* This function takes an integer value in the range 0 - 0x7fffffff
       
   520  and encodes it as a UTF-8 character in 0 to 6 bytes.
       
   521  
       
   522  Arguments:
       
   523  cvalue     the character value
       
   524  buffer     pointer to buffer for result - at least 6 bytes long
       
   525  
       
   526  Returns:     number of characters placed in the buffer
       
   527  */
       
   528 
       
   529 static int encodeUTF8(int cvalue, unsigned char *buffer)
       
   530 {
       
   531     int i;
       
   532     for (i = 0; i < jsc_pcre_utf8_table1_size; i++)
       
   533         if (cvalue <= jsc_pcre_utf8_table1[i])
       
   534             break;
       
   535     buffer += i;
       
   536     for (int j = i; j > 0; j--) {
       
   537         *buffer-- = 0x80 | (cvalue & 0x3f);
       
   538         cvalue >>= 6;
       
   539     }
       
   540     *buffer = jsc_pcre_utf8_table2[i] | cvalue;
       
   541     return i + 1;
       
   542 }
       
   543 
       
   544 /*************************************************
       
   545 *           Compile one branch                   *
       
   546 *************************************************/
       
   547 
       
   548 /* Scan the pattern, compiling it into the code vector.
       
   549 
       
   550 Arguments:
       
   551   options        the option bits
       
   552   brackets       points to number of extracting brackets used
       
   553   codePtr        points to the pointer to the current code point
       
   554   ptrPtr         points to the current pattern pointer
       
   555   errorCodePtr   points to error code variable
       
   556   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
       
   557   reqbyteptr     set to the last literal character required, else < 0
       
   558   cd             contains pointers to tables etc.
       
   559 
       
   560 Returns:         true on success
       
   561                  false, with *errorCodePtr set non-zero on error
       
   562 */
       
   563 
       
   564 static inline bool safelyCheckNextChar(const UChar* ptr, const UChar* patternEnd, UChar expected)
       
   565 {
       
   566     return ((ptr + 1 < patternEnd) && ptr[1] == expected);
       
   567 }
       
   568 
       
   569 static bool
       
   570 compileBranch(int options, int* brackets, unsigned char** codePtr,
       
   571                const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int *firstbyteptr,
       
   572                int* reqbyteptr, CompileData& cd)
       
   573 {
       
   574     int repeatType, opType;
       
   575     int repeatMin = 0, repeat_max = 0;      /* To please picky compilers */
       
   576     int bravalue = 0;
       
   577     int reqvary, tempreqvary;
       
   578     int c;
       
   579     unsigned char* code = *codePtr;
       
   580     unsigned char* tempcode;
       
   581     bool didGroupSetFirstByte = false;
       
   582     const UChar* ptr = *ptrPtr;
       
   583     const UChar* tempptr;
       
   584     unsigned char* previous = NULL;
       
   585     unsigned char classbits[32];
       
   586     
       
   587     bool class_utf8;
       
   588     unsigned char* class_utf8data;
       
   589     unsigned char utf8_char[6];
       
   590     
       
   591     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
       
   592      matching encountered yet". It gets changed to REQ_NONE if we hit something that
       
   593      matches a non-fixed char first char; reqByte just remains unset if we never
       
   594      find one.
       
   595      
       
   596      When we hit a repeat whose minimum is zero, we may have to adjust these values
       
   597      to take the zero repeat into account. This is implemented by setting them to
       
   598      zeroFirstByte and zeroReqByte when such a repeat is encountered. The individual
       
   599      item types that can be repeated set these backoff variables appropriately. */
       
   600     
       
   601     int firstByte = REQ_UNSET;
       
   602     int reqByte = REQ_UNSET;
       
   603     int zeroReqByte = REQ_UNSET;
       
   604     int zeroFirstByte = REQ_UNSET;
       
   605     
       
   606     /* The variable reqCaseOpt contains either the REQ_IGNORE_CASE value or zero,
       
   607      according to the current setting of the ignores-case flag. REQ_IGNORE_CASE is a bit
       
   608      value > 255. It is added into the firstByte or reqByte variables to record the
       
   609      case status of the value. This is used only for ASCII characters. */
       
   610     
       
   611     int reqCaseOpt = (options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0;
       
   612     
       
   613     /* Switch on next character until the end of the branch */
       
   614     
       
   615     for (;; ptr++) {
       
   616         bool negateClass;
       
   617         bool shouldFlipNegation; /* If a negative special such as \S is used, we should negate the whole class to properly support Unicode. */
       
   618         int classCharCount;
       
   619         int classLastChar;
       
   620         int skipBytes;
       
   621         int subReqByte;
       
   622         int subFirstByte;
       
   623         int mcLength;
       
   624         unsigned char mcbuffer[8];
       
   625         
       
   626         /* Next byte in the pattern */
       
   627         
       
   628         c = ptr < patternEnd ? *ptr : 0;
       
   629         
       
   630         /* Fill in length of a previous callout, except when the next thing is
       
   631          a quantifier. */
       
   632         
       
   633         bool isQuantifier = c == '*' || c == '+' || c == '?' || (c == '{' && isCountedRepeat(ptr + 1, patternEnd));
       
   634         
       
   635         switch (c) {
       
   636             /* The branch terminates at end of string, |, or ). */
       
   637                 
       
   638             case 0:
       
   639                 if (ptr < patternEnd)
       
   640                     goto NORMAL_CHAR;
       
   641                 // End of string; fall through
       
   642             case '|':
       
   643             case ')':
       
   644                 *firstbyteptr = firstByte;
       
   645                 *reqbyteptr = reqByte;
       
   646                 *codePtr = code;
       
   647                 *ptrPtr = ptr;
       
   648                 return true;
       
   649                 
       
   650             /* Handle single-character metacharacters. In multiline mode, ^ disables
       
   651              the setting of any following char as a first character. */
       
   652 
       
   653             case '^':
       
   654                 if (options & MatchAcrossMultipleLinesOption) {
       
   655                     if (firstByte == REQ_UNSET)
       
   656                         firstByte = REQ_NONE;
       
   657                     *code++ = OP_BOL;
       
   658                 } else
       
   659                     *code++ = OP_CIRC;
       
   660                 previous = NULL;
       
   661                 break;
       
   662 
       
   663             case '$':
       
   664                 previous = NULL;
       
   665                 if (options & MatchAcrossMultipleLinesOption)
       
   666                   *code++ = OP_EOL;
       
   667                 else
       
   668                   *code++ = OP_DOLL;
       
   669                 break;
       
   670 
       
   671             /* There can never be a first char if '.' is first, whatever happens about
       
   672              repeats. The value of reqByte doesn't change either. */
       
   673 
       
   674             case '.':
       
   675                 if (firstByte == REQ_UNSET)
       
   676                     firstByte = REQ_NONE;
       
   677                 zeroFirstByte = firstByte;
       
   678                 zeroReqByte = reqByte;
       
   679                 previous = code;
       
   680                 *code++ = OP_NOT_NEWLINE;
       
   681                 break;
       
   682                 
       
   683             /* Character classes. If the included characters are all < 256, we build a
       
   684              32-byte bitmap of the permitted characters, except in the special case
       
   685              where there is only one such character. For negated classes, we build the
       
   686              map as usual, then invert it at the end. However, we use a different opcode
       
   687              so that data characters > 255 can be handled correctly.
       
   688              
       
   689              If the class contains characters outside the 0-255 range, a different
       
   690              opcode is compiled. It may optionally have a bit map for characters < 256,
       
   691              but those above are are explicitly listed afterwards. A flag byte tells
       
   692              whether the bitmap is present, and whether this is a negated class or not.
       
   693              */
       
   694                 
       
   695             case '[': {
       
   696                 previous = code;
       
   697                 shouldFlipNegation = false;
       
   698                 
       
   699                 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
       
   700                  they are encountered at the top level, so we'll do that too. */
       
   701                 
       
   702                 /* If the first character is '^', set the negation flag and skip it. */
       
   703 
       
   704                 if (ptr + 1 >= patternEnd) {
       
   705                     *errorCodePtr = ERR6;
       
   706                     return false;
       
   707                 }
       
   708 
       
   709                 if (ptr[1] == '^') {
       
   710                     negateClass = true;
       
   711                     ++ptr;
       
   712                 } else
       
   713                     negateClass = false;
       
   714                 
       
   715                 /* Keep a count of chars with values < 256 so that we can optimize the case
       
   716                  of just a single character (as long as it's < 256). For higher valued UTF-8
       
   717                  characters, we don't yet do any optimization. */
       
   718                 
       
   719                 classCharCount = 0;
       
   720                 classLastChar = -1;
       
   721                 
       
   722                 class_utf8 = false;                       /* No chars >= 256 */
       
   723                 class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */
       
   724                 
       
   725                 /* Initialize the 32-char bit map to all zeros. We have to build the
       
   726                  map in a temporary bit of store, in case the class contains only 1
       
   727                  character (< 256), because in that case the compiled code doesn't use the
       
   728                  bit map. */
       
   729                 
       
   730                 memset(classbits, 0, 32 * sizeof(unsigned char));
       
   731                 
       
   732                 /* Process characters until ] is reached. The first pass
       
   733                  through the regex checked the overall syntax, so we don't need to be very
       
   734                  strict here. At the start of the loop, c contains the first byte of the
       
   735                  character. */
       
   736 
       
   737                 while ((++ptr < patternEnd) && (c = *ptr) != ']') {
       
   738                     /* Backslash may introduce a single character, or it may introduce one
       
   739                      of the specials, which just set a flag. Escaped items are checked for
       
   740                      validity in the pre-compiling pass. The sequence \b is a special case.
       
   741                      Inside a class (and only there) it is treated as backspace. Elsewhere
       
   742                      it marks a word boundary. Other escapes have preset maps ready to
       
   743                      or into the one we are building. We assume they have more than one
       
   744                      character in them, so set classCharCount bigger than one. */
       
   745                     
       
   746                     if (c == '\\') {
       
   747                         c = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, true);
       
   748                         if (c < 0) {
       
   749                             classCharCount += 2;     /* Greater than 1 is what matters */
       
   750                             switch (-c) {
       
   751                                 case ESC_d:
       
   752                                     for (c = 0; c < 32; c++)
       
   753                                         classbits[c] |= classBitmapForChar(c + cbit_digit);
       
   754                                     continue;
       
   755                                     
       
   756                                 case ESC_D:
       
   757                                     shouldFlipNegation = true;
       
   758                                     for (c = 0; c < 32; c++)
       
   759                                         classbits[c] |= ~classBitmapForChar(c + cbit_digit);
       
   760                                     continue;
       
   761                                     
       
   762                                 case ESC_w:
       
   763                                     for (c = 0; c < 32; c++)
       
   764                                         classbits[c] |= classBitmapForChar(c + cbit_word);
       
   765                                     continue;
       
   766                                     
       
   767                                 case ESC_W:
       
   768                                     shouldFlipNegation = true;
       
   769                                     for (c = 0; c < 32; c++)
       
   770                                         classbits[c] |= ~classBitmapForChar(c + cbit_word);
       
   771                                     continue;
       
   772                                     
       
   773                                 case ESC_s:
       
   774                                     for (c = 0; c < 32; c++)
       
   775                                          classbits[c] |= classBitmapForChar(c + cbit_space);
       
   776                                     continue;
       
   777                                     
       
   778                                 case ESC_S:
       
   779                                     shouldFlipNegation = true;
       
   780                                     for (c = 0; c < 32; c++)
       
   781                                          classbits[c] |= ~classBitmapForChar(c + cbit_space);
       
   782                                     continue;
       
   783                                     
       
   784                                     /* Unrecognized escapes are faulted if PCRE is running in its
       
   785                                      strict mode. By default, for compatibility with Perl, they are
       
   786                                      treated as literals. */
       
   787                                     
       
   788                                 default:
       
   789                                     c = *ptr;              /* The final character */
       
   790                                     classCharCount -= 2;  /* Undo the default count from above */
       
   791                             }
       
   792                         }
       
   793                         
       
   794                         /* Fall through if we have a single character (c >= 0). This may be
       
   795                          > 256 in UTF-8 mode. */
       
   796                         
       
   797                     }   /* End of backslash handling */
       
   798                     
       
   799                     /* A single character may be followed by '-' to form a range. However,
       
   800                      Perl does not permit ']' to be the end of the range. A '-' character
       
   801                      here is treated as a literal. */
       
   802                     
       
   803                     if ((ptr + 2 < patternEnd) && ptr[1] == '-' && ptr[2] != ']') {
       
   804                         ptr += 2;
       
   805                         
       
   806                         int d = *ptr;
       
   807                         
       
   808                         /* The second part of a range can be a single-character escape, but
       
   809                          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
       
   810                          in such circumstances. */
       
   811                         
       
   812                         if (d == '\\') {
       
   813                             const UChar* oldptr = ptr;
       
   814                             d = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, true);
       
   815                             
       
   816                             /* \X is literal X; any other special means the '-' was literal */
       
   817                             if (d < 0) {
       
   818                                 ptr = oldptr - 2;
       
   819                                 goto LONE_SINGLE_CHARACTER;  /* A few lines below */
       
   820                             }
       
   821                         }
       
   822                         
       
   823                         /* The check that the two values are in the correct order happens in
       
   824                          the pre-pass. Optimize one-character ranges */
       
   825                         
       
   826                         if (d == c)
       
   827                             goto LONE_SINGLE_CHARACTER;  /* A few lines below */
       
   828                         
       
   829                         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
       
   830                          matching, we have to use an XCLASS with extra data items. Caseless
       
   831                          matching for characters > 127 is available only if UCP support is
       
   832                          available. */
       
   833                         
       
   834                         if ((d > 255 || ((options & IgnoreCaseOption) && d > 127))) {
       
   835                             class_utf8 = true;
       
   836                             
       
   837                             /* With UCP support, we can find the other case equivalents of
       
   838                              the relevant characters. There may be several ranges. Optimize how
       
   839                              they fit with the basic range. */
       
   840                             
       
   841                             if (options & IgnoreCaseOption) {
       
   842                                 int occ, ocd;
       
   843                                 int cc = c;
       
   844                                 int origd = d;
       
   845                                 while (getOthercaseRange(&cc, origd, &occ, &ocd)) {
       
   846                                     if (occ >= c && ocd <= d)
       
   847                                         continue;  /* Skip embedded ranges */
       
   848                                     
       
   849                                     if (occ < c  && ocd >= c - 1)        /* Extend the basic range */
       
   850                                     {                                  /* if there is overlap,   */
       
   851                                         c = occ;                           /* noting that if occ < c */
       
   852                                         continue;                          /* we can't have ocd > d  */
       
   853                                     }                                  /* because a subrange is  */
       
   854                                     if (ocd > d && occ <= d + 1)         /* always shorter than    */
       
   855                                     {                                  /* the basic range.       */
       
   856                                         d = ocd;
       
   857                                         continue;
       
   858                                     }
       
   859                                     
       
   860                                     if (occ == ocd)
       
   861                                         *class_utf8data++ = XCL_SINGLE;
       
   862                                     else {
       
   863                                         *class_utf8data++ = XCL_RANGE;
       
   864                                         class_utf8data += encodeUTF8(occ, class_utf8data);
       
   865                                     }
       
   866                                     class_utf8data += encodeUTF8(ocd, class_utf8data);
       
   867                                 }
       
   868                             }
       
   869                             
       
   870                             /* Now record the original range, possibly modified for UCP caseless
       
   871                              overlapping ranges. */
       
   872                             
       
   873                             *class_utf8data++ = XCL_RANGE;
       
   874                             class_utf8data += encodeUTF8(c, class_utf8data);
       
   875                             class_utf8data += encodeUTF8(d, class_utf8data);
       
   876                             
       
   877                             /* With UCP support, we are done. Without UCP support, there is no
       
   878                              caseless matching for UTF-8 characters > 127; we can use the bit map
       
   879                              for the smaller ones. */
       
   880                             
       
   881                             continue;    /* With next character in the class */
       
   882                         }
       
   883                         
       
   884                         /* We use the bit map for all cases when not in UTF-8 mode; else
       
   885                          ranges that lie entirely within 0-127 when there is UCP support; else
       
   886                          for partial ranges without UCP support. */
       
   887                         
       
   888                         for (; c <= d; c++) {
       
   889                             classbits[c/8] |= (1 << (c&7));
       
   890                             if (options & IgnoreCaseOption) {
       
   891                                 int uc = flipCase(c);
       
   892                                 classbits[uc/8] |= (1 << (uc&7));
       
   893                             }
       
   894                             classCharCount++;                /* in case a one-char range */
       
   895                             classLastChar = c;
       
   896                         }
       
   897                         
       
   898                         continue;   /* Go get the next char in the class */
       
   899                     }
       
   900                     
       
   901                     /* Handle a lone single character - we can get here for a normal
       
   902                      non-escape char, or after \ that introduces a single character or for an
       
   903                      apparent range that isn't. */
       
   904                     
       
   905                 LONE_SINGLE_CHARACTER:
       
   906                     
       
   907                     /* Handle a character that cannot go in the bit map */
       
   908                     
       
   909                     if ((c > 255 || ((options & IgnoreCaseOption) && c > 127))) {
       
   910                         class_utf8 = true;
       
   911                         *class_utf8data++ = XCL_SINGLE;
       
   912                         class_utf8data += encodeUTF8(c, class_utf8data);
       
   913                         
       
   914                         if (options & IgnoreCaseOption) {
       
   915                             int othercase;
       
   916                             if ((othercase = jsc_pcre_ucp_othercase(c)) >= 0) {
       
   917                                 *class_utf8data++ = XCL_SINGLE;
       
   918                                 class_utf8data += encodeUTF8(othercase, class_utf8data);
       
   919                             }
       
   920                         }
       
   921                     } else {
       
   922                         /* Handle a single-byte character */
       
   923                         classbits[c/8] |= (1 << (c&7));
       
   924                         if (options & IgnoreCaseOption) {
       
   925                             c = flipCase(c);
       
   926                             classbits[c/8] |= (1 << (c&7));
       
   927                         }
       
   928                         classCharCount++;
       
   929                         classLastChar = c;
       
   930                     }
       
   931                 }
       
   932                 
       
   933                 /* If classCharCount is 1, we saw precisely one character whose value is
       
   934                  less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
       
   935                  can optimize the negative case only if there were no characters >= 128
       
   936                  because OP_NOT and the related opcodes like OP_NOTSTAR operate on
       
   937                  single-bytes only. This is an historical hangover. Maybe one day we can
       
   938                  tidy these opcodes to handle multi-byte characters.
       
   939                  
       
   940                  The optimization throws away the bit map. We turn the item into a
       
   941                  1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
       
   942                  that OP_NOT does not support multibyte characters. In the positive case, it
       
   943                  can cause firstByte to be set. Otherwise, there can be no first char if
       
   944                  this item is first, whatever repeat count may follow. In the case of
       
   945                  reqByte, save the previous value for reinstating. */
       
   946                 
       
   947                 if (classCharCount == 1 && (!class_utf8 && (!negateClass || classLastChar < 128))) {
       
   948                     zeroReqByte = reqByte;
       
   949                     
       
   950                     /* The OP_NOT opcode works on one-byte characters only. */
       
   951                     
       
   952                     if (negateClass) {
       
   953                         if (firstByte == REQ_UNSET)
       
   954                             firstByte = REQ_NONE;
       
   955                         zeroFirstByte = firstByte;
       
   956                         *code++ = OP_NOT;
       
   957                         *code++ = classLastChar;
       
   958                         break;
       
   959                     }
       
   960                     
       
   961                     /* For a single, positive character, get the value into c, and
       
   962                      then we can handle this with the normal one-character code. */
       
   963                     
       
   964                     c = classLastChar;
       
   965                     goto NORMAL_CHAR;
       
   966                 }       /* End of 1-char optimization */
       
   967                 
       
   968                 /* The general case - not the one-char optimization. If this is the first
       
   969                  thing in the branch, there can be no first char setting, whatever the
       
   970                  repeat count. Any reqByte setting must remain unchanged after any kind of
       
   971                  repeat. */
       
   972                 
       
   973                 if (firstByte == REQ_UNSET) firstByte = REQ_NONE;
       
   974                 zeroFirstByte = firstByte;
       
   975                 zeroReqByte = reqByte;
       
   976                 
       
   977                 /* If there are characters with values > 255, we have to compile an
       
   978                  extended class, with its own opcode. If there are no characters < 256,
       
   979                  we can omit the bitmap. */
       
   980                 
       
   981                 if (class_utf8 && !shouldFlipNegation) {
       
   982                     *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
       
   983                     *code++ = OP_XCLASS;
       
   984                     code += LINK_SIZE;
       
   985                     *code = negateClass? XCL_NOT : 0;
       
   986                     
       
   987                     /* If the map is required, install it, and move on to the end of
       
   988                      the extra data */
       
   989                     
       
   990                     if (classCharCount > 0) {
       
   991                         *code++ |= XCL_MAP;
       
   992                         memcpy(code, classbits, 32);
       
   993                         code = class_utf8data;
       
   994                     }
       
   995                     
       
   996                     /* If the map is not required, slide down the extra data. */
       
   997                     
       
   998                     else {
       
   999                         int len = class_utf8data - (code + 33);
       
  1000                         memmove(code + 1, code + 33, len);
       
  1001                         code += len + 1;
       
  1002                     }
       
  1003                     
       
  1004                     /* Now fill in the complete length of the item */
       
  1005                     
       
  1006                     putLinkValue(previous + 1, code - previous);
       
  1007                     break;   /* End of class handling */
       
  1008                 }
       
  1009                 
       
  1010                 /* If there are no characters > 255, negate the 32-byte map if necessary,
       
  1011                  and copy it into the code vector. If this is the first thing in the branch,
       
  1012                  there can be no first char setting, whatever the repeat count. Any reqByte
       
  1013                  setting must remain unchanged after any kind of repeat. */
       
  1014                 
       
  1015                 *code++ = (negateClass == shouldFlipNegation) ? OP_CLASS : OP_NCLASS;
       
  1016                 if (negateClass)
       
  1017                     for (c = 0; c < 32; c++)
       
  1018                         code[c] = ~classbits[c];
       
  1019                 else
       
  1020                     memcpy(code, classbits, 32);
       
  1021                 code += 32;
       
  1022                 break;
       
  1023             }
       
  1024                 
       
  1025             /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
       
  1026              has been tested above. */
       
  1027 
       
  1028             case '{':
       
  1029                 if (!isQuantifier)
       
  1030                     goto NORMAL_CHAR;
       
  1031                 ptr = readRepeatCounts(ptr + 1, &repeatMin, &repeat_max, errorCodePtr);
       
  1032                 if (*errorCodePtr)
       
  1033                     goto FAILED;
       
  1034                 goto REPEAT;
       
  1035                 
       
  1036             case '*':
       
  1037                 repeatMin = 0;
       
  1038                 repeat_max = -1;
       
  1039                 goto REPEAT;
       
  1040                 
       
  1041             case '+':
       
  1042                 repeatMin = 1;
       
  1043                 repeat_max = -1;
       
  1044                 goto REPEAT;
       
  1045                 
       
  1046             case '?':
       
  1047                 repeatMin = 0;
       
  1048                 repeat_max = 1;
       
  1049                 
       
  1050             REPEAT:
       
  1051                 if (!previous) {
       
  1052                     *errorCodePtr = ERR9;
       
  1053                     goto FAILED;
       
  1054                 }
       
  1055                 
       
  1056                 if (repeatMin == 0) {
       
  1057                     firstByte = zeroFirstByte;    /* Adjust for zero repeat */
       
  1058                     reqByte = zeroReqByte;        /* Ditto */
       
  1059                 }
       
  1060                 
       
  1061                 /* Remember whether this is a variable length repeat */
       
  1062                 
       
  1063                 reqvary = (repeatMin == repeat_max) ? 0 : REQ_VARY;
       
  1064                 
       
  1065                 opType = 0;                    /* Default single-char op codes */
       
  1066                 
       
  1067                 /* Save start of previous item, in case we have to move it up to make space
       
  1068                  for an inserted OP_ONCE for the additional '+' extension. */
       
  1069                 /* FIXME: Probably don't need this because we don't use OP_ONCE. */
       
  1070                 
       
  1071                 tempcode = previous;
       
  1072                 
       
  1073                 /* If the next character is '+', we have a possessive quantifier. This
       
  1074                  implies greediness, whatever the setting of the PCRE_UNGREEDY option.
       
  1075                  If the next character is '?' this is a minimizing repeat, by default,
       
  1076                  but if PCRE_UNGREEDY is set, it works the other way round. We change the
       
  1077                  repeat type to the non-default. */
       
  1078                 
       
  1079                 if (safelyCheckNextChar(ptr, patternEnd, '?')) {
       
  1080                     repeatType = 1;
       
  1081                     ptr++;
       
  1082                 } else
       
  1083                     repeatType = 0;
       
  1084                 
       
  1085                 /* If previous was a character match, abolish the item and generate a
       
  1086                  repeat item instead. If a char item has a minumum of more than one, ensure
       
  1087                  that it is set in reqByte - it might not be if a sequence such as x{3} is
       
  1088                  the first thing in a branch because the x will have gone into firstByte
       
  1089                  instead.  */
       
  1090                 
       
  1091                 if (*previous == OP_CHAR || *previous == OP_CHAR_IGNORING_CASE) {
       
  1092                     /* Deal with UTF-8 characters that take up more than one byte. It's
       
  1093                      easier to write this out separately than try to macrify it. Use c to
       
  1094                      hold the length of the character in bytes, plus 0x80 to flag that it's a
       
  1095                      length rather than a small character. */
       
  1096                     
       
  1097                     if (code[-1] & 0x80) {
       
  1098                         unsigned char *lastchar = code - 1;
       
  1099                         while((*lastchar & 0xc0) == 0x80)
       
  1100                             lastchar--;
       
  1101                         c = code - lastchar;            /* Length of UTF-8 character */
       
  1102                         memcpy(utf8_char, lastchar, c); /* Save the char */
       
  1103                         c |= 0x80;                      /* Flag c as a length */
       
  1104                     }
       
  1105                     else {
       
  1106                         c = code[-1];
       
  1107                         if (repeatMin > 1)
       
  1108                             reqByte = c | reqCaseOpt | cd.reqVaryOpt;
       
  1109                     }
       
  1110                     
       
  1111                     goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
       
  1112                 }
       
  1113                 
       
  1114                 else if (*previous == OP_ASCII_CHAR || *previous == OP_ASCII_LETTER_IGNORING_CASE) {
       
  1115                     c = previous[1];
       
  1116                     if (repeatMin > 1)
       
  1117                         reqByte = c | reqCaseOpt | cd.reqVaryOpt;
       
  1118                     goto OUTPUT_SINGLE_REPEAT;
       
  1119                 }
       
  1120                 
       
  1121                 /* If previous was a single negated character ([^a] or similar), we use
       
  1122                  one of the special opcodes, replacing it. The code is shared with single-
       
  1123                  character repeats by setting opt_type to add a suitable offset into
       
  1124                  repeatType. OP_NOT is currently used only for single-byte chars. */
       
  1125                 
       
  1126                 else if (*previous == OP_NOT) {
       
  1127                     opType = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
       
  1128                     c = previous[1];
       
  1129                     goto OUTPUT_SINGLE_REPEAT;
       
  1130                 }
       
  1131                 
       
  1132                 /* If previous was a character type match (\d or similar), abolish it and
       
  1133                  create a suitable repeat item. The code is shared with single-character
       
  1134                  repeats by setting opType to add a suitable offset into repeatType. */
       
  1135                 
       
  1136                 else if (*previous <= OP_NOT_NEWLINE) {
       
  1137                     opType = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
       
  1138                     c = *previous;
       
  1139                     
       
  1140                 OUTPUT_SINGLE_REPEAT:
       
  1141                     int prop_type = -1;
       
  1142                     int prop_value = -1;
       
  1143                     
       
  1144                     unsigned char* oldcode = code;
       
  1145                     code = previous;                  /* Usually overwrite previous item */
       
  1146                     
       
  1147                     /* If the maximum is zero then the minimum must also be zero; Perl allows
       
  1148                      this case, so we do too - by simply omitting the item altogether. */
       
  1149                     
       
  1150                     if (repeat_max == 0)
       
  1151                         goto END_REPEAT;
       
  1152                     
       
  1153                     /* Combine the opType with the repeatType */
       
  1154                     
       
  1155                     repeatType += opType;
       
  1156                     
       
  1157                     /* A minimum of zero is handled either as the special case * or ?, or as
       
  1158                      an UPTO, with the maximum given. */
       
  1159                     
       
  1160                     if (repeatMin == 0) {
       
  1161                         if (repeat_max == -1)
       
  1162                             *code++ = OP_STAR + repeatType;
       
  1163                         else if (repeat_max == 1)
       
  1164                             *code++ = OP_QUERY + repeatType;
       
  1165                         else {
       
  1166                             *code++ = OP_UPTO + repeatType;
       
  1167                             put2ByteValueAndAdvance(code, repeat_max);
       
  1168                         }
       
  1169                     }
       
  1170                     
       
  1171                     /* A repeat minimum of 1 is optimized into some special cases. If the
       
  1172                      maximum is unlimited, we use OP_PLUS. Otherwise, the original item it
       
  1173                      left in place and, if the maximum is greater than 1, we use OP_UPTO with
       
  1174                      one less than the maximum. */
       
  1175                     
       
  1176                     else if (repeatMin == 1) {
       
  1177                         if (repeat_max == -1)
       
  1178                             *code++ = OP_PLUS + repeatType;
       
  1179                         else {
       
  1180                             code = oldcode;                 /* leave previous item in place */
       
  1181                             if (repeat_max == 1)
       
  1182                                 goto END_REPEAT;
       
  1183                             *code++ = OP_UPTO + repeatType;
       
  1184                             put2ByteValueAndAdvance(code, repeat_max - 1);
       
  1185                         }
       
  1186                     }
       
  1187                     
       
  1188                     /* The case {n,n} is just an EXACT, while the general case {n,m} is
       
  1189                      handled as an EXACT followed by an UPTO. */
       
  1190                     
       
  1191                     else {
       
  1192                         *code++ = OP_EXACT + opType;  /* NB EXACT doesn't have repeatType */
       
  1193                         put2ByteValueAndAdvance(code, repeatMin);
       
  1194                         
       
  1195                         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
       
  1196                          we have to insert the character for the previous code. For a repeated
       
  1197                          Unicode property match, there are two extra bytes that define the
       
  1198                          required property. In UTF-8 mode, long characters have their length in
       
  1199                          c, with the 0x80 bit as a flag. */
       
  1200                         
       
  1201                         if (repeat_max < 0) {
       
  1202                             if (c >= 128) {
       
  1203                                 memcpy(code, utf8_char, c & 7);
       
  1204                                 code += c & 7;
       
  1205                             } else {
       
  1206                                 *code++ = c;
       
  1207                                 if (prop_type >= 0) {
       
  1208                                     *code++ = prop_type;
       
  1209                                     *code++ = prop_value;
       
  1210                                 }
       
  1211                             }
       
  1212                             *code++ = OP_STAR + repeatType;
       
  1213                         }
       
  1214                         
       
  1215                         /* Else insert an UPTO if the max is greater than the min, again
       
  1216                          preceded by the character, for the previously inserted code. */
       
  1217                         
       
  1218                         else if (repeat_max != repeatMin) {
       
  1219                             if (c >= 128) {
       
  1220                                 memcpy(code, utf8_char, c & 7);
       
  1221                                 code += c & 7;
       
  1222                             } else
       
  1223                                 *code++ = c;
       
  1224                             if (prop_type >= 0) {
       
  1225                                 *code++ = prop_type;
       
  1226                                 *code++ = prop_value;
       
  1227                             }
       
  1228                             repeat_max -= repeatMin;
       
  1229                             *code++ = OP_UPTO + repeatType;
       
  1230                             put2ByteValueAndAdvance(code, repeat_max);
       
  1231                         }
       
  1232                     }
       
  1233                     
       
  1234                     /* The character or character type itself comes last in all cases. */
       
  1235                     
       
  1236                     if (c >= 128) {
       
  1237                         memcpy(code, utf8_char, c & 7);
       
  1238                         code += c & 7;
       
  1239                     } else
       
  1240                         *code++ = c;
       
  1241                     
       
  1242                     /* For a repeated Unicode property match, there are two extra bytes that
       
  1243                      define the required property. */
       
  1244                     
       
  1245                     if (prop_type >= 0) {
       
  1246                         *code++ = prop_type;
       
  1247                         *code++ = prop_value;
       
  1248                     }
       
  1249                 }
       
  1250                 
       
  1251                 /* If previous was a character class or a back reference, we put the repeat
       
  1252                  stuff after it, but just skip the item if the repeat was {0,0}. */
       
  1253                 
       
  1254                 else if (*previous == OP_CLASS ||
       
  1255                          *previous == OP_NCLASS ||
       
  1256                          *previous == OP_XCLASS ||
       
  1257                          *previous == OP_REF)
       
  1258                 {
       
  1259                     if (repeat_max == 0) {
       
  1260                         code = previous;
       
  1261                         goto END_REPEAT;
       
  1262                     }
       
  1263                     
       
  1264                     if (repeatMin == 0 && repeat_max == -1)
       
  1265                         *code++ = OP_CRSTAR + repeatType;
       
  1266                     else if (repeatMin == 1 && repeat_max == -1)
       
  1267                         *code++ = OP_CRPLUS + repeatType;
       
  1268                     else if (repeatMin == 0 && repeat_max == 1)
       
  1269                         *code++ = OP_CRQUERY + repeatType;
       
  1270                     else {
       
  1271                         *code++ = OP_CRRANGE + repeatType;
       
  1272                         put2ByteValueAndAdvance(code, repeatMin);
       
  1273                         if (repeat_max == -1)
       
  1274                             repeat_max = 0;  /* 2-byte encoding for max */
       
  1275                         put2ByteValueAndAdvance(code, repeat_max);
       
  1276                     }
       
  1277                 }
       
  1278                 
       
  1279                 /* If previous was a bracket group, we may have to replicate it in certain
       
  1280                  cases. */
       
  1281                 
       
  1282                 else if (*previous >= OP_BRA) {
       
  1283                     int ketoffset = 0;
       
  1284                     int len = code - previous;
       
  1285                     unsigned char* bralink = NULL;
       
  1286                     
       
  1287                     /* If the maximum repeat count is unlimited, find the end of the bracket
       
  1288                      by scanning through from the start, and compute the offset back to it
       
  1289                      from the current code pointer. There may be an OP_OPT setting following
       
  1290                      the final KET, so we can't find the end just by going back from the code
       
  1291                      pointer. */
       
  1292                     
       
  1293                     if (repeat_max == -1) {
       
  1294                         const unsigned char* ket = previous;
       
  1295                         advanceToEndOfBracket(ket);
       
  1296                         ketoffset = code - ket;
       
  1297                     }
       
  1298                     
       
  1299                     /* The case of a zero minimum is special because of the need to stick
       
  1300                      OP_BRAZERO in front of it, and because the group appears once in the
       
  1301                      data, whereas in other cases it appears the minimum number of times. For
       
  1302                      this reason, it is simplest to treat this case separately, as otherwise
       
  1303                      the code gets far too messy. There are several special subcases when the
       
  1304                      minimum is zero. */
       
  1305                     
       
  1306                     if (repeatMin == 0) {
       
  1307                         /* If the maximum is also zero, we just omit the group from the output
       
  1308                          altogether. */
       
  1309                         
       
  1310                         if (repeat_max == 0) {
       
  1311                             code = previous;
       
  1312                             goto END_REPEAT;
       
  1313                         }
       
  1314                         
       
  1315                         /* If the maximum is 1 or unlimited, we just have to stick in the
       
  1316                          BRAZERO and do no more at this point. However, we do need to adjust
       
  1317                          any OP_RECURSE calls inside the group that refer to the group itself or
       
  1318                          any internal group, because the offset is from the start of the whole
       
  1319                          regex. Temporarily terminate the pattern while doing this. */
       
  1320                         
       
  1321                         if (repeat_max <= 1) {
       
  1322                             *code = OP_END;
       
  1323                             memmove(previous+1, previous, len);
       
  1324                             code++;
       
  1325                             *previous++ = OP_BRAZERO + repeatType;
       
  1326                         }
       
  1327                         
       
  1328                         /* If the maximum is greater than 1 and limited, we have to replicate
       
  1329                          in a nested fashion, sticking OP_BRAZERO before each set of brackets.
       
  1330                          The first one has to be handled carefully because it's the original
       
  1331                          copy, which has to be moved up. The remainder can be handled by code
       
  1332                          that is common with the non-zero minimum case below. We have to
       
  1333                          adjust the value of repeat_max, since one less copy is required. */
       
  1334                         
       
  1335                         else {
       
  1336                             *code = OP_END;
       
  1337                             memmove(previous + 2 + LINK_SIZE, previous, len);
       
  1338                             code += 2 + LINK_SIZE;
       
  1339                             *previous++ = OP_BRAZERO + repeatType;
       
  1340                             *previous++ = OP_BRA;
       
  1341                             
       
  1342                             /* We chain together the bracket offset fields that have to be
       
  1343                              filled in later when the ends of the brackets are reached. */
       
  1344                             
       
  1345                             int offset = (!bralink) ? 0 : previous - bralink;
       
  1346                             bralink = previous;
       
  1347                             putLinkValueAllowZeroAndAdvance(previous, offset);
       
  1348                         }
       
  1349                         
       
  1350                         repeat_max--;
       
  1351                     }
       
  1352                     
       
  1353                     /* If the minimum is greater than zero, replicate the group as many
       
  1354                      times as necessary, and adjust the maximum to the number of subsequent
       
  1355                      copies that we need. If we set a first char from the group, and didn't
       
  1356                      set a required char, copy the latter from the former. */
       
  1357                     
       
  1358                     else {
       
  1359                         if (repeatMin > 1) {
       
  1360                             if (didGroupSetFirstByte && reqByte < 0)
       
  1361                                 reqByte = firstByte;
       
  1362                             for (int i = 1; i < repeatMin; i++) {
       
  1363                                 memcpy(code, previous, len);
       
  1364                                 code += len;
       
  1365                             }
       
  1366                         }
       
  1367                         if (repeat_max > 0)
       
  1368                             repeat_max -= repeatMin;
       
  1369                     }
       
  1370                     
       
  1371                     /* This code is common to both the zero and non-zero minimum cases. If
       
  1372                      the maximum is limited, it replicates the group in a nested fashion,
       
  1373                      remembering the bracket starts on a stack. In the case of a zero minimum,
       
  1374                      the first one was set up above. In all cases the repeat_max now specifies
       
  1375                      the number of additional copies needed. */
       
  1376                     
       
  1377                     if (repeat_max >= 0) {
       
  1378                         for (int i = repeat_max - 1; i >= 0; i--) {
       
  1379                             *code++ = OP_BRAZERO + repeatType;
       
  1380                             
       
  1381                             /* All but the final copy start a new nesting, maintaining the
       
  1382                              chain of brackets outstanding. */
       
  1383                             
       
  1384                             if (i != 0) {
       
  1385                                 *code++ = OP_BRA;
       
  1386                                 int offset = (!bralink) ? 0 : code - bralink;
       
  1387                                 bralink = code;
       
  1388                                 putLinkValueAllowZeroAndAdvance(code, offset);
       
  1389                             }
       
  1390                             
       
  1391                             memcpy(code, previous, len);
       
  1392                             code += len;
       
  1393                         }
       
  1394                         
       
  1395                         /* Now chain through the pending brackets, and fill in their length
       
  1396                          fields (which are holding the chain links pro tem). */
       
  1397                         
       
  1398                         while (bralink) {
       
  1399                             int offset = code - bralink + 1;
       
  1400                             unsigned char* bra = code - offset;
       
  1401                             int oldlinkoffset = getLinkValueAllowZero(bra + 1);
       
  1402                             bralink = (!oldlinkoffset) ? 0 : bralink - oldlinkoffset;
       
  1403                             *code++ = OP_KET;
       
  1404                             putLinkValueAndAdvance(code, offset);
       
  1405                             putLinkValue(bra + 1, offset);
       
  1406                         }
       
  1407                     }
       
  1408                     
       
  1409                     /* If the maximum is unlimited, set a repeater in the final copy. We
       
  1410                      can't just offset backwards from the current code point, because we
       
  1411                      don't know if there's been an options resetting after the ket. The
       
  1412                      correct offset was computed above. */
       
  1413                     
       
  1414                     else
       
  1415                         code[-ketoffset] = OP_KETRMAX + repeatType;
       
  1416                 }
       
  1417                 
       
  1418                 // A quantifier after an assertion is mostly meaningless, but it
       
  1419                 // can nullify the assertion if it has a 0 minimum.
       
  1420                 else if (*previous == OP_ASSERT || *previous == OP_ASSERT_NOT) {
       
  1421                     if (repeatMin == 0) {
       
  1422                         code = previous;
       
  1423                         goto END_REPEAT;
       
  1424                     }
       
  1425                 }
       
  1426                 
       
  1427                 /* Else there's some kind of shambles */
       
  1428                 
       
  1429                 else {
       
  1430                     *errorCodePtr = ERR11;
       
  1431                     goto FAILED;
       
  1432                 }
       
  1433                 
       
  1434                 /* In all case we no longer have a previous item. We also set the
       
  1435                  "follows varying string" flag for subsequently encountered reqbytes if
       
  1436                  it isn't already set and we have just passed a varying length item. */
       
  1437                 
       
  1438             END_REPEAT:
       
  1439                 previous = NULL;
       
  1440                 cd.reqVaryOpt |= reqvary;
       
  1441                 break;
       
  1442                 
       
  1443             /* Start of nested bracket sub-expression, or comment or lookahead or
       
  1444              lookbehind or option setting or condition. First deal with special things
       
  1445              that can come after a bracket; all are introduced by ?, and the appearance
       
  1446              of any of them means that this is not a referencing group. They were
       
  1447              checked for validity in the first pass over the string, so we don't have to
       
  1448              check for syntax errors here.  */
       
  1449                 
       
  1450             case '(':
       
  1451                 skipBytes = 0;
       
  1452                 
       
  1453                 if (*(++ptr) == '?') {
       
  1454                     switch (*(++ptr)) {
       
  1455                         case ':':                 /* Non-extracting bracket */
       
  1456                             bravalue = OP_BRA;
       
  1457                             ptr++;
       
  1458                             break;
       
  1459                             
       
  1460                         case '=':                 /* Positive lookahead */
       
  1461                             bravalue = OP_ASSERT;
       
  1462                             ptr++;
       
  1463                             break;
       
  1464                             
       
  1465                         case '!':                 /* Negative lookahead */
       
  1466                             bravalue = OP_ASSERT_NOT;
       
  1467                             ptr++;
       
  1468                             break;
       
  1469                             
       
  1470                         /* Character after (? not specially recognized */
       
  1471                             
       
  1472                         default:
       
  1473                             *errorCodePtr = ERR12;
       
  1474                             goto FAILED;
       
  1475                         }
       
  1476                 }
       
  1477                 
       
  1478                 /* Else we have a referencing group; adjust the opcode. If the bracket
       
  1479                  number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
       
  1480                  arrange for the true number to follow later, in an OP_BRANUMBER item. */
       
  1481                 
       
  1482                 else {
       
  1483                     if (++(*brackets) > EXTRACT_BASIC_MAX) {
       
  1484                         bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
       
  1485                         code[1 + LINK_SIZE] = OP_BRANUMBER;
       
  1486                         put2ByteValue(code + 2 + LINK_SIZE, *brackets);
       
  1487                         skipBytes = 3;
       
  1488                     }
       
  1489                     else
       
  1490                         bravalue = OP_BRA + *brackets;
       
  1491                 }
       
  1492                 
       
  1493                 /* Process nested bracketed re. We copy code into a non-variable
       
  1494                  in order to be able to pass its address because some compilers
       
  1495                  complain otherwise. Pass in a new setting for the ims options
       
  1496                  if they have changed. */
       
  1497                 
       
  1498                 previous = code;
       
  1499                 *code = bravalue;
       
  1500                 tempcode = code;
       
  1501                 tempreqvary = cd.reqVaryOpt;     /* Save value before bracket */
       
  1502                 
       
  1503                 if (!compileBracket(
       
  1504                                    options,
       
  1505                                    brackets,                     /* Extracting bracket count */
       
  1506                                    &tempcode,                    /* Where to put code (updated) */
       
  1507                                    &ptr,                         /* Input pointer (updated) */
       
  1508                                    patternEnd,
       
  1509                                    errorCodePtr,                 /* Where to put an error message */
       
  1510                                    skipBytes,                    /* Skip over OP_BRANUMBER */
       
  1511                                    &subFirstByte,                /* For possible first char */
       
  1512                                    &subReqByte,                  /* For possible last char */
       
  1513                                    cd))                          /* Tables block */
       
  1514                     goto FAILED;
       
  1515                 
       
  1516                 /* At the end of compiling, code is still pointing to the start of the
       
  1517                  group, while tempcode has been updated to point past the end of the group
       
  1518                  and any option resetting that may follow it. The pattern pointer (ptr)
       
  1519                  is on the bracket. */
       
  1520                 
       
  1521                 /* Handle updating of the required and first characters. Update for normal
       
  1522                  brackets of all kinds, and conditions with two branches (see code above).
       
  1523                  If the bracket is followed by a quantifier with zero repeat, we have to
       
  1524                  back off. Hence the definition of zeroReqByte and zeroFirstByte outside the
       
  1525                  main loop so that they can be accessed for the back off. */
       
  1526                 
       
  1527                 zeroReqByte = reqByte;
       
  1528                 zeroFirstByte = firstByte;
       
  1529                 didGroupSetFirstByte = false;
       
  1530                 
       
  1531                 if (bravalue >= OP_BRA) {
       
  1532                     /* If we have not yet set a firstByte in this branch, take it from the
       
  1533                      subpattern, remembering that it was set here so that a repeat of more
       
  1534                      than one can replicate it as reqByte if necessary. If the subpattern has
       
  1535                      no firstByte, set "none" for the whole branch. In both cases, a zero
       
  1536                      repeat forces firstByte to "none". */
       
  1537                     
       
  1538                     if (firstByte == REQ_UNSET) {
       
  1539                         if (subFirstByte >= 0) {
       
  1540                             firstByte = subFirstByte;
       
  1541                             didGroupSetFirstByte = true;
       
  1542                         }
       
  1543                         else
       
  1544                             firstByte = REQ_NONE;
       
  1545                         zeroFirstByte = REQ_NONE;
       
  1546                     }
       
  1547                     
       
  1548                     /* If firstByte was previously set, convert the subpattern's firstByte
       
  1549                      into reqByte if there wasn't one, using the vary flag that was in
       
  1550                      existence beforehand. */
       
  1551                     
       
  1552                     else if (subFirstByte >= 0 && subReqByte < 0)
       
  1553                         subReqByte = subFirstByte | tempreqvary;
       
  1554                     
       
  1555                     /* If the subpattern set a required byte (or set a first byte that isn't
       
  1556                      really the first byte - see above), set it. */
       
  1557                     
       
  1558                     if (subReqByte >= 0)
       
  1559                         reqByte = subReqByte;
       
  1560                 }
       
  1561                 
       
  1562                 /* For a forward assertion, we take the reqByte, if set. This can be
       
  1563                  helpful if the pattern that follows the assertion doesn't set a different
       
  1564                  char. For example, it's useful for /(?=abcde).+/. We can't set firstByte
       
  1565                  for an assertion, however because it leads to incorrect effect for patterns
       
  1566                  such as /(?=a)a.+/ when the "real" "a" would then become a reqByte instead
       
  1567                  of a firstByte. This is overcome by a scan at the end if there's no
       
  1568                  firstByte, looking for an asserted first char. */
       
  1569                 
       
  1570                 else if (bravalue == OP_ASSERT && subReqByte >= 0)
       
  1571                     reqByte = subReqByte;
       
  1572                 
       
  1573                 /* Now update the main code pointer to the end of the group. */
       
  1574                 
       
  1575                 code = tempcode;
       
  1576                 
       
  1577                 /* Error if hit end of pattern */
       
  1578                 
       
  1579                 if (ptr >= patternEnd || *ptr != ')') {
       
  1580                     *errorCodePtr = ERR14;
       
  1581                     goto FAILED;
       
  1582                 }
       
  1583                 break;
       
  1584                 
       
  1585             /* Check \ for being a real metacharacter; if not, fall through and handle
       
  1586              it as a data character at the start of a string. Escape items are checked
       
  1587              for validity in the pre-compiling pass. */
       
  1588                 
       
  1589             case '\\':
       
  1590                 tempptr = ptr;
       
  1591                 c = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, false);
       
  1592                 
       
  1593                 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
       
  1594                  are arranged to be the negation of the corresponding OP_values. For the
       
  1595                  back references, the values are ESC_REF plus the reference number. Only
       
  1596                  back references and those types that consume a character may be repeated.
       
  1597                  We can test for values between ESC_b and ESC_w for the latter; this may
       
  1598                  have to change if any new ones are ever created. */
       
  1599                 
       
  1600                 if (c < 0) {
       
  1601                     /* For metasequences that actually match a character, we disable the
       
  1602                      setting of a first character if it hasn't already been set. */
       
  1603                     
       
  1604                     if (firstByte == REQ_UNSET && -c > ESC_b && -c <= ESC_w)
       
  1605                         firstByte = REQ_NONE;
       
  1606                     
       
  1607                     /* Set values to reset to if this is followed by a zero repeat. */
       
  1608                     
       
  1609                     zeroFirstByte = firstByte;
       
  1610                     zeroReqByte = reqByte;
       
  1611                     
       
  1612                     /* Back references are handled specially */
       
  1613                     
       
  1614                     if (-c >= ESC_REF) {
       
  1615                         int number = -c - ESC_REF;
       
  1616                         previous = code;
       
  1617                         *code++ = OP_REF;
       
  1618                         put2ByteValueAndAdvance(code, number);
       
  1619                     }
       
  1620                     
       
  1621                     /* For the rest, we can obtain the OP value by negating the escape
       
  1622                      value */
       
  1623                     
       
  1624                     else {
       
  1625                         previous = (-c > ESC_b && -c <= ESC_w) ? code : NULL;
       
  1626                         *code++ = -c;
       
  1627                     }
       
  1628                     continue;
       
  1629                 }
       
  1630                 
       
  1631                 /* Fall through. */
       
  1632                 
       
  1633                 /* Handle a literal character. It is guaranteed not to be whitespace or #
       
  1634                  when the extended flag is set. If we are in UTF-8 mode, it may be a
       
  1635                  multi-byte literal character. */
       
  1636                 
       
  1637                 default:
       
  1638             NORMAL_CHAR:
       
  1639                 
       
  1640                 previous = code;
       
  1641                 
       
  1642                 if (c < 128) {
       
  1643                     mcLength = 1;
       
  1644                     mcbuffer[0] = c;
       
  1645                     
       
  1646                     if ((options & IgnoreCaseOption) && (c | 0x20) >= 'a' && (c | 0x20) <= 'z') {
       
  1647                         *code++ = OP_ASCII_LETTER_IGNORING_CASE;
       
  1648                         *code++ = c | 0x20;
       
  1649                     } else {
       
  1650                         *code++ = OP_ASCII_CHAR;
       
  1651                         *code++ = c;
       
  1652                     }
       
  1653                 } else {
       
  1654                     mcLength = encodeUTF8(c, mcbuffer);
       
  1655                     
       
  1656                     *code++ = (options & IgnoreCaseOption) ? OP_CHAR_IGNORING_CASE : OP_CHAR;
       
  1657                     for (c = 0; c < mcLength; c++)
       
  1658                         *code++ = mcbuffer[c];
       
  1659                 }
       
  1660                 
       
  1661                 /* Set the first and required bytes appropriately. If no previous first
       
  1662                  byte, set it from this character, but revert to none on a zero repeat.
       
  1663                  Otherwise, leave the firstByte value alone, and don't change it on a zero
       
  1664                  repeat. */
       
  1665                 
       
  1666                 if (firstByte == REQ_UNSET) {
       
  1667                     zeroFirstByte = REQ_NONE;
       
  1668                     zeroReqByte = reqByte;
       
  1669                     
       
  1670                     /* If the character is more than one byte long, we can set firstByte
       
  1671                      only if it is not to be matched caselessly. */
       
  1672                     
       
  1673                     if (mcLength == 1 || reqCaseOpt == 0) {
       
  1674                         firstByte = mcbuffer[0] | reqCaseOpt;
       
  1675                         if (mcLength != 1)
       
  1676                             reqByte = code[-1] | cd.reqVaryOpt;
       
  1677                     }
       
  1678                     else
       
  1679                         firstByte = reqByte = REQ_NONE;
       
  1680                 }
       
  1681                 
       
  1682                 /* firstByte was previously set; we can set reqByte only the length is
       
  1683                  1 or the matching is caseful. */
       
  1684                 
       
  1685                 else {
       
  1686                     zeroFirstByte = firstByte;
       
  1687                     zeroReqByte = reqByte;
       
  1688                     if (mcLength == 1 || reqCaseOpt == 0)
       
  1689                         reqByte = code[-1] | reqCaseOpt | cd.reqVaryOpt;
       
  1690                 }
       
  1691                 
       
  1692                 break;            /* End of literal character handling */
       
  1693         }
       
  1694     }                   /* end of big loop */
       
  1695     
       
  1696     /* Control never reaches here by falling through, only by a goto for all the
       
  1697      error states. Pass back the position in the pattern so that it can be displayed
       
  1698      to the user for diagnosing the error. */
       
  1699     
       
  1700 FAILED:
       
  1701     *ptrPtr = ptr;
       
  1702     return false;
       
  1703 }
       
  1704 
       
  1705 /*************************************************
       
  1706 *     Compile sequence of alternatives           *
       
  1707 *************************************************/
       
  1708 
       
  1709 /* On entry, ptr is pointing past the bracket character, but on return
       
  1710 it points to the closing bracket, or vertical bar, or end of string.
       
  1711 The code variable is pointing at the byte into which the BRA operator has been
       
  1712 stored. If the ims options are changed at the start (for a (?ims: group) or
       
  1713 during any branch, we need to insert an OP_OPT item at the start of every
       
  1714 following branch to ensure they get set correctly at run time, and also pass
       
  1715 the new options into every subsequent branch compile.
       
  1716 
       
  1717 Argument:
       
  1718   options        option bits, including any changes for this subpattern
       
  1719   brackets       -> int containing the number of extracting brackets used
       
  1720   codePtr        -> the address of the current code pointer
       
  1721   ptrPtr         -> the address of the current pattern pointer
       
  1722   errorCodePtr   -> pointer to error code variable
       
  1723   skipBytes      skip this many bytes at start (for OP_BRANUMBER)
       
  1724   firstbyteptr   place to put the first required character, or a negative number
       
  1725   reqbyteptr     place to put the last required character, or a negative number
       
  1726   cd             points to the data block with tables pointers etc.
       
  1727 
       
  1728 Returns:      true on success
       
  1729 */
       
  1730 
       
  1731 static bool
       
  1732 compileBracket(int options, int* brackets, unsigned char** codePtr,
       
  1733     const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int skipBytes,
       
  1734     int* firstbyteptr, int* reqbyteptr, CompileData& cd)
       
  1735 {
       
  1736     const UChar* ptr = *ptrPtr;
       
  1737     unsigned char* code = *codePtr;
       
  1738     unsigned char* lastBranch = code;
       
  1739     unsigned char* start_bracket = code;
       
  1740     int firstByte = REQ_UNSET;
       
  1741     int reqByte = REQ_UNSET;
       
  1742     
       
  1743     /* Offset is set zero to mark that this bracket is still open */
       
  1744     
       
  1745     putLinkValueAllowZero(code + 1, 0);
       
  1746     code += 1 + LINK_SIZE + skipBytes;
       
  1747     
       
  1748     /* Loop for each alternative branch */
       
  1749     
       
  1750     while (true) {
       
  1751         /* Now compile the branch */
       
  1752         
       
  1753         int branchFirstByte;
       
  1754         int branchReqByte;
       
  1755         if (!compileBranch(options, brackets, &code, &ptr, patternEnd, errorCodePtr,
       
  1756                             &branchFirstByte, &branchReqByte, cd)) {
       
  1757             *ptrPtr = ptr;
       
  1758             return false;
       
  1759         }
       
  1760         
       
  1761         /* If this is the first branch, the firstByte and reqByte values for the
       
  1762          branch become the values for the regex. */
       
  1763         
       
  1764         if (*lastBranch != OP_ALT) {
       
  1765             firstByte = branchFirstByte;
       
  1766             reqByte = branchReqByte;
       
  1767         }
       
  1768         
       
  1769         /* If this is not the first branch, the first char and reqByte have to
       
  1770          match the values from all the previous branches, except that if the previous
       
  1771          value for reqByte didn't have REQ_VARY set, it can still match, and we set
       
  1772          REQ_VARY for the regex. */
       
  1773         
       
  1774         else {
       
  1775             /* If we previously had a firstByte, but it doesn't match the new branch,
       
  1776              we have to abandon the firstByte for the regex, but if there was previously
       
  1777              no reqByte, it takes on the value of the old firstByte. */
       
  1778             
       
  1779             if (firstByte >= 0 && firstByte != branchFirstByte) {
       
  1780                 if (reqByte < 0)
       
  1781                     reqByte = firstByte;
       
  1782                 firstByte = REQ_NONE;
       
  1783             }
       
  1784             
       
  1785             /* If we (now or from before) have no firstByte, a firstByte from the
       
  1786              branch becomes a reqByte if there isn't a branch reqByte. */
       
  1787             
       
  1788             if (firstByte < 0 && branchFirstByte >= 0 && branchReqByte < 0)
       
  1789                 branchReqByte = branchFirstByte;
       
  1790             
       
  1791             /* Now ensure that the reqbytes match */
       
  1792             
       
  1793             if ((reqByte & ~REQ_VARY) != (branchReqByte & ~REQ_VARY))
       
  1794                 reqByte = REQ_NONE;
       
  1795             else
       
  1796                 reqByte |= branchReqByte;   /* To "or" REQ_VARY */
       
  1797         }
       
  1798         
       
  1799         /* Reached end of expression, either ')' or end of pattern. Go back through
       
  1800          the alternative branches and reverse the chain of offsets, with the field in
       
  1801          the BRA item now becoming an offset to the first alternative. If there are
       
  1802          no alternatives, it points to the end of the group. The length in the
       
  1803          terminating ket is always the length of the whole bracketed item. If any of
       
  1804          the ims options were changed inside the group, compile a resetting op-code
       
  1805          following, except at the very end of the pattern. Return leaving the pointer
       
  1806          at the terminating char. */
       
  1807         
       
  1808         if (ptr >= patternEnd || *ptr != '|') {
       
  1809             int length = code - lastBranch;
       
  1810             do {
       
  1811                 int prevLength = getLinkValueAllowZero(lastBranch + 1);
       
  1812                 putLinkValue(lastBranch + 1, length);
       
  1813                 length = prevLength;
       
  1814                 lastBranch -= length;
       
  1815             } while (length > 0);
       
  1816             
       
  1817             /* Fill in the ket */
       
  1818             
       
  1819             *code = OP_KET;
       
  1820             putLinkValue(code + 1, code - start_bracket);
       
  1821             code += 1 + LINK_SIZE;
       
  1822             
       
  1823             /* Set values to pass back */
       
  1824             
       
  1825             *codePtr = code;
       
  1826             *ptrPtr = ptr;
       
  1827             *firstbyteptr = firstByte;
       
  1828             *reqbyteptr = reqByte;
       
  1829             return true;
       
  1830         }
       
  1831         
       
  1832         /* Another branch follows; insert an "or" node. Its length field points back
       
  1833          to the previous branch while the bracket remains open. At the end the chain
       
  1834          is reversed. It's done like this so that the start of the bracket has a
       
  1835          zero offset until it is closed, making it possible to detect recursion. */
       
  1836         
       
  1837         *code = OP_ALT;
       
  1838         putLinkValue(code + 1, code - lastBranch);
       
  1839         lastBranch = code;
       
  1840         code += 1 + LINK_SIZE;
       
  1841         ptr++;
       
  1842     }
       
  1843     ASSERT_NOT_REACHED();
       
  1844 }
       
  1845 
       
  1846 /*************************************************
       
  1847 *          Check for anchored expression         *
       
  1848 *************************************************/
       
  1849 
       
  1850 /* Try to find out if this is an anchored regular expression. Consider each
       
  1851 alternative branch. If they all start OP_CIRC, or with a bracket
       
  1852 all of whose alternatives start OP_CIRC (recurse ad lib), then
       
  1853 it's anchored.
       
  1854 
       
  1855 Arguments:
       
  1856   code          points to start of expression (the bracket)
       
  1857   captureMap    a bitmap of which brackets we are inside while testing; this
       
  1858                  handles up to substring 31; all brackets after that share
       
  1859                  the zero bit
       
  1860   backrefMap    the back reference bitmap
       
  1861 */
       
  1862 
       
  1863 static bool branchIsAnchored(const unsigned char* code)
       
  1864 {
       
  1865     const unsigned char* scode = firstSignificantOpcode(code);
       
  1866     int op = *scode;
       
  1867 
       
  1868     /* Brackets */
       
  1869     if (op >= OP_BRA || op == OP_ASSERT)
       
  1870         return bracketIsAnchored(scode);
       
  1871 
       
  1872     /* Check for explicit anchoring */    
       
  1873     return op == OP_CIRC;
       
  1874 }
       
  1875 
       
  1876 static bool bracketIsAnchored(const unsigned char* code)
       
  1877 {
       
  1878     do {
       
  1879         if (!branchIsAnchored(code + 1 + LINK_SIZE))
       
  1880             return false;
       
  1881         code += getLinkValue(code + 1);
       
  1882     } while (*code == OP_ALT);   /* Loop for each alternative */
       
  1883     return true;
       
  1884 }
       
  1885 
       
  1886 /*************************************************
       
  1887 *         Check for starting with ^ or .*        *
       
  1888 *************************************************/
       
  1889 
       
  1890 /* This is called to find out if every branch starts with ^ or .* so that
       
  1891 "first char" processing can be done to speed things up in multiline
       
  1892 matching and for non-DOTALL patterns that start with .* (which must start at
       
  1893 the beginning or after \n)
       
  1894 
       
  1895 Except when the .* appears inside capturing parentheses, and there is a
       
  1896 subsequent back reference to those parentheses. By keeping a bitmap of the
       
  1897 first 31 back references, we can catch some of the more common cases more
       
  1898 precisely; all the greater back references share a single bit.
       
  1899 
       
  1900 Arguments:
       
  1901   code          points to start of expression (the bracket)
       
  1902   captureMap    a bitmap of which brackets we are inside while testing; this
       
  1903                  handles up to substring 31; all brackets after that share
       
  1904                  the zero bit
       
  1905   backrefMap    the back reference bitmap
       
  1906 */
       
  1907 
       
  1908 static bool branchNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
       
  1909 {
       
  1910     const unsigned char* scode = firstSignificantOpcode(code);
       
  1911     int op = *scode;
       
  1912     
       
  1913     /* Capturing brackets */
       
  1914     if (op > OP_BRA) {
       
  1915         int captureNum = op - OP_BRA;
       
  1916         if (captureNum > EXTRACT_BASIC_MAX)
       
  1917             captureNum = get2ByteValue(scode + 2 + LINK_SIZE);
       
  1918         int bracketMask = (captureNum < 32) ? (1 << captureNum) : 1;
       
  1919         return bracketNeedsLineStart(scode, captureMap | bracketMask, backrefMap);
       
  1920     }
       
  1921     
       
  1922     /* Other brackets */
       
  1923     if (op == OP_BRA || op == OP_ASSERT)
       
  1924         return bracketNeedsLineStart(scode, captureMap, backrefMap);
       
  1925     
       
  1926     /* .* means "start at start or after \n" if it isn't in brackets that
       
  1927      may be referenced. */
       
  1928     
       
  1929     if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
       
  1930         return scode[1] == OP_NOT_NEWLINE && !(captureMap & backrefMap);
       
  1931 
       
  1932     /* Explicit ^ */
       
  1933     return op == OP_CIRC || op == OP_BOL;
       
  1934 }
       
  1935 
       
  1936 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
       
  1937 {
       
  1938     do {
       
  1939         if (!branchNeedsLineStart(code + 1 + LINK_SIZE, captureMap, backrefMap))
       
  1940             return false;
       
  1941         code += getLinkValue(code + 1);
       
  1942     } while (*code == OP_ALT);  /* Loop for each alternative */
       
  1943     return true;
       
  1944 }
       
  1945 
       
  1946 /*************************************************
       
  1947 *       Check for asserted fixed first char      *
       
  1948 *************************************************/
       
  1949 
       
  1950 /* During compilation, the "first char" settings from forward assertions are
       
  1951 discarded, because they can cause conflicts with actual literals that follow.
       
  1952 However, if we end up without a first char setting for an unanchored pattern,
       
  1953 it is worth scanning the regex to see if there is an initial asserted first
       
  1954 char. If all branches start with the same asserted char, or with a bracket all
       
  1955 of whose alternatives start with the same asserted char (recurse ad lib), then
       
  1956 we return that char, otherwise -1.
       
  1957 
       
  1958 Arguments:
       
  1959   code       points to start of expression (the bracket)
       
  1960   options    pointer to the options (used to check casing changes)
       
  1961   inassert   true if in an assertion
       
  1962 
       
  1963 Returns:     -1 or the fixed first char
       
  1964 */
       
  1965 
       
  1966 static int branchFindFirstAssertedCharacter(const unsigned char* code, bool inassert)
       
  1967 {
       
  1968     const unsigned char* scode = firstSignificantOpcodeSkippingAssertions(code);
       
  1969     int op = *scode;
       
  1970     
       
  1971     if (op >= OP_BRA)
       
  1972         op = OP_BRA;
       
  1973     
       
  1974     switch (op) {
       
  1975         default:
       
  1976             return -1;
       
  1977             
       
  1978         case OP_BRA:
       
  1979         case OP_ASSERT:
       
  1980             return bracketFindFirstAssertedCharacter(scode, op == OP_ASSERT);
       
  1981 
       
  1982         case OP_EXACT:
       
  1983             scode += 2;
       
  1984             /* Fall through */
       
  1985 
       
  1986         case OP_CHAR:
       
  1987         case OP_CHAR_IGNORING_CASE:
       
  1988         case OP_ASCII_CHAR:
       
  1989         case OP_ASCII_LETTER_IGNORING_CASE:
       
  1990         case OP_PLUS:
       
  1991         case OP_MINPLUS:
       
  1992             if (!inassert)
       
  1993                 return -1;
       
  1994             return scode[1];
       
  1995     }
       
  1996 }
       
  1997 
       
  1998 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert)
       
  1999 {
       
  2000     int c = -1;
       
  2001     do {
       
  2002         int d = branchFindFirstAssertedCharacter(code + 1 + LINK_SIZE, inassert);
       
  2003         if (d < 0)
       
  2004             return -1;
       
  2005         if (c < 0)
       
  2006             c = d;
       
  2007         else if (c != d)
       
  2008             return -1;
       
  2009         code += getLinkValue(code + 1);
       
  2010     } while (*code == OP_ALT);
       
  2011     return c;
       
  2012 }
       
  2013 
       
  2014 static inline int multiplyWithOverflowCheck(int a, int b)
       
  2015 {
       
  2016     if (!a || !b)
       
  2017         return 0;
       
  2018     if (a > MAX_PATTERN_SIZE / b)
       
  2019         return -1;
       
  2020     return a * b;
       
  2021 }
       
  2022 
       
  2023 static int calculateCompiledPatternLength(const UChar* pattern, int patternLength, JSRegExpIgnoreCaseOption ignoreCase,
       
  2024     CompileData& cd, ErrorCode& errorcode)
       
  2025 {
       
  2026     /* Make a pass over the pattern to compute the
       
  2027      amount of store required to hold the compiled code. This does not have to be
       
  2028      perfect as long as errors are overestimates. */
       
  2029 
       
  2030     if (patternLength > MAX_PATTERN_SIZE) {
       
  2031         errorcode = ERR16;
       
  2032         return -1;
       
  2033     }
       
  2034 
       
  2035     int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
       
  2036     int branch_extra = 0;
       
  2037     int lastitemlength = 0;
       
  2038     unsigned brastackptr = 0;
       
  2039     FixedArray<int, BRASTACK_SIZE> brastack;
       
  2040     FixedArray<unsigned char, BRASTACK_SIZE> bralenstack;
       
  2041     int bracount = 0;
       
  2042     
       
  2043     const UChar* ptr = (const UChar*)(pattern - 1);
       
  2044     const UChar* patternEnd = (const UChar*)(pattern + patternLength);
       
  2045     
       
  2046     while (++ptr < patternEnd) {
       
  2047         int minRepeats = 0, maxRepeats = 0;
       
  2048         int c = *ptr;
       
  2049 
       
  2050         switch (c) {
       
  2051             /* A backslashed item may be an escaped data character or it may be a
       
  2052              character type. */
       
  2053 
       
  2054             case '\\':
       
  2055                 c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, false);
       
  2056                 if (errorcode != 0)
       
  2057                     return -1;
       
  2058                 
       
  2059                 lastitemlength = 1;     /* Default length of last item for repeats */
       
  2060                 
       
  2061                 if (c >= 0) {            /* Data character */
       
  2062                     length += 2;          /* For a one-byte character */
       
  2063                     
       
  2064                     if (c > 127) {
       
  2065                         int i;
       
  2066                         for (i = 0; i < jsc_pcre_utf8_table1_size; i++)
       
  2067                             if (c <= jsc_pcre_utf8_table1[i]) break;
       
  2068                         length += i;
       
  2069                         lastitemlength += i;
       
  2070                     }
       
  2071                     
       
  2072                     continue;
       
  2073                 }
       
  2074                 
       
  2075                 /* Other escapes need one byte */
       
  2076                 
       
  2077                 length++;
       
  2078                 
       
  2079                 /* A back reference needs an additional 2 bytes, plus either one or 5
       
  2080                  bytes for a repeat. We also need to keep the value of the highest
       
  2081                  back reference. */
       
  2082                 
       
  2083                 if (c <= -ESC_REF) {
       
  2084                     int refnum = -c - ESC_REF;
       
  2085                     cd.backrefMap |= (refnum < 32) ? (1 << refnum) : 1;
       
  2086                     if (refnum > cd.topBackref)
       
  2087                         cd.topBackref = refnum;
       
  2088                     length += 2;   /* For single back reference */
       
  2089                     if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) {
       
  2090                         ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
       
  2091                         if (errorcode)
       
  2092                             return -1;
       
  2093                         if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
       
  2094                             (minRepeats == 1 && maxRepeats == -1))
       
  2095                             length++;
       
  2096                         else
       
  2097                             length += 5;
       
  2098                         if (safelyCheckNextChar(ptr, patternEnd, '?'))
       
  2099                             ptr++;
       
  2100                     }
       
  2101                 }
       
  2102                 continue;
       
  2103                 
       
  2104             case '^':     /* Single-byte metacharacters */
       
  2105             case '.':
       
  2106             case '$':
       
  2107                 length++;
       
  2108                 lastitemlength = 1;
       
  2109                 continue;
       
  2110                 
       
  2111             case '*':            /* These repeats won't be after brackets; */
       
  2112             case '+':            /* those are handled separately */
       
  2113             case '?':
       
  2114                 length++;
       
  2115                 goto POSSESSIVE;
       
  2116                 
       
  2117             /* This covers the cases of braced repeats after a single char, metachar,
       
  2118              class, or back reference. */
       
  2119 
       
  2120             case '{':
       
  2121                 if (!isCountedRepeat(ptr + 1, patternEnd))
       
  2122                     goto NORMAL_CHAR;
       
  2123                 ptr = readRepeatCounts(ptr + 1, &minRepeats, &maxRepeats, &errorcode);
       
  2124                 if (errorcode != 0)
       
  2125                     return -1;
       
  2126                 
       
  2127                 /* These special cases just insert one extra opcode */
       
  2128                 
       
  2129                 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
       
  2130                     (minRepeats == 1 && maxRepeats == -1))
       
  2131                     length++;
       
  2132                 
       
  2133                 /* These cases might insert additional copies of a preceding character. */
       
  2134                 
       
  2135                 else {
       
  2136                     if (minRepeats != 1) {
       
  2137                         length -= lastitemlength;   /* Uncount the original char or metachar */
       
  2138                         if (minRepeats > 0)
       
  2139                             length += 3 + lastitemlength;
       
  2140                     }
       
  2141                     length += lastitemlength + ((maxRepeats > 0) ? 3 : 1);
       
  2142                 }
       
  2143                 
       
  2144                 if (safelyCheckNextChar(ptr, patternEnd, '?'))
       
  2145                     ptr++;      /* Needs no extra length */
       
  2146 
       
  2147             POSSESSIVE:                     /* Test for possessive quantifier */
       
  2148                 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
       
  2149                     ptr++;
       
  2150                     length += 2 + 2 * LINK_SIZE;   /* Allow for atomic brackets */
       
  2151                 }
       
  2152                 continue;
       
  2153                 
       
  2154             /* An alternation contains an offset to the next branch or ket. If any ims
       
  2155              options changed in the previous branch(es), and/or if we are in a
       
  2156              lookbehind assertion, extra space will be needed at the start of the
       
  2157              branch. This is handled by branch_extra. */
       
  2158                 
       
  2159             case '|':
       
  2160                 if (brastackptr == 0)
       
  2161                     cd.needOuterBracket = true;
       
  2162                 length += 1 + LINK_SIZE + branch_extra;
       
  2163                 continue;
       
  2164                 
       
  2165             /* A character class uses 33 characters provided that all the character
       
  2166              values are less than 256. Otherwise, it uses a bit map for low valued
       
  2167              characters, and individual items for others. Don't worry about character
       
  2168              types that aren't allowed in classes - they'll get picked up during the
       
  2169              compile. A character class that contains only one single-byte character
       
  2170              uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
       
  2171              where we can. (In UTF-8 mode we can do this only for chars < 128.) */
       
  2172                 
       
  2173             case '[': {
       
  2174                 int class_optcount;
       
  2175                 if (*(++ptr) == '^') {
       
  2176                     class_optcount = 10;  /* Greater than one */
       
  2177                     ptr++;
       
  2178                 }
       
  2179                 else
       
  2180                     class_optcount = 0;
       
  2181                 
       
  2182                 bool class_utf8 = false;
       
  2183                 
       
  2184                 for (; ptr < patternEnd && *ptr != ']'; ++ptr) {
       
  2185                     /* Check for escapes */
       
  2186                     
       
  2187                     if (*ptr == '\\') {
       
  2188                         c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true);
       
  2189                         if (errorcode != 0)
       
  2190                             return -1;
       
  2191                         
       
  2192                         /* Handle escapes that turn into characters */
       
  2193                         
       
  2194                         if (c >= 0)
       
  2195                             goto NON_SPECIAL_CHARACTER;
       
  2196                         
       
  2197                         /* Escapes that are meta-things. The normal ones just affect the
       
  2198                          bit map, but Unicode properties require an XCLASS extended item. */
       
  2199                         
       
  2200                         else
       
  2201                             class_optcount = 10;         /* \d, \s etc; make sure > 1 */
       
  2202                     }
       
  2203                     
       
  2204                     /* Anything else increments the possible optimization count. We have to
       
  2205                      detect ranges here so that we can compute the number of extra ranges for
       
  2206                      caseless wide characters when UCP support is available. If there are wide
       
  2207                      characters, we are going to have to use an XCLASS, even for single
       
  2208                      characters. */
       
  2209                     
       
  2210                     else {
       
  2211                         c = *ptr;
       
  2212                         
       
  2213                         /* Come here from handling \ above when it escapes to a char value */
       
  2214                         
       
  2215                     NON_SPECIAL_CHARACTER:
       
  2216                         class_optcount++;
       
  2217                         
       
  2218                         int d = -1;
       
  2219                         if (safelyCheckNextChar(ptr, patternEnd, '-')) {
       
  2220                             const UChar* hyptr = ptr++;
       
  2221                             if (safelyCheckNextChar(ptr, patternEnd, '\\')) {
       
  2222                                 ptr++;
       
  2223                                 d = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true);
       
  2224                                 if (errorcode != 0)
       
  2225                                     return -1;
       
  2226                             }
       
  2227                             else if ((ptr + 1 < patternEnd) && ptr[1] != ']')
       
  2228                                 d = *++ptr;
       
  2229                             if (d < 0)
       
  2230                                 ptr = hyptr;      /* go back to hyphen as data */
       
  2231                         }
       
  2232                         
       
  2233                         /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
       
  2234                          127 for caseless matching, we will need to use an XCLASS. */
       
  2235                         
       
  2236                         if (d >= 0) {
       
  2237                             class_optcount = 10;     /* Ensure > 1 */
       
  2238                             if (d < c) {
       
  2239                                 errorcode = ERR8;
       
  2240                                 return -1;
       
  2241                             }
       
  2242                             
       
  2243                             if ((d > 255 || (ignoreCase && d > 127))) {
       
  2244                                 unsigned char buffer[6];
       
  2245                                 if (!class_utf8)         /* Allow for XCLASS overhead */
       
  2246                                 {
       
  2247                                     class_utf8 = true;
       
  2248                                     length += LINK_SIZE + 2;
       
  2249                                 }
       
  2250                                 
       
  2251                                 /* If we have UCP support, find out how many extra ranges are
       
  2252                                  needed to map the other case of characters within this range. We
       
  2253                                  have to mimic the range optimization here, because extending the
       
  2254                                  range upwards might push d over a boundary that makes it use
       
  2255                                  another byte in the UTF-8 representation. */
       
  2256                                 
       
  2257                                 if (ignoreCase) {
       
  2258                                     int occ, ocd;
       
  2259                                     int cc = c;
       
  2260                                     int origd = d;
       
  2261                                     while (getOthercaseRange(&cc, origd, &occ, &ocd)) {
       
  2262                                         if (occ >= c && ocd <= d)
       
  2263                                             continue;   /* Skip embedded */
       
  2264                                         
       
  2265                                         if (occ < c  && ocd >= c - 1)  /* Extend the basic range */
       
  2266                                         {                            /* if there is overlap,   */
       
  2267                                             c = occ;                     /* noting that if occ < c */
       
  2268                                             continue;                    /* we can't have ocd > d  */
       
  2269                                         }                            /* because a subrange is  */
       
  2270                                         if (ocd > d && occ <= d + 1)   /* always shorter than    */
       
  2271                                         {                            /* the basic range.       */
       
  2272                                             d = ocd;
       
  2273                                             continue;
       
  2274                                         }
       
  2275                                         
       
  2276                                         /* An extra item is needed */
       
  2277                                         
       
  2278                                         length += 1 + encodeUTF8(occ, buffer) +
       
  2279                                         ((occ == ocd) ? 0 : encodeUTF8(ocd, buffer));
       
  2280                                     }
       
  2281                                 }
       
  2282                                 
       
  2283                                 /* The length of the (possibly extended) range */
       
  2284                                 
       
  2285                                 length += 1 + encodeUTF8(c, buffer) + encodeUTF8(d, buffer);
       
  2286                             }
       
  2287                             
       
  2288                         }
       
  2289                         
       
  2290                         /* We have a single character. There is nothing to be done unless we
       
  2291                          are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
       
  2292                          allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
       
  2293                          support. */
       
  2294                         
       
  2295                         else {
       
  2296                             if ((c > 255 || (ignoreCase && c > 127))) {
       
  2297                                 unsigned char buffer[6];
       
  2298                                 class_optcount = 10;     /* Ensure > 1 */
       
  2299                                 if (!class_utf8)         /* Allow for XCLASS overhead */
       
  2300                                 {
       
  2301                                     class_utf8 = true;
       
  2302                                     length += LINK_SIZE + 2;
       
  2303                                 }
       
  2304                                 length += (ignoreCase ? 2 : 1) * (1 + encodeUTF8(c, buffer));
       
  2305                             }
       
  2306                         }
       
  2307                     }
       
  2308                 }
       
  2309                 
       
  2310                 if (ptr >= patternEnd) {   /* Missing terminating ']' */
       
  2311                     errorcode = ERR6;
       
  2312                     return -1;
       
  2313                 }
       
  2314                 
       
  2315                 /* We can optimize when there was only one optimizable character.
       
  2316                  Note that this does not detect the case of a negated single character.
       
  2317                  In that case we do an incorrect length computation, but it's not a serious
       
  2318                  problem because the computed length is too large rather than too small. */
       
  2319 
       
  2320                 if (class_optcount == 1)
       
  2321                     goto NORMAL_CHAR;
       
  2322 
       
  2323                 /* Here, we handle repeats for the class opcodes. */
       
  2324                 {
       
  2325                     length += 33;
       
  2326                     
       
  2327                     /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
       
  2328                      we also need extra for wrapping the whole thing in a sub-pattern. */
       
  2329                     
       
  2330                     if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) {
       
  2331                         ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
       
  2332                         if (errorcode != 0)
       
  2333                             return -1;
       
  2334                         if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
       
  2335                             (minRepeats == 1 && maxRepeats == -1))
       
  2336                             length++;
       
  2337                         else
       
  2338                             length += 5;
       
  2339                         if (safelyCheckNextChar(ptr, patternEnd, '+')) {
       
  2340                             ptr++;
       
  2341                             length += 2 + 2 * LINK_SIZE;
       
  2342                         } else if (safelyCheckNextChar(ptr, patternEnd, '?'))
       
  2343                             ptr++;
       
  2344                     }
       
  2345                 }
       
  2346                 continue;
       
  2347             }
       
  2348 
       
  2349             /* Brackets may be genuine groups or special things */
       
  2350                 
       
  2351             case '(': {
       
  2352                 int branch_newextra = 0;
       
  2353                 int bracket_length = 1 + LINK_SIZE;
       
  2354                 bool capturing = false;
       
  2355                 
       
  2356                 /* Handle special forms of bracket, which all start (? */
       
  2357                 
       
  2358                 if (safelyCheckNextChar(ptr, patternEnd, '?')) {
       
  2359                     switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0)) {
       
  2360                         /* Non-referencing groups and lookaheads just move the pointer on, and
       
  2361                          then behave like a non-special bracket, except that they don't increment
       
  2362                          the count of extracting brackets. Ditto for the "once only" bracket,
       
  2363                          which is in Perl from version 5.005. */
       
  2364                             
       
  2365                         case ':':
       
  2366                         case '=':
       
  2367                         case '!':
       
  2368                             ptr += 2;
       
  2369                             break;
       
  2370                             
       
  2371                         /* Else loop checking valid options until ) is met. Anything else is an
       
  2372                          error. If we are without any brackets, i.e. at top level, the settings
       
  2373                          act as if specified in the options, so massage the options immediately.
       
  2374                          This is for backward compatibility with Perl 5.004. */
       
  2375                             
       
  2376                         default:
       
  2377                             errorcode = ERR12;
       
  2378                             return -1;
       
  2379                     }
       
  2380                 } else
       
  2381                     capturing = 1;
       
  2382                 
       
  2383                 /* Capturing brackets must be counted so we can process escapes in a
       
  2384                  Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
       
  2385                  an additional 3 bytes of memory per capturing bracket. */
       
  2386                 
       
  2387                 if (capturing) {
       
  2388                     bracount++;
       
  2389                     if (bracount > EXTRACT_BASIC_MAX)
       
  2390                         bracket_length += 3;
       
  2391                 }
       
  2392                 
       
  2393                 /* Save length for computing whole length at end if there's a repeat that
       
  2394                  requires duplication of the group. Also save the current value of
       
  2395                  branch_extra, and start the new group with the new value. If non-zero, this
       
  2396                  will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
       
  2397                 
       
  2398                 if (brastackptr >= sizeof(brastack)/sizeof(int)) {
       
  2399                     errorcode = ERR17;
       
  2400                     return -1;
       
  2401                 }
       
  2402                 
       
  2403                 bralenstack[brastackptr] = branch_extra;
       
  2404                 branch_extra = branch_newextra;
       
  2405                 
       
  2406                 brastack[brastackptr++] = length;
       
  2407                 length += bracket_length;
       
  2408                 continue;
       
  2409             }
       
  2410 
       
  2411             /* Handle ket. Look for subsequent maxRepeats/minRepeats; for certain sets of values we
       
  2412              have to replicate this bracket up to that many times. If brastackptr is
       
  2413              0 this is an unmatched bracket which will generate an error, but take care
       
  2414              not to try to access brastack[-1] when computing the length and restoring
       
  2415              the branch_extra value. */
       
  2416 
       
  2417             case ')': {
       
  2418                 int duplength;
       
  2419                 length += 1 + LINK_SIZE;
       
  2420                 if (brastackptr > 0) {
       
  2421                     duplength = length - brastack[--brastackptr];
       
  2422                     branch_extra = bralenstack[brastackptr];
       
  2423                 }
       
  2424                 else
       
  2425                     duplength = 0;
       
  2426                 
       
  2427                 /* Leave ptr at the final char; for readRepeatCounts this happens
       
  2428                  automatically; for the others we need an increment. */
       
  2429                 
       
  2430                 if ((ptr + 1 < patternEnd) && (c = ptr[1]) == '{' && isCountedRepeat(ptr + 2, patternEnd)) {
       
  2431                     ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
       
  2432                     if (errorcode)
       
  2433                         return -1;
       
  2434                 } else if (c == '*') {
       
  2435                     minRepeats = 0;
       
  2436                     maxRepeats = -1;
       
  2437                     ptr++;
       
  2438                 } else if (c == '+') {
       
  2439                     minRepeats = 1;
       
  2440                     maxRepeats = -1;
       
  2441                     ptr++;
       
  2442                 } else if (c == '?') {
       
  2443                     minRepeats = 0;
       
  2444                     maxRepeats = 1;
       
  2445                     ptr++;
       
  2446                 } else {
       
  2447                     minRepeats = 1;
       
  2448                     maxRepeats = 1;
       
  2449                 }
       
  2450                 
       
  2451                 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
       
  2452                  group, and if the maximum is greater than zero, we have to replicate
       
  2453                  maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
       
  2454                  bracket set. */
       
  2455                 
       
  2456                 int repeatsLength;
       
  2457                 if (minRepeats == 0) {
       
  2458                     length++;
       
  2459                     if (maxRepeats > 0) {
       
  2460                         repeatsLength = multiplyWithOverflowCheck(maxRepeats - 1, duplength + 3 + 2 * LINK_SIZE);
       
  2461                         if (repeatsLength < 0) {
       
  2462                             errorcode = ERR16;
       
  2463                             return -1;
       
  2464                         }
       
  2465                         length += repeatsLength;
       
  2466                         if (length > MAX_PATTERN_SIZE) {
       
  2467                             errorcode = ERR16;
       
  2468                             return -1;
       
  2469                         }
       
  2470                     }
       
  2471                 }
       
  2472                 
       
  2473                 /* When the minimum is greater than zero, we have to replicate up to
       
  2474                  minval-1 times, with no additions required in the copies. Then, if there
       
  2475                  is a limited maximum we have to replicate up to maxval-1 times allowing
       
  2476                  for a BRAZERO item before each optional copy and nesting brackets for all
       
  2477                  but one of the optional copies. */
       
  2478                 
       
  2479                 else {
       
  2480                     repeatsLength = multiplyWithOverflowCheck(minRepeats - 1, duplength);
       
  2481                     if (repeatsLength < 0) {
       
  2482                         errorcode = ERR16;
       
  2483                         return -1;
       
  2484                     }
       
  2485                     length += repeatsLength;
       
  2486                     if (maxRepeats > minRepeats) { /* Need this test as maxRepeats=-1 means no limit */
       
  2487                         repeatsLength = multiplyWithOverflowCheck(maxRepeats - minRepeats, duplength + 3 + 2 * LINK_SIZE);
       
  2488                         if (repeatsLength < 0) {
       
  2489                             errorcode = ERR16;
       
  2490                             return -1;
       
  2491                         }
       
  2492                         length += repeatsLength - (2 + 2 * LINK_SIZE);
       
  2493                     }
       
  2494                     if (length > MAX_PATTERN_SIZE) {
       
  2495                         errorcode = ERR16;
       
  2496                         return -1;
       
  2497                     }
       
  2498                 }
       
  2499                 
       
  2500                 /* Allow space for once brackets for "possessive quantifier" */
       
  2501                 
       
  2502                 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
       
  2503                     ptr++;
       
  2504                     length += 2 + 2 * LINK_SIZE;
       
  2505                 }
       
  2506                 continue;
       
  2507             }
       
  2508 
       
  2509             /* Non-special character. It won't be space or # in extended mode, so it is
       
  2510              always a genuine character. If we are in a \Q...\E sequence, check for the
       
  2511              end; if not, we have a literal. */
       
  2512                 
       
  2513             default:
       
  2514             NORMAL_CHAR:
       
  2515                 length += 2;          /* For a one-byte character */
       
  2516                 lastitemlength = 1;   /* Default length of last item for repeats */
       
  2517 
       
  2518                 if (c > 127) {
       
  2519                     int i;
       
  2520                     for (i = 0; i < jsc_pcre_utf8_table1_size; i++)
       
  2521                         if (c <= jsc_pcre_utf8_table1[i])
       
  2522                             break;
       
  2523                     length += i;
       
  2524                     lastitemlength += i;
       
  2525                 }
       
  2526                 
       
  2527                 continue;
       
  2528         }
       
  2529     }
       
  2530     
       
  2531     length += 2 + LINK_SIZE;    /* For final KET and END */
       
  2532 
       
  2533     cd.numCapturingBrackets = bracount;
       
  2534     return length;
       
  2535 }
       
  2536 
       
  2537 /*************************************************
       
  2538 *        Compile a Regular Expression            *
       
  2539 *************************************************/
       
  2540 
       
  2541 /* This function takes a string and returns a pointer to a block of store
       
  2542 holding a compiled version of the expression. The original API for this
       
  2543 function had no error code return variable; it is retained for backwards
       
  2544 compatibility. The new function is given a new name.
       
  2545 
       
  2546 Arguments:
       
  2547   pattern       the regular expression
       
  2548   options       various option bits
       
  2549   errorCodePtr  pointer to error code variable (pcre_compile2() only)
       
  2550                   can be NULL if you don't want a code value
       
  2551   errorPtr      pointer to pointer to error text
       
  2552   erroroffset   ptr offset in pattern where error was detected
       
  2553   tables        pointer to character tables or NULL
       
  2554 
       
  2555 Returns:        pointer to compiled data block, or NULL on error,
       
  2556                 with errorPtr and erroroffset set
       
  2557 */
       
  2558 
       
  2559 static inline JSRegExp* returnError(ErrorCode errorcode, const char** errorPtr)
       
  2560 {
       
  2561     *errorPtr = errorText(errorcode);
       
  2562     return 0;
       
  2563 }
       
  2564 
       
  2565 JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength,
       
  2566                 JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline,
       
  2567                 unsigned* numSubpatterns, const char** errorPtr)
       
  2568 {
       
  2569     /* We can't pass back an error message if errorPtr is NULL; I guess the best we
       
  2570      can do is just return NULL, but we can set a code value if there is a code pointer. */
       
  2571     if (!errorPtr)
       
  2572         return 0;
       
  2573     *errorPtr = NULL;
       
  2574     
       
  2575     CompileData cd;
       
  2576     
       
  2577     ErrorCode errorcode = ERR0;
       
  2578     /* Call this once just to count the brackets. */
       
  2579     calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
       
  2580     /* Call it again to compute the length. */
       
  2581     int length = calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
       
  2582     if (errorcode)
       
  2583         return returnError(errorcode, errorPtr);
       
  2584     
       
  2585     if (length > MAX_PATTERN_SIZE)
       
  2586         return returnError(ERR16, errorPtr);
       
  2587     
       
  2588     size_t size = length + sizeof(JSRegExp);
       
  2589 #if REGEXP_HISTOGRAM
       
  2590     size_t stringOffset = (size + sizeof(UChar) - 1) / sizeof(UChar) * sizeof(UChar);
       
  2591     size = stringOffset + patternLength * sizeof(UChar);
       
  2592 #endif
       
  2593     JSRegExp* re = reinterpret_cast<JSRegExp*>(new char[size]);
       
  2594     
       
  2595     if (!re)
       
  2596         return returnError(ERR13, errorPtr);
       
  2597     
       
  2598     re->options = (ignoreCase ? IgnoreCaseOption : 0) | (multiline ? MatchAcrossMultipleLinesOption : 0);
       
  2599     
       
  2600     /* The starting points of the name/number translation table and of the code are
       
  2601      passed around in the compile data block. */
       
  2602     
       
  2603     const unsigned char* codeStart = (const unsigned char*)(re + 1);
       
  2604     
       
  2605     /* Set up a starting, non-extracting bracket, then compile the expression. On
       
  2606      error, errorcode will be set non-zero, so we don't need to look at the result
       
  2607      of the function here. */
       
  2608     
       
  2609     const UChar* ptr = (const UChar*)pattern;
       
  2610     const UChar* patternEnd = pattern + patternLength;
       
  2611     unsigned char* code = const_cast<unsigned char*>(codeStart);
       
  2612     int firstByte, reqByte;
       
  2613     int bracketCount = 0;
       
  2614     if (!cd.needOuterBracket)
       
  2615         compileBranch(re->options, &bracketCount, &code, &ptr, patternEnd, &errorcode, &firstByte, &reqByte, cd);
       
  2616     else {
       
  2617         *code = OP_BRA;
       
  2618         compileBracket(re->options, &bracketCount, &code, &ptr, patternEnd, &errorcode, 0, &firstByte, &reqByte, cd);
       
  2619     }
       
  2620     re->topBracket = bracketCount;
       
  2621     re->topBackref = cd.topBackref;
       
  2622     
       
  2623     /* If not reached end of pattern on success, there's an excess bracket. */
       
  2624     
       
  2625     if (errorcode == 0 && ptr < patternEnd)
       
  2626         errorcode = ERR10;
       
  2627     
       
  2628     /* Fill in the terminating state and check for disastrous overflow, but
       
  2629      if debugging, leave the test till after things are printed out. */
       
  2630     
       
  2631     *code++ = OP_END;
       
  2632 
       
  2633     ASSERT(code - codeStart <= length);
       
  2634     if (code - codeStart > length)
       
  2635         errorcode = ERR7;
       
  2636     
       
  2637     /* Give an error if there's back reference to a non-existent capturing
       
  2638      subpattern. */
       
  2639     
       
  2640     if (re->topBackref > re->topBracket)
       
  2641         errorcode = ERR15;
       
  2642     
       
  2643     /* Failed to compile, or error while post-processing */
       
  2644     
       
  2645     if (errorcode != ERR0) {
       
  2646         delete [] reinterpret_cast<char*>(re);
       
  2647         return returnError(errorcode, errorPtr);
       
  2648     }
       
  2649     
       
  2650     /* If the anchored option was not passed, set the flag if we can determine that
       
  2651      the pattern is anchored by virtue of ^ characters or \A or anything else (such
       
  2652      as starting with .* when DOTALL is set).
       
  2653      
       
  2654      Otherwise, if we know what the first character has to be, save it, because that
       
  2655      speeds up unanchored matches no end. If not, see if we can set the
       
  2656      UseMultiLineFirstByteOptimizationOption flag. This is helpful for multiline matches when all branches
       
  2657      start with ^. and also when all branches start with .* for non-DOTALL matches.
       
  2658      */
       
  2659     
       
  2660     if (cd.needOuterBracket ? bracketIsAnchored(codeStart) : branchIsAnchored(codeStart))
       
  2661         re->options |= IsAnchoredOption;
       
  2662     else {
       
  2663         if (firstByte < 0) {
       
  2664             firstByte = (cd.needOuterBracket
       
  2665                     ? bracketFindFirstAssertedCharacter(codeStart, false)
       
  2666                     : branchFindFirstAssertedCharacter(codeStart, false))
       
  2667                 | ((re->options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0);
       
  2668         }
       
  2669         if (firstByte >= 0) {
       
  2670             int ch = firstByte & 255;
       
  2671             if (ch < 127) {
       
  2672                 re->firstByte = ((firstByte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? ch : firstByte;
       
  2673                 re->options |= UseFirstByteOptimizationOption;
       
  2674             }
       
  2675         } else {
       
  2676             if (cd.needOuterBracket ? bracketNeedsLineStart(codeStart, 0, cd.backrefMap) : branchNeedsLineStart(codeStart, 0, cd.backrefMap))
       
  2677                 re->options |= UseMultiLineFirstByteOptimizationOption;
       
  2678         }
       
  2679     }
       
  2680     
       
  2681     /* For an anchored pattern, we use the "required byte" only if it follows a
       
  2682      variable length item in the regex. Remove the caseless flag for non-caseable
       
  2683      bytes. */
       
  2684     
       
  2685     if (reqByte >= 0 && (!(re->options & IsAnchoredOption) || (reqByte & REQ_VARY))) {
       
  2686         int ch = reqByte & 255;
       
  2687         if (ch < 127) {
       
  2688             re->reqByte = ((reqByte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? (reqByte & ~REQ_IGNORE_CASE) : reqByte;
       
  2689             re->options |= UseRequiredByteOptimizationOption;
       
  2690         }
       
  2691     }
       
  2692     
       
  2693 #if REGEXP_HISTOGRAM
       
  2694     re->stringOffset = stringOffset;
       
  2695     re->stringLength = patternLength;
       
  2696     memcpy(reinterpret_cast<char*>(re) + stringOffset, pattern, patternLength * 2);
       
  2697 #endif
       
  2698 
       
  2699     if (numSubpatterns)
       
  2700         *numSubpatterns = re->topBracket;
       
  2701     return re;
       
  2702 }
       
  2703 
       
  2704 void jsRegExpFree(JSRegExp* re)
       
  2705 {
       
  2706     delete [] reinterpret_cast<char*>(re);
       
  2707 }