libraries/spcre/libpcre/pcre/pcre_compile.c
changeset 0 7f656887cf89
equal deleted inserted replaced
-1:000000000000 0:7f656887cf89
       
     1 /*************************************************
       
     2 *      Perl-Compatible Regular Expressions       *
       
     3 *************************************************/
       
     4 
       
     5 /* PCRE is a library of functions to support regular expressions whose syntax
       
     6 and semantics are as close as possible to those of the Perl 5 language.
       
     7 
       
     8                        Written by Philip Hazel
       
     9            Copyright (c) 1997-2008 University of Cambridge
       
    10 
       
    11 -----------------------------------------------------------------------------
       
    12 Redistribution and use in source and binary forms, with or without
       
    13 modification, are permitted provided that the following conditions are met:
       
    14 
       
    15     * Redistributions of source code must retain the above copyright notice,
       
    16       this list of conditions and the following disclaimer.
       
    17 
       
    18     * Redistributions in binary form must reproduce the above copyright
       
    19       notice, this list of conditions and the following disclaimer in the
       
    20       documentation and/or other materials provided with the distribution.
       
    21 
       
    22     * Neither the name of the University of Cambridge nor the names of its
       
    23       contributors may be used to endorse or promote products derived from
       
    24       this software without specific prior written permission.
       
    25 
       
    26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
       
    27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       
    28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       
    29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
       
    30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       
    31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       
    32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       
    33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       
    34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       
    35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       
    36 POSSIBILITY OF SUCH DAMAGE.
       
    37 -----------------------------------------------------------------------------
       
    38 */
       
    39 
       
    40 
       
    41 /* This module contains the external function pcre_compile(), along with
       
    42 supporting internal functions that are not used by other modules. */
       
    43 
       
    44 
       
    45 #ifdef HAVE_CONFIG_H
       
    46 #include "config.h"
       
    47 #endif
       
    48 
       
    49 #define NLBLOCK cd             /* Block containing newline information */
       
    50 #define PSSTART start_pattern  /* Field containing processed string start */
       
    51 #define PSEND   end_pattern    /* Field containing processed string end */
       
    52 
       
    53 #include "pcre_internal.h"
       
    54 
       
    55 
       
    56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
       
    57 used by pcretest. DEBUG is not defined when building a production library. */
       
    58 
       
    59 #ifdef DEBUG
       
    60 #include "pcre_printint.src"
       
    61 #endif
       
    62 
       
    63 
       
    64 /* Macro for setting individual bits in class bitmaps. */
       
    65 
       
    66 #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
       
    67 
       
    68 /* Maximum length value to check against when making sure that the integer that
       
    69 holds the compiled pattern length does not overflow. We make it a bit less than
       
    70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
       
    71 to check them every time. */
       
    72 
       
    73 #define OFLOW_MAX (INT_MAX - 20)
       
    74 
       
    75 
       
    76 /*************************************************
       
    77 *      Code parameters and static tables         *
       
    78 *************************************************/
       
    79 
       
    80 /* This value specifies the size of stack workspace that is used during the
       
    81 first pre-compile phase that determines how much memory is required. The regex
       
    82 is partly compiled into this space, but the compiled parts are discarded as
       
    83 soon as they can be, so that hopefully there will never be an overrun. The code
       
    84 does, however, check for an overrun. The largest amount I've seen used is 218,
       
    85 so this number is very generous.
       
    86 
       
    87 The same workspace is used during the second, actual compile phase for
       
    88 remembering forward references to groups so that they can be filled in at the
       
    89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
       
    90 is 4 there is plenty of room. */
       
    91 
       
    92 #define COMPILE_WORK_SIZE (4096)
       
    93 
       
    94 
       
    95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
       
    96 are simple data values; negative values are for special things like \d and so
       
    97 on. Zero means further processing is needed (for things like \x), or the escape
       
    98 is invalid. */
       
    99 
       
   100 #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
       
   101 static const short int escapes[] = {
       
   102      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
       
   103      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
       
   104    '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
       
   105 -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
       
   106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
       
   107 -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
       
   108    '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
       
   109 -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
       
   110 -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
       
   111      0,      0, -ESC_z                                            /* x - z */
       
   112 };
       
   113 
       
   114 #else           /* This is the "abnormal" table for EBCDIC systems */
       
   115 static const short int escapes[] = {
       
   116 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
       
   117 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
       
   118 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
       
   119 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
       
   120 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
       
   121 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
       
   122 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
       
   123 /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
       
   124 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
       
   125 /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
       
   126 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
       
   127 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
       
   128 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
       
   129 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
       
   130 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
       
   131 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
       
   132 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
       
   133 /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
       
   134 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
       
   135 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
       
   136 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
       
   137 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
       
   138 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
       
   139 };
       
   140 #endif
       
   141 
       
   142 
       
   143 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
       
   144 searched linearly. Put all the names into a single string, in order to reduce
       
   145 the number of relocations when a shared library is dynamically linked. */
       
   146 
       
    147 typedef struct verbitem {   /* One entry per verb name, in verbnames order */
       
    148   int   len;                /* Length of the verb's name, excluding the NUL */
       
    149   int   op;                 /* Opcode for the verb, e.g. OP_ACCEPT */
       
    150 } verbitem;
       
   151 
       
   152 static const char verbnames[] =
       
   153   "ACCEPT\0"
       
   154   "COMMIT\0"
       
   155   "F\0"
       
   156   "FAIL\0"
       
   157   "PRUNE\0"
       
   158   "SKIP\0"
       
   159   "THEN";
       
   160 
       
   161 static const verbitem verbs[] = {
       
   162   { 6, OP_ACCEPT },
       
   163   { 6, OP_COMMIT },
       
   164   { 1, OP_FAIL },
       
   165   { 4, OP_FAIL },
       
   166   { 5, OP_PRUNE },
       
   167   { 4, OP_SKIP  },
       
   168   { 4, OP_THEN  }
       
   169 };
       
   170 
       
   171 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
       
   172 
       
   173 
       
   174 /* Tables of names of POSIX character classes and their lengths. The names are
       
   175 now all in a single string, to reduce the number of relocations when a shared
       
   176 library is dynamically loaded. The list of lengths is terminated by a zero
       
   177 length entry. The first three must be alpha, lower, upper, as this is assumed
       
   178 for handling case independence. */
       
   179 
       
   180 static const char posix_names[] =
       
   181   "alpha\0"  "lower\0"  "upper\0"  "alnum\0"  "ascii\0"  "blank\0"
       
   182   "cntrl\0"  "digit\0"  "graph\0"  "print\0"  "punct\0"  "space\0"
       
   183   "word\0"   "xdigit";
       
   184 
       
   185 static const uschar posix_name_lengths[] = {
       
   186   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
       
   187 
       
   188 /* Table of class bit maps for each POSIX class. Each class is formed from a
       
   189 base map, with an optional addition or removal of another map. Then, for some
       
   190 classes, there is some additional tweaking: for [:blank:] the vertical space
       
   191 characters are removed, and for [:alpha:] and [:alnum:] the underscore
       
   192 character is removed. The triples in the table consist of the base map offset,
       
   193 second map offset or -1 if no second map, and a non-negative value for map
       
   194 addition or a negative value for map subtraction (if there are two maps). The
       
   195 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
       
   196 remove vertical space characters, 2 => remove underscore. */
       
   197 
       
   198 static const int posix_class_maps[] = {
       
   199   cbit_word,  cbit_digit, -2,             /* alpha */
       
   200   cbit_lower, -1,          0,             /* lower */
       
   201   cbit_upper, -1,          0,             /* upper */
       
   202   cbit_word,  -1,          2,             /* alnum - word without underscore */
       
   203   cbit_print, cbit_cntrl,  0,             /* ascii */
       
   204   cbit_space, -1,          1,             /* blank - a GNU extension */
       
   205   cbit_cntrl, -1,          0,             /* cntrl */
       
   206   cbit_digit, -1,          0,             /* digit */
       
   207   cbit_graph, -1,          0,             /* graph */
       
   208   cbit_print, -1,          0,             /* print */
       
   209   cbit_punct, -1,          0,             /* punct */
       
   210   cbit_space, -1,          0,             /* space */
       
   211   cbit_word,  -1,          0,             /* word - a Perl extension */
       
   212   cbit_xdigit,-1,          0              /* xdigit */
       
   213 };
       
   214 
       
   215 
       
   216 #define STRING(a)  # a
       
   217 #define XSTRING(s) STRING(s)
       
   218 
       
   219 /* The texts of compile-time error messages. These are "char *" because they
       
   220 are passed to the outside world. Do not ever re-use any error number, because
       
   221 they are documented. Always add a new error instead. Messages marked DEAD below
       
   222 are no longer used. This used to be a table of strings, but in order to reduce
       
   223 the number of relocations needed when a shared library is loaded dynamically,
       
   224 it is now one long string. We cannot use a table of offsets, because the
       
   225 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
       
   226 simply count through to the one we want - this isn't a performance issue
       
   227 because these strings are used only when there is a compilation error. */
       
   228 
       
   229 static const char error_texts[] =
       
   230   "no error\0"
       
   231   "\\ at end of pattern\0"
       
   232   "\\c at end of pattern\0"
       
   233   "unrecognized character follows \\\0"
       
   234   "numbers out of order in {} quantifier\0"
       
   235   /* 5 */
       
   236   "number too big in {} quantifier\0"
       
   237   "missing terminating ] for character class\0"
       
   238   "invalid escape sequence in character class\0"
       
   239   "range out of order in character class\0"
       
   240   "nothing to repeat\0"
       
   241   /* 10 */
       
   242   "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
       
   243   "internal error: unexpected repeat\0"
       
   244   "unrecognized character after (? or (?-\0"
       
   245   "POSIX named classes are supported only within a class\0"
       
   246   "missing )\0"
       
   247   /* 15 */
       
   248   "reference to non-existent subpattern\0"
       
   249   "erroffset passed as NULL\0"
       
   250   "unknown option bit(s) set\0"
       
   251   "missing ) after comment\0"
       
   252   "parentheses nested too deeply\0"  /** DEAD **/
       
   253   /* 20 */
       
   254   "regular expression is too large\0"
       
   255   "failed to get memory\0"
       
   256   "unmatched parentheses\0"
       
   257   "internal error: code overflow\0"
       
   258   "unrecognized character after (?<\0"
       
   259   /* 25 */
       
   260   "lookbehind assertion is not fixed length\0"
       
   261   "malformed number or name after (?(\0"
       
   262   "conditional group contains more than two branches\0"
       
   263   "assertion expected after (?(\0"
       
   264   "(?R or (?[+-]digits must be followed by )\0"
       
   265   /* 30 */
       
   266   "unknown POSIX class name\0"
       
   267   "POSIX collating elements are not supported\0"
       
   268   "this version of PCRE is not compiled with PCRE_UTF8 support\0"
       
   269   "spare error\0"  /** DEAD **/
       
   270   "character value in \\x{...} sequence is too large\0"
       
   271   /* 35 */
       
   272   "invalid condition (?(0)\0"
       
   273   "\\C not allowed in lookbehind assertion\0"
       
   274   "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
       
   275   "number after (?C is > 255\0"
       
   276   "closing ) for (?C expected\0"
       
   277   /* 40 */
       
   278   "recursive call could loop indefinitely\0"
       
   279   "unrecognized character after (?P\0"
       
   280   "syntax error in subpattern name (missing terminator)\0"
       
   281   "two named subpatterns have the same name\0"
       
   282   "invalid UTF-8 string\0"
       
   283   /* 45 */
       
   284   "support for \\P, \\p, and \\X has not been compiled\0"
       
   285   "malformed \\P or \\p sequence\0"
       
   286   "unknown property name after \\P or \\p\0"
       
   287   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
       
   288   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
       
   289   /* 50 */
       
   290   "repeated subpattern is too long\0"    /** DEAD **/
       
   291   "octal value is greater than \\377 (not in UTF-8 mode)\0"
       
   292   "internal error: overran compiling workspace\0"
       
   293   "internal error: previously-checked referenced subpattern not found\0"
       
   294   "DEFINE group contains more than one branch\0"
       
   295   /* 55 */
       
   296   "repeating a DEFINE group is not allowed\0"
       
   297   "inconsistent NEWLINE options\0"
       
   298   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
       
   299   "a numbered reference must not be zero\0"
       
   300   "(*VERB) with an argument is not supported\0"
       
   301   /* 60 */
       
   302   "(*VERB) not recognized\0"
       
   303   "number is too big\0"
       
   304   "subpattern name expected\0"
       
   305   "digit expected after (?+\0"
       
   306   "] is an invalid data character in JavaScript compatibility mode";
       
   307 
       
   308 
       
   309 /* Table to identify digits and hex digits. This is used when compiling
       
   310 patterns. Note that the tables in chartables are dependent on the locale, and
       
   311 may mark arbitrary characters as digits - but the PCRE compiling code expects
       
   312 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
       
   313 a private table here. It costs 256 bytes, but it is a lot faster than doing
       
   314 character value tests (at least in some simple cases I timed), and in some
       
   315 applications one wants PCRE to compile efficiently as well as match
       
   316 efficiently.
       
   317 
       
   318 For convenience, we use the same bit definitions as in chartables:
       
   319 
       
   320   0x04   decimal digit
       
   321   0x08   hexadecimal digit
       
   322 
       
   323 Then we can use ctype_digit and ctype_xdigit in the code. */
       
   324 
       
   325 #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
       
   326 static const unsigned char digitab[] =
       
   327   {
       
   328   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
       
   329   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
       
   330   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
       
   331   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
       
   332   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
       
   333   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
       
   334   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
       
   335   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
       
   336   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
       
   337   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
       
   338   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
       
   339   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
       
   340   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
       
   341   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
       
   342   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
       
   343   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
       
   344   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
       
   345   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
       
   346   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
       
   347   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
       
   348   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
       
   349   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
       
   350   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
       
   351   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
       
   352   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
       
   353   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
       
   354   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
       
   355   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
       
   356   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
       
   357   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
       
   358   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
       
   359   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
       
   360 
       
   361 #else           /* This is the "abnormal" case, for EBCDIC systems */
       
   362 static const unsigned char digitab[] =
       
   363   {
       
   364   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
       
   365   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
       
   366   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
       
   367   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
       
   368   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
       
   369   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
       
   370   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
       
   371   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
       
   372   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
       
   373   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
       
   374   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
       
   375   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
       
   376   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
       
   377   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
       
   378   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
       
   379   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
       
   380   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
       
   381   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
       
   382   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
       
   383   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
       
   384   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
       
   385   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
       
   386   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
       
   387   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
       
   388   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
       
   389   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
       
   390   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
       
   391   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
       
   392   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
       
   393   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
       
   394   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
       
   395   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
       
   396 
       
   397 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
       
   398   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
       
   399   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
       
   400   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
       
   401   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
       
   402   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
       
   403   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
       
   404   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
       
   405   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
       
   406   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
       
   407   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
       
   408   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
       
   409   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
       
   410   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
       
   411   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
       
   412   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
       
   413   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
       
   414   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
       
   415   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
       
   416   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
       
   417   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
       
   418   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
       
   419   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
       
   420   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
       
   421   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
       
   422   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
       
   423   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
       
   424   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
       
   425   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
       
   426   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
       
   427   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
       
   428   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
       
   429   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
       
   430 #endif
       
   431 
       
   432 
       
    433 /* Forward declaration to allow mutual recursion */
       
   434 
       
   435 static BOOL
       
   436   compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
       
   437     int *, int *, branch_chain *, compile_data *, int *);
       
   438 
       
   439 
       
   440 
       
   441 /*************************************************
       
   442 *            Find an error text                  *
       
   443 *************************************************/
       
   444 
       
   445 /* The error texts are now all in one long string, to save on relocations. As
       
   446 some of the text is of unknown length, we can't use a table of offsets.
       
   447 Instead, just count through the strings. This is not a performance issue
       
   448 because it happens only when there has been a compilation error.
       
   449 
       
   450 Argument:   the error number
       
   451 Returns:    pointer to the error string
       
   452 */
       
   453 
       
   454 static const char *
       
   455 find_error_text(int n)
       
   456 {
       
   457 const char *s = error_texts;
       
   458 for (; n > 0; n--) while (*s++ != 0) {};
       
   459 return s;
       
   460 }
       
   461 
       
   462 
       
   463 /*************************************************
       
   464 *            Handle escapes                      *
       
   465 *************************************************/
       
   466 
       
   467 /* This function is called when a \ has been encountered. It either returns a
       
   468 positive value for a simple escape such as \n, or a negative value which
       
   469 encodes one of the more complicated things such as \d. A backreference to group
       
   470 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
       
   471 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
       
   472 ptr is pointing at the \. On exit, it is on the final character of the escape
       
   473 sequence.
       
   474 
       
   475 Arguments:
       
   476   ptrptr         points to the pattern position pointer
       
   477   errorcodeptr   points to the errorcode variable
       
   478   bracount       number of previous extracting brackets
       
   479   options        the options bits
       
   480   isclass        TRUE if inside a character class
       
   481 
       
   482 Returns:         zero or positive => a data character
       
   483                  negative => a special escape sequence
       
   484                  on error, errorcodeptr is set
       
   485 */
       
   486 
       
   487 static int
       
   488 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
       
   489   int options, BOOL isclass)
       
   490 {
       
   491 BOOL utf8 = (options & PCRE_UTF8) != 0;
       
   492 const uschar *ptr = *ptrptr + 1;
       
   493 int c, i;
       
   494 
       
   495 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
       
   496 ptr--;                            /* Set pointer back to the last byte */
       
   497 
       
   498 /* If backslash is at the end of the pattern, it's an error. */
       
   499 
       
   500 if (c == 0) *errorcodeptr = ERR1;
       
   501 
       
   502 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
       
   503 in a table. A non-zero result is something that can be returned immediately.
       
   504 Otherwise further processing may be required. */
       
   505 
       
   506 #ifndef EBCDIC  /* ASCII coding */
       
   507 else if (c < '0' || c > 'z') {}                           /* Not alphanumeric */
       
   508 else if ((i = escapes[c - '0']) != 0) c = i;
       
   509 
       
   510 #else           /* EBCDIC coding */
       
   511 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
       
   512 else if ((i = escapes[c - 0x48]) != 0)  c = i;
       
   513 #endif
       
   514 
       
   515 /* Escapes that need further processing, or are illegal. */
       
   516 
       
   517 else
       
   518   {
       
   519   const uschar *oldptr;
       
   520   BOOL braced, negated;
       
   521 
       
   522   switch (c)
       
   523     {
       
   524     /* A number of Perl escapes are not handled by PCRE. We give an explicit
       
   525     error. */
       
   526 
       
   527     case 'l':
       
   528     case 'L':
       
   529     case 'N':
       
   530     case 'u':
       
   531     case 'U':
       
   532     *errorcodeptr = ERR37;
       
   533     break;
       
   534 
       
   535     /* \g must be followed by one of a number of specific things:
       
   536 
       
   537     (1) A number, either plain or braced. If positive, it is an absolute
       
   538     backreference. If negative, it is a relative backreference. This is a Perl
       
   539     5.10 feature.
       
   540 
       
   541     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
       
   542     is part of Perl's movement towards a unified syntax for back references. As
       
   543     this is synonymous with \k{name}, we fudge it up by pretending it really
       
   544     was \k.
       
   545 
       
   546     (3) For Oniguruma compatibility we also support \g followed by a name or a
       
   547     number either in angle brackets or in single quotes. However, these are
       
   548     (possibly recursive) subroutine calls, _not_ backreferences. Just return
       
   549     the -ESC_g code (cf \k). */
       
   550 
       
   551     case 'g':
       
   552     if (ptr[1] == '<' || ptr[1] == '\'')
       
   553       {
       
   554       c = -ESC_g;
       
   555       break;
       
   556       }
       
   557 
       
   558     /* Handle the Perl-compatible cases */
       
   559 
       
   560     if (ptr[1] == '{')
       
   561       {
       
   562       const uschar *p;
       
   563       for (p = ptr+2; *p != 0 && *p != '}'; p++)
       
   564         if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
       
   565       if (*p != 0 && *p != '}')
       
   566         {
       
   567         c = -ESC_k;
       
   568         break;
       
   569         }
       
   570       braced = TRUE;
       
   571       ptr++;
       
   572       }
       
   573     else braced = FALSE;
       
   574 
       
   575     if (ptr[1] == '-')
       
   576       {
       
   577       negated = TRUE;
       
   578       ptr++;
       
   579       }
       
   580     else negated = FALSE;
       
   581 
       
   582     c = 0;
       
   583     while ((digitab[ptr[1]] & ctype_digit) != 0)
       
   584       c = c * 10 + *(++ptr) - '0';
       
   585 
       
   586     if (c < 0)   /* Integer overflow */
       
   587       {
       
   588       *errorcodeptr = ERR61;
       
   589       break;
       
   590       }
       
   591 
       
   592     if (braced && *(++ptr) != '}')
       
   593       {
       
   594       *errorcodeptr = ERR57;
       
   595       break;
       
   596       }
       
   597 
       
   598     if (c == 0)
       
   599       {
       
   600       *errorcodeptr = ERR58;
       
   601       break;
       
   602       }
       
   603 
       
   604     if (negated)
       
   605       {
       
   606       if (c > bracount)
       
   607         {
       
   608         *errorcodeptr = ERR15;
       
   609         break;
       
   610         }
       
   611       c = bracount - (c - 1);
       
   612       }
       
   613 
       
   614     c = -(ESC_REF + c);
       
   615     break;
       
   616 
       
   617     /* The handling of escape sequences consisting of a string of digits
       
   618     starting with one that is not zero is not straightforward. By experiment,
       
   619     the way Perl works seems to be as follows:
       
   620 
       
   621     Outside a character class, the digits are read as a decimal number. If the
       
   622     number is less than 10, or if there are that many previous extracting
       
   623     left brackets, then it is a back reference. Otherwise, up to three octal
       
   624     digits are read to form an escaped byte. Thus \123 is likely to be octal
       
   625     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
       
   626     value is greater than 377, the least significant 8 bits are taken. Inside a
       
   627     character class, \ followed by a digit is always an octal number. */
       
   628 
       
   629     case '1': case '2': case '3': case '4': case '5':
       
   630     case '6': case '7': case '8': case '9':
       
   631 
       
   632     if (!isclass)
       
   633       {
       
   634       oldptr = ptr;
       
   635       c -= '0';
       
   636       while ((digitab[ptr[1]] & ctype_digit) != 0)
       
   637         c = c * 10 + *(++ptr) - '0';
       
   638       if (c < 0)    /* Integer overflow */
       
   639         {
       
   640         *errorcodeptr = ERR61;
       
   641         break;
       
   642         }
       
   643       if (c < 10 || c <= bracount)
       
   644         {
       
   645         c = -(ESC_REF + c);
       
   646         break;
       
   647         }
       
   648       ptr = oldptr;      /* Put the pointer back and fall through */
       
   649       }
       
   650 
       
   651     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
       
   652     generates a binary zero byte and treats the digit as a following literal.
       
   653     Thus we have to pull back the pointer by one. */
       
   654 
       
   655     if ((c = *ptr) >= '8')
       
   656       {
       
   657       ptr--;
       
   658       c = 0;
       
   659       break;
       
   660       }
       
   661 
       
   662     /* \0 always starts an octal number, but we may drop through to here with a
       
   663     larger first octal digit. The original code used just to take the least
       
   664     significant 8 bits of octal numbers (I think this is what early Perls used
       
   665     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
       
   666     than 3 octal digits. */
       
   667 
       
   668     case '0':
       
   669     c -= '0';
       
   670     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
       
   671         c = c * 8 + *(++ptr) - '0';
       
   672     if (!utf8 && c > 255) *errorcodeptr = ERR51;
       
   673     break;
       
   674 
       
   675     /* \x is complicated. \x{ddd} is a character number which can be greater
       
   676     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
       
   677     treated as a data character. */
       
   678 
       
   679     case 'x':
       
   680     if (ptr[1] == '{')
       
   681       {
       
   682       const uschar *pt = ptr + 2;
       
   683       int count = 0;
       
   684 
       
   685       c = 0;
       
   686       while ((digitab[*pt] & ctype_xdigit) != 0)
       
   687         {
       
   688         register int cc = *pt++;
       
   689         if (c == 0 && cc == '0') continue;     /* Leading zeroes */
       
   690         count++;
       
   691 
       
   692 #ifndef EBCDIC  /* ASCII coding */
       
   693         if (cc >= 'a') cc -= 32;               /* Convert to upper case */
       
   694         c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
       
   695 #else           /* EBCDIC coding */
       
   696         if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
       
   697         c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
       
   698 #endif
       
   699         }
       
   700 
       
   701       if (*pt == '}')
       
   702         {
       
   703         if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
       
   704         ptr = pt;
       
   705         break;
       
   706         }
       
   707 
       
   708       /* If the sequence of hex digits does not end with '}', then we don't
       
   709       recognize this construct; fall through to the normal \x handling. */
       
   710       }
       
   711 
       
   712     /* Read just a single-byte hex-defined char */
       
   713 
       
   714     c = 0;
       
   715     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
       
   716       {
       
   717       int cc;                               /* Some compilers don't like ++ */
       
   718       cc = *(++ptr);                        /* in initializers */
       
   719 #ifndef EBCDIC  /* ASCII coding */
       
   720       if (cc >= 'a') cc -= 32;              /* Convert to upper case */
       
   721       c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
       
   722 #else           /* EBCDIC coding */
       
   723       if (cc <= 'z') cc += 64;              /* Convert to upper case */
       
   724       c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
       
   725 #endif
       
   726       }
       
   727     break;
       
   728 
       
   729     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
       
   730     This coding is ASCII-specific, but then the whole concept of \cx is
       
   731     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
       
   732 
       
   733     case 'c':
       
   734     c = *(++ptr);
       
   735     if (c == 0)
       
   736       {
       
   737       *errorcodeptr = ERR2;
       
   738       break;
       
   739       }
       
   740 
       
   741 #ifndef EBCDIC  /* ASCII coding */
       
   742     if (c >= 'a' && c <= 'z') c -= 32;
       
   743     c ^= 0x40;
       
   744 #else           /* EBCDIC coding */
       
   745     if (c >= 'a' && c <= 'z') c += 64;
       
   746     c ^= 0xC0;
       
   747 #endif
       
   748     break;
       
   749 
       
   750     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
       
   751     other alphanumeric following \ is an error if PCRE_EXTRA was set;
       
   752     otherwise, for Perl compatibility, it is a literal. This code looks a bit
       
   753     odd, but there used to be some cases other than the default, and there may
       
   754     be again in future, so I haven't "optimized" it. */
       
   755 
       
   756     default:
       
   757     if ((options & PCRE_EXTRA) != 0) switch(c)
       
   758       {
       
   759       default:
       
   760       *errorcodeptr = ERR3;
       
   761       break;
       
   762       }
       
   763     break;
       
   764     }
       
   765   }
       
   766 
       
   767 *ptrptr = ptr;
       
   768 return c;
       
   769 }
       
   770 
       
   771 
       
   772 
       
   773 #ifdef SUPPORT_UCP
       
   774 /*************************************************
       
   775 *               Handle \P and \p                 *
       
   776 *************************************************/
       
   777 
       
   778 /* This function is called after \P or \p has been encountered, provided that
       
   779 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
       
   780 pointing at the P or p. On exit, it is pointing at the final character of the
       
   781 escape sequence.
       
   782 
       
   783 Argument:
       
   784   ptrptr         points to the pattern position pointer
       
   785   negptr         points to a boolean that is set TRUE for negation else FALSE
       
   786   dptr           points to an int that is set to the detailed property value
       
   787   errorcodeptr   points to the error code variable
       
   788 
       
   789 Returns:         type value from ucp_type_table, or -1 for an invalid type
       
   790 */
       
   791 
       
   792 static int
       
   793 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
       
   794 {
       
   795 int c, i, bot, top;
       
   796 const uschar *ptr = *ptrptr;
       
   797 char name[32];
       
   798 
       
   799 c = *(++ptr);
       
   800 if (c == 0) goto ERROR_RETURN;
       
   801 
       
   802 *negptr = FALSE;
       
   803 
       
   804 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
       
   805 negation. */
       
   806 
       
   807 if (c == '{')
       
   808   {
       
   809   if (ptr[1] == '^')
       
   810     {
       
   811     *negptr = TRUE;
       
   812     ptr++;
       
   813     }
       
   814   for (i = 0; i < (int)sizeof(name) - 1; i++)
       
   815     {
       
   816     c = *(++ptr);
       
   817     if (c == 0) goto ERROR_RETURN;
       
   818     if (c == '}') break;
       
   819     name[i] = c;
       
   820     }
       
   821   if (c !='}') goto ERROR_RETURN;
       
   822   name[i] = 0;
       
   823   }
       
   824 
       
   825 /* Otherwise there is just one following character */
       
   826 
       
   827 else
       
   828   {
       
   829   name[0] = c;
       
   830   name[1] = 0;
       
   831   }
       
   832 
       
   833 *ptrptr = ptr;
       
   834 
       
   835 /* Search for a recognized property name using binary chop */
       
   836 
       
   837 bot = 0;
       
   838 top = _pcre_utt_size;
       
   839 
       
   840 while (bot < top)
       
   841   {
       
   842   i = (bot + top) >> 1;
       
   843   c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
       
   844   if (c == 0)
       
   845     {
       
   846     *dptr = _pcre_utt[i].value;
       
   847     return _pcre_utt[i].type;
       
   848     }
       
   849   if (c > 0) bot = i + 1; else top = i;
       
   850   }
       
   851 
       
   852 *errorcodeptr = ERR47;
       
   853 *ptrptr = ptr;
       
   854 return -1;
       
   855 
       
   856 ERROR_RETURN:
       
   857 *errorcodeptr = ERR46;
       
   858 *ptrptr = ptr;
       
   859 return -1;
       
   860 }
       
   861 #endif
       
   862 
       
   863 
       
   864 
       
   865 
       
   866 /*************************************************
       
   867 *            Check for counted repeat            *
       
   868 *************************************************/
       
   869 
       
   870 /* This function is called when a '{' is encountered in a place where it might
       
   871 start a quantifier. It looks ahead to see if it really is a quantifier or not.
       
   872 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
       
   873 where the ddds are digits.
       
   874 
       
   875 Arguments:
       
   876   p         pointer to the first char after '{'
       
   877 
       
   878 Returns:    TRUE or FALSE
       
   879 */
       
   880 
       
   881 static BOOL
       
   882 is_counted_repeat(const uschar *p)
       
   883 {
       
   884 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
       
   885 while ((digitab[*p] & ctype_digit) != 0) p++;
       
   886 if (*p == '}') return TRUE;
       
   887 
       
   888 if (*p++ != ',') return FALSE;
       
   889 if (*p == '}') return TRUE;
       
   890 
       
   891 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
       
   892 while ((digitab[*p] & ctype_digit) != 0) p++;
       
   893 
       
   894 return (*p == '}');
       
   895 }
       
   896 
       
   897 
       
   898 
       
   899 /*************************************************
       
   900 *         Read repeat counts                     *
       
   901 *************************************************/
       
   902 
       
   903 /* Read an item of the form {n,m} and return the values. This is called only
       
   904 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
       
   905 so the syntax is guaranteed to be correct, but we need to check the values.
       
   906 
       
   907 Arguments:
       
   908   p              pointer to first char after '{'
       
   909   minp           pointer to int for min
       
   910   maxp           pointer to int for max
       
   911                  returned as -1 if no max
       
   912   errorcodeptr   points to error code variable
       
   913 
       
   914 Returns:         pointer to '}' on success;
       
   915                  current ptr on error, with errorcodeptr set non-zero
       
   916 */
       
   917 
       
   918 static const uschar *
       
   919 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
       
   920 {
       
   921 int min = 0;
       
   922 int max = -1;
       
   923 
       
   924 /* Read the minimum value and do a paranoid check: a negative value indicates
       
   925 an integer overflow. */
       
   926 
       
   927 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
       
   928 if (min < 0 || min > 65535)
       
   929   {
       
   930   *errorcodeptr = ERR5;
       
   931   return p;
       
   932   }
       
   933 
       
   934 /* Read the maximum value if there is one, and again do a paranoid on its size.
       
   935 Also, max must not be less than min. */
       
   936 
       
   937 if (*p == '}') max = min; else
       
   938   {
       
   939   if (*(++p) != '}')
       
   940     {
       
   941     max = 0;
       
   942     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
       
   943     if (max < 0 || max > 65535)
       
   944       {
       
   945       *errorcodeptr = ERR5;
       
   946       return p;
       
   947       }
       
   948     if (max < min)
       
   949       {
       
   950       *errorcodeptr = ERR4;
       
   951       return p;
       
   952       }
       
   953     }
       
   954   }
       
   955 
       
   956 /* Fill in the required variables, and pass back the pointer to the terminating
       
   957 '}'. */
       
   958 
       
   959 *minp = min;
       
   960 *maxp = max;
       
   961 return p;
       
   962 }
       
   963 
       
   964 
       
   965 
       
   966 /*************************************************
       
   967 *       Find forward referenced subpattern       *
       
   968 *************************************************/
       
   969 
       
   970 /* This function scans along a pattern's text looking for capturing
       
   971 subpatterns, and counting them. If it finds a named pattern that matches the
       
   972 name it is given, it returns its number. Alternatively, if the name is NULL, it
       
   973 returns when it reaches a given numbered subpattern. This is used for forward
       
   974 references to subpatterns. We know that if (?P< is encountered, the name will
       
   975 be terminated by '>' because that is checked in the first pass.
       
   976 
       
   977 Arguments:
       
   978   ptr          current position in the pattern
       
   979   cd           compile background data
       
   980   name         name to seek, or NULL if seeking a numbered subpattern
       
   981   lorn         name length, or subpattern number if name is NULL
       
   982   xmode        TRUE if we are in /x mode
       
   983 
       
   984 Returns:       the number of the named subpattern, or -1 if not found
       
   985 */
       
   986 
       
   987 static int
       
   988 find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,
       
   989   BOOL xmode)
       
   990 {
       
   991 const uschar *thisname;
       
   992 int count = cd->bracount;
       
   993 
       
   994 for (; *ptr != 0; ptr++)
       
   995   {
       
   996   int term;
       
   997 
       
   998   /* Skip over backslashed characters and also entire \Q...\E */
       
   999 
       
  1000   if (*ptr == '\\')
       
  1001     {
       
  1002     if (*(++ptr) == 0) return -1;
       
  1003     if (*ptr == 'Q') for (;;)
       
  1004       {
       
  1005       while (*(++ptr) != 0 && *ptr != '\\') {};
       
  1006       if (*ptr == 0) return -1;
       
  1007       if (*(++ptr) == 'E') break;
       
  1008       }
       
  1009     continue;
       
  1010     }
       
  1011 
       
  1012   /* Skip over character classes; this logic must be similar to the way they
       
  1013   are handled for real. If the first character is '^', skip it. Also, if the
       
  1014   first few characters (either before or after ^) are \Q\E or \E we skip them
       
  1015   too. This makes for compatibility with Perl. */
       
  1016 
       
  1017   if (*ptr == '[')
       
  1018     {
       
  1019     BOOL negate_class = FALSE;
       
  1020     for (;;)
       
  1021       {
       
  1022       int c = *(++ptr);
       
  1023       if (c == '\\')
       
  1024         {
       
  1025         if (ptr[1] == 'E') ptr++;
       
  1026           else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
       
  1027             else break;
       
  1028         }
       
  1029       else if (!negate_class && c == '^')
       
  1030         negate_class = TRUE;
       
  1031       else break;
       
  1032       }
       
  1033 
       
  1034     /* If the next character is ']', it is a data character that must be
       
  1035     skipped, except in JavaScript compatibility mode. */
       
  1036 
       
  1037     if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
       
  1038       ptr++;
       
  1039 
       
  1040     while (*(++ptr) != ']')
       
  1041       {
       
  1042       if (*ptr == 0) return -1;
       
  1043       if (*ptr == '\\')
       
  1044         {
       
  1045         if (*(++ptr) == 0) return -1;
       
  1046         if (*ptr == 'Q') for (;;)
       
  1047           {
       
  1048           while (*(++ptr) != 0 && *ptr != '\\') {};
       
  1049           if (*ptr == 0) return -1;
       
  1050           if (*(++ptr) == 'E') break;
       
  1051           }
       
  1052         continue;
       
  1053         }
       
  1054       }
       
  1055     continue;
       
  1056     }
       
  1057 
       
  1058   /* Skip comments in /x mode */
       
  1059 
       
  1060   if (xmode && *ptr == '#')
       
  1061     {
       
  1062     while (*(++ptr) != 0 && *ptr != '\n') {};
       
  1063     if (*ptr == 0) return -1;
       
  1064     continue;
       
  1065     }
       
  1066 
       
  1067   /* An opening parens must now be a real metacharacter */
       
  1068 
       
  1069   if (*ptr != '(') continue;
       
  1070   if (ptr[1] != '?' && ptr[1] != '*')
       
  1071     {
       
  1072     count++;
       
  1073     if (name == NULL && count == lorn) return count;
       
  1074     continue;
       
  1075     }
       
  1076 
       
  1077   ptr += 2;
       
  1078   if (*ptr == 'P') ptr++;                      /* Allow optional P */
       
  1079 
       
  1080   /* We have to disambiguate (?<! and (?<= from (?<name> */
       
  1081 
       
  1082   if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
       
  1083        *ptr != '\'')
       
  1084     continue;
       
  1085 
       
  1086   count++;
       
  1087 
       
  1088   if (name == NULL && count == lorn) return count;
       
  1089   term = *ptr++;
       
  1090   if (term == '<') term = '>';
       
  1091   thisname = ptr;
       
  1092   while (*ptr != term) ptr++;
       
  1093   if (name != NULL && lorn == ptr - thisname &&
       
  1094       strncmp((const char *)name, (const char *)thisname, lorn) == 0)
       
  1095     return count;
       
  1096   }
       
  1097 
       
  1098 return -1;
       
  1099 }
       
  1100 
       
  1101 
       
  1102 
       
  1103 /*************************************************
       
  1104 *      Find first significant op code            *
       
  1105 *************************************************/
       
  1106 
       
  1107 /* This is called by several functions that scan a compiled expression looking
       
  1108 for a fixed first character, or an anchoring op code etc. It skips over things
       
  1109 that do not influence this. For some calls, a change of option is important.
       
  1110 For some calls, it makes sense to skip negative forward and all backward
       
  1111 assertions, and also the \b assertion; for others it does not.
       
  1112 
       
  1113 Arguments:
       
  1114   code         pointer to the start of the group
       
  1115   options      pointer to external options
       
  1116   optbit       the option bit whose changing is significant, or
       
  1117                  zero if none are
       
  1118   skipassert   TRUE if certain assertions are to be skipped
       
  1119 
       
  1120 Returns:       pointer to the first significant opcode
       
  1121 */
       
  1122 
       
  1123 static const uschar*
       
  1124 first_significant_code(const uschar *code, int *options, int optbit,
       
  1125   BOOL skipassert)
       
  1126 {
       
  1127 for (;;)
       
  1128   {
       
  1129   switch ((int)*code)
       
  1130     {
       
  1131     case OP_OPT:
       
  1132     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
       
  1133       *options = (int)code[1];
       
  1134     code += 2;
       
  1135     break;
       
  1136 
       
  1137     case OP_ASSERT_NOT:
       
  1138     case OP_ASSERTBACK:
       
  1139     case OP_ASSERTBACK_NOT:
       
  1140     if (!skipassert) return code;
       
  1141     do code += GET(code, 1); while (*code == OP_ALT);
       
  1142     code += _pcre_OP_lengths[*code];
       
  1143     break;
       
  1144 
       
  1145     case OP_WORD_BOUNDARY:
       
  1146     case OP_NOT_WORD_BOUNDARY:
       
  1147     if (!skipassert) return code;
       
  1148     /* Fall through */
       
  1149 
       
  1150     case OP_CALLOUT:
       
  1151     case OP_CREF:
       
  1152     case OP_RREF:
       
  1153     case OP_DEF:
       
  1154     code += _pcre_OP_lengths[*code];
       
  1155     break;
       
  1156 
       
  1157     default:
       
  1158     return code;
       
  1159     }
       
  1160   }
       
  1161 /* Control never reaches here */
       
  1162 }
       
  1163 
       
  1164 
       
  1165 
       
  1166 
       
  1167 /*************************************************
       
  1168 *        Find the fixed length of a pattern      *
       
  1169 *************************************************/
       
  1170 
       
  1171 /* Scan a pattern and compute the fixed length of subject that will match it,
       
  1172 if the length is fixed. This is needed for dealing with backward assertions.
       
  1173 In UTF8 mode, the result is in characters rather than bytes.
       
  1174 
       
  1175 Arguments:
       
  1176   code     points to the start of the pattern (the bracket)
       
  1177   options  the compiling options
       
  1178 
       
  1179 Returns:   the fixed length, or -1 if there is no fixed length,
       
  1180              or -2 if \C was encountered
       
  1181 */
       
  1182 
       
  1183 static int
       
  1184 find_fixedlength(uschar *code, int options)
       
  1185 {
       
  1186 int length = -1;
       
  1187 
       
  1188 register int branchlength = 0;
       
  1189 register uschar *cc = code + 1 + LINK_SIZE;
       
  1190 
       
  1191 /* Scan along the opcodes for this branch. If we get to the end of the
       
  1192 branch, check the length against that of the other branches. */
       
  1193 
       
  1194 for (;;)
       
  1195   {
       
  1196   int d;
       
  1197   register int op = *cc;
       
  1198   switch (op)
       
  1199     {
       
  1200     case OP_CBRA:
       
  1201     case OP_BRA:
       
  1202     case OP_ONCE:
       
  1203     case OP_COND:
       
  1204     d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
       
  1205     if (d < 0) return d;
       
  1206     branchlength += d;
       
  1207     do cc += GET(cc, 1); while (*cc == OP_ALT);
       
  1208     cc += 1 + LINK_SIZE;
       
  1209     break;
       
  1210 
       
  1211     /* Reached end of a branch; if it's a ket it is the end of a nested
       
  1212     call. If it's ALT it is an alternation in a nested call. If it is
       
  1213     END it's the end of the outer call. All can be handled by the same code. */
       
  1214 
       
  1215     case OP_ALT:
       
  1216     case OP_KET:
       
  1217     case OP_KETRMAX:
       
  1218     case OP_KETRMIN:
       
  1219     case OP_END:
       
  1220     if (length < 0) length = branchlength;
       
  1221       else if (length != branchlength) return -1;
       
  1222     if (*cc != OP_ALT) return length;
       
  1223     cc += 1 + LINK_SIZE;
       
  1224     branchlength = 0;
       
  1225     break;
       
  1226 
       
  1227     /* Skip over assertive subpatterns */
       
  1228 
       
  1229     case OP_ASSERT:
       
  1230     case OP_ASSERT_NOT:
       
  1231     case OP_ASSERTBACK:
       
  1232     case OP_ASSERTBACK_NOT:
       
  1233     do cc += GET(cc, 1); while (*cc == OP_ALT);
       
  1234     /* Fall through */
       
  1235 
       
  1236     /* Skip over things that don't match chars */
       
  1237 
       
  1238     case OP_REVERSE:
       
  1239     case OP_CREF:
       
  1240     case OP_RREF:
       
  1241     case OP_DEF:
       
  1242     case OP_OPT:
       
  1243     case OP_CALLOUT:
       
  1244     case OP_SOD:
       
  1245     case OP_SOM:
       
  1246     case OP_EOD:
       
  1247     case OP_EODN:
       
  1248     case OP_CIRC:
       
  1249     case OP_DOLL:
       
  1250     case OP_NOT_WORD_BOUNDARY:
       
  1251     case OP_WORD_BOUNDARY:
       
  1252     cc += _pcre_OP_lengths[*cc];
       
  1253     break;
       
  1254 
       
  1255     /* Handle literal characters */
       
  1256 
       
  1257     case OP_CHAR:
       
  1258     case OP_CHARNC:
       
  1259     case OP_NOT:
       
  1260     branchlength++;
       
  1261     cc += 2;
       
  1262 #ifdef SUPPORT_UTF8
       
  1263     if ((options & PCRE_UTF8) != 0)
       
  1264       {
       
  1265       while ((*cc & 0xc0) == 0x80) cc++;
       
  1266       }
       
  1267 #endif
       
  1268     break;
       
  1269 
       
  1270     /* Handle exact repetitions. The count is already in characters, but we
       
  1271     need to skip over a multibyte character in UTF8 mode.  */
       
  1272 
       
  1273     case OP_EXACT:
       
  1274     branchlength += GET2(cc,1);
       
  1275     cc += 4;
       
  1276 #ifdef SUPPORT_UTF8
       
  1277     if ((options & PCRE_UTF8) != 0)
       
  1278       {
       
  1279       while((*cc & 0x80) == 0x80) cc++;
       
  1280       }
       
  1281 #endif
       
  1282     break;
       
  1283 
       
  1284     case OP_TYPEEXACT:
       
  1285     branchlength += GET2(cc,1);
       
  1286     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
       
  1287     cc += 4;
       
  1288     break;
       
  1289 
       
  1290     /* Handle single-char matchers */
       
  1291 
       
  1292     case OP_PROP:
       
  1293     case OP_NOTPROP:
       
  1294     cc += 2;
       
  1295     /* Fall through */
       
  1296 
       
  1297     case OP_NOT_DIGIT:
       
  1298     case OP_DIGIT:
       
  1299     case OP_NOT_WHITESPACE:
       
  1300     case OP_WHITESPACE:
       
  1301     case OP_NOT_WORDCHAR:
       
  1302     case OP_WORDCHAR:
       
  1303     case OP_ANY:
       
  1304     case OP_ALLANY:
       
  1305     branchlength++;
       
  1306     cc++;
       
  1307     break;
       
  1308 
       
  1309     /* The single-byte matcher isn't allowed */
       
  1310 
       
  1311     case OP_ANYBYTE:
       
  1312     return -2;
       
  1313 
       
  1314     /* Check a class for variable quantification */
       
  1315 
       
  1316 #ifdef SUPPORT_UTF8
       
  1317     case OP_XCLASS:
       
  1318     cc += GET(cc, 1) - 33;
       
  1319     /* Fall through */
       
  1320 #endif
       
  1321 
       
  1322     case OP_CLASS:
       
  1323     case OP_NCLASS:
       
  1324     cc += 33;
       
  1325 
       
  1326     switch (*cc)
       
  1327       {
       
  1328       case OP_CRSTAR:
       
  1329       case OP_CRMINSTAR:
       
  1330       case OP_CRQUERY:
       
  1331       case OP_CRMINQUERY:
       
  1332       return -1;
       
  1333 
       
  1334       case OP_CRRANGE:
       
  1335       case OP_CRMINRANGE:
       
  1336       if (GET2(cc,1) != GET2(cc,3)) return -1;
       
  1337       branchlength += GET2(cc,1);
       
  1338       cc += 5;
       
  1339       break;
       
  1340 
       
  1341       default:
       
  1342       branchlength++;
       
  1343       }
       
  1344     break;
       
  1345 
       
  1346     /* Anything else is variable length */
       
  1347 
       
  1348     default:
       
  1349     return -1;
       
  1350     }
       
  1351   }
       
  1352 /* Control never gets here */
       
  1353 }
       
  1354 
       
  1355 
       
  1356 
       
  1357 
       
  1358 /*************************************************
       
  1359 *    Scan compiled regex for numbered bracket    *
       
  1360 *************************************************/
       
  1361 
       
  1362 /* This little function scans through a compiled pattern until it finds a
       
  1363 capturing bracket with the given number.
       
  1364 
       
  1365 Arguments:
       
  1366   code        points to start of expression
       
  1367   utf8        TRUE in UTF-8 mode
       
  1368   number      the required bracket number
       
  1369 
       
  1370 Returns:      pointer to the opcode for the bracket, or NULL if not found
       
  1371 */
       
  1372 
       
  1373 static const uschar *
       
  1374 find_bracket(const uschar *code, BOOL utf8, int number)
       
  1375 {
       
  1376 for (;;)
       
  1377   {
       
  1378   register int c = *code;
       
  1379   if (c == OP_END) return NULL;
       
  1380 
       
  1381   /* XCLASS is used for classes that cannot be represented just by a bit
       
  1382   map. This includes negated single high-valued characters. The length in
       
  1383   the table is zero; the actual length is stored in the compiled code. */
       
  1384 
       
  1385   if (c == OP_XCLASS) code += GET(code, 1);
       
  1386 
       
  1387   /* Handle capturing bracket */
       
  1388 
       
  1389   else if (c == OP_CBRA)
       
  1390     {
       
  1391     int n = GET2(code, 1+LINK_SIZE);
       
  1392     if (n == number) return (uschar *)code;
       
  1393     code += _pcre_OP_lengths[c];
       
  1394     }
       
  1395 
       
  1396   /* Otherwise, we can get the item's length from the table, except that for
       
  1397   repeated character types, we have to test for \p and \P, which have an extra
       
  1398   two bytes of parameters. */
       
  1399 
       
  1400   else
       
  1401     {
       
  1402     switch(c)
       
  1403       {
       
  1404       case OP_TYPESTAR:
       
  1405       case OP_TYPEMINSTAR:
       
  1406       case OP_TYPEPLUS:
       
  1407       case OP_TYPEMINPLUS:
       
  1408       case OP_TYPEQUERY:
       
  1409       case OP_TYPEMINQUERY:
       
  1410       case OP_TYPEPOSSTAR:
       
  1411       case OP_TYPEPOSPLUS:
       
  1412       case OP_TYPEPOSQUERY:
       
  1413       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
       
  1414       break;
       
  1415 
       
  1416       case OP_TYPEUPTO:
       
  1417       case OP_TYPEMINUPTO:
       
  1418       case OP_TYPEEXACT:
       
  1419       case OP_TYPEPOSUPTO:
       
  1420       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
       
  1421       break;
       
  1422       }
       
  1423 
       
  1424     /* Add in the fixed length from the table */
       
  1425 
       
  1426     code += _pcre_OP_lengths[c];
       
  1427 
       
  1428   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
       
  1429   a multi-byte character. The length in the table is a minimum, so we have to
       
  1430   arrange to skip the extra bytes. */
       
  1431 
       
  1432 #ifdef SUPPORT_UTF8
       
  1433     if (utf8) switch(c)
       
  1434       {
       
  1435       case OP_CHAR:
       
  1436       case OP_CHARNC:
       
  1437       case OP_EXACT:
       
  1438       case OP_UPTO:
       
  1439       case OP_MINUPTO:
       
  1440       case OP_POSUPTO:
       
  1441       case OP_STAR:
       
  1442       case OP_MINSTAR:
       
  1443       case OP_POSSTAR:
       
  1444       case OP_PLUS:
       
  1445       case OP_MINPLUS:
       
  1446       case OP_POSPLUS:
       
  1447       case OP_QUERY:
       
  1448       case OP_MINQUERY:
       
  1449       case OP_POSQUERY:
       
  1450       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
       
  1451       break;
       
  1452       }
       
  1453 #else
       
  1454     (void)(utf8);  /* Keep compiler happy by referencing function argument */
       
  1455 #endif
       
  1456     }
       
  1457   }
       
  1458 }
       
  1459 
       
  1460 
       
  1461 
       
  1462 /*************************************************
       
  1463 *   Scan compiled regex for recursion reference  *
       
  1464 *************************************************/
       
  1465 
       
  1466 /* This little function scans through a compiled pattern until it finds an
       
  1467 instance of OP_RECURSE.
       
  1468 
       
  1469 Arguments:
       
  1470   code        points to start of expression
       
  1471   utf8        TRUE in UTF-8 mode
       
  1472 
       
  1473 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
       
  1474 */
       
  1475 
       
  1476 static const uschar *
       
  1477 find_recurse(const uschar *code, BOOL utf8)
       
  1478 {
       
  1479 for (;;)
       
  1480   {
       
  1481   register int c = *code;
       
  1482   if (c == OP_END) return NULL;
       
  1483   if (c == OP_RECURSE) return code;
       
  1484 
       
  1485   /* XCLASS is used for classes that cannot be represented just by a bit
       
  1486   map. This includes negated single high-valued characters. The length in
       
  1487   the table is zero; the actual length is stored in the compiled code. */
       
  1488 
       
  1489   if (c == OP_XCLASS) code += GET(code, 1);
       
  1490 
       
  1491   /* Otherwise, we can get the item's length from the table, except that for
       
  1492   repeated character types, we have to test for \p and \P, which have an extra
       
  1493   two bytes of parameters. */
       
  1494 
       
  1495   else
       
  1496     {
       
  1497     switch(c)
       
  1498       {
       
  1499       case OP_TYPESTAR:
       
  1500       case OP_TYPEMINSTAR:
       
  1501       case OP_TYPEPLUS:
       
  1502       case OP_TYPEMINPLUS:
       
  1503       case OP_TYPEQUERY:
       
  1504       case OP_TYPEMINQUERY:
       
  1505       case OP_TYPEPOSSTAR:
       
  1506       case OP_TYPEPOSPLUS:
       
  1507       case OP_TYPEPOSQUERY:
       
  1508       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
       
  1509       break;
       
  1510 
       
  1511       case OP_TYPEPOSUPTO:
       
  1512       case OP_TYPEUPTO:
       
  1513       case OP_TYPEMINUPTO:
       
  1514       case OP_TYPEEXACT:
       
  1515       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
       
  1516       break;
       
  1517       }
       
  1518 
       
  1519     /* Add in the fixed length from the table */
       
  1520 
       
  1521     code += _pcre_OP_lengths[c];
       
  1522 
       
  1523     /* In UTF-8 mode, opcodes that are followed by a character may be followed
       
  1524     by a multi-byte character. The length in the table is a minimum, so we have
       
  1525     to arrange to skip the extra bytes. */
       
  1526 
       
  1527 #ifdef SUPPORT_UTF8
       
  1528     if (utf8) switch(c)
       
  1529       {
       
  1530       case OP_CHAR:
       
  1531       case OP_CHARNC:
       
  1532       case OP_EXACT:
       
  1533       case OP_UPTO:
       
  1534       case OP_MINUPTO:
       
  1535       case OP_POSUPTO:
       
  1536       case OP_STAR:
       
  1537       case OP_MINSTAR:
       
  1538       case OP_POSSTAR:
       
  1539       case OP_PLUS:
       
  1540       case OP_MINPLUS:
       
  1541       case OP_POSPLUS:
       
  1542       case OP_QUERY:
       
  1543       case OP_MINQUERY:
       
  1544       case OP_POSQUERY:
       
  1545       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
       
  1546       break;
       
  1547       }
       
  1548 #else
       
  1549     (void)(utf8);  /* Keep compiler happy by referencing function argument */
       
  1550 #endif
       
  1551     }
       
  1552   }
       
  1553 }
       
  1554 
       
  1555 
       
  1556 
       
  1557 /*************************************************
       
  1558 *    Scan compiled branch for non-emptiness      *
       
  1559 *************************************************/
       
  1560 
       
  1561 /* This function scans through a branch of a compiled pattern to see whether it
       
  1562 can match the empty string or not. It is called from could_be_empty()
       
  1563 below and from compile_branch() when checking for an unlimited repeat of a
       
  1564 group that can match nothing. Note that first_significant_code() skips over
       
  1565 backward and negative forward assertions when its final argument is TRUE. If we
       
  1566 hit an unclosed bracket, we return "empty" - this means we've struck an inner
       
  1567 bracket whose current branch will already have been scanned.
       
  1568 
       
  1569 Arguments:
       
  1570   code        points to start of search
       
  1571   endcode     points to where to stop
       
  1572   utf8        TRUE if in UTF8 mode
       
  1573 
       
  1574 Returns:      TRUE if what is matched could be empty
       
  1575 */
       
  1576 
       
  1577 static BOOL
       
  1578 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
       
  1579 {
       
  1580 register int c;
       
  1581 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
       
  1582      code < endcode;
       
  1583      code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
       
  1584   {
       
  1585   const uschar *ccode;
       
  1586 
       
  1587   c = *code;
       
  1588 
       
  1589   /* Skip over forward assertions; the other assertions are skipped by
       
  1590   first_significant_code() with a TRUE final argument. */
       
  1591 
       
  1592   if (c == OP_ASSERT)
       
  1593     {
       
  1594     do code += GET(code, 1); while (*code == OP_ALT);
       
  1595     c = *code;
       
  1596     continue;
       
  1597     }
       
  1598 
       
  1599   /* Groups with zero repeats can of course be empty; skip them. */
       
  1600 
       
  1601   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
       
  1602     {
       
  1603     code += _pcre_OP_lengths[c];
       
  1604     do code += GET(code, 1); while (*code == OP_ALT);
       
  1605     c = *code;
       
  1606     continue;
       
  1607     }
       
  1608 
       
  1609   /* For other groups, scan the branches. */
       
  1610 
       
  1611   if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
       
  1612     {
       
  1613     BOOL empty_branch;
       
  1614     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
       
  1615 
       
  1616     /* Scan a closed bracket */
       
  1617 
       
  1618     empty_branch = FALSE;
       
  1619     do
       
  1620       {
       
  1621       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
       
  1622         empty_branch = TRUE;
       
  1623       code += GET(code, 1);
       
  1624       }
       
  1625     while (*code == OP_ALT);
       
  1626     if (!empty_branch) return FALSE;   /* All branches are non-empty */
       
  1627     c = *code;
       
  1628     continue;
       
  1629     }
       
  1630 
       
  1631   /* Handle the other opcodes */
       
  1632 
       
  1633   switch (c)
       
  1634     {
       
  1635     /* Check for quantifiers after a class. XCLASS is used for classes that
       
  1636     cannot be represented just by a bit map. This includes negated single
       
  1637     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
       
  1638     actual length is stored in the compiled code, so we must update "code"
       
  1639     here. */
       
  1640 
       
  1641 #ifdef SUPPORT_UTF8
       
  1642     case OP_XCLASS:
       
  1643     ccode = code += GET(code, 1);
       
  1644     goto CHECK_CLASS_REPEAT;
       
  1645 #endif
       
  1646 
       
  1647     case OP_CLASS:
       
  1648     case OP_NCLASS:
       
  1649     ccode = code + 33;
       
  1650 
       
  1651 #ifdef SUPPORT_UTF8
       
  1652     CHECK_CLASS_REPEAT:
       
  1653 #endif
       
  1654 
       
  1655     switch (*ccode)
       
  1656       {
       
  1657       case OP_CRSTAR:            /* These could be empty; continue */
       
  1658       case OP_CRMINSTAR:
       
  1659       case OP_CRQUERY:
       
  1660       case OP_CRMINQUERY:
       
  1661       break;
       
  1662 
       
  1663       default:                   /* Non-repeat => class must match */
       
  1664       case OP_CRPLUS:            /* These repeats aren't empty */
       
  1665       case OP_CRMINPLUS:
       
  1666       return FALSE;
       
  1667 
       
  1668       case OP_CRRANGE:
       
  1669       case OP_CRMINRANGE:
       
  1670       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
       
  1671       break;
       
  1672       }
       
  1673     break;
       
  1674 
       
  1675     /* Opcodes that must match a character */
       
  1676 
       
  1677     case OP_PROP:
       
  1678     case OP_NOTPROP:
       
  1679     case OP_EXTUNI:
       
  1680     case OP_NOT_DIGIT:
       
  1681     case OP_DIGIT:
       
  1682     case OP_NOT_WHITESPACE:
       
  1683     case OP_WHITESPACE:
       
  1684     case OP_NOT_WORDCHAR:
       
  1685     case OP_WORDCHAR:
       
  1686     case OP_ANY:
       
  1687     case OP_ALLANY:
       
  1688     case OP_ANYBYTE:
       
  1689     case OP_CHAR:
       
  1690     case OP_CHARNC:
       
  1691     case OP_NOT:
       
  1692     case OP_PLUS:
       
  1693     case OP_MINPLUS:
       
  1694     case OP_POSPLUS:
       
  1695     case OP_EXACT:
       
  1696     case OP_NOTPLUS:
       
  1697     case OP_NOTMINPLUS:
       
  1698     case OP_NOTPOSPLUS:
       
  1699     case OP_NOTEXACT:
       
  1700     case OP_TYPEPLUS:
       
  1701     case OP_TYPEMINPLUS:
       
  1702     case OP_TYPEPOSPLUS:
       
  1703     case OP_TYPEEXACT:
       
  1704     return FALSE;
       
  1705 
       
  1706     /* These are going to continue, as they may be empty, but we have to
       
  1707     fudge the length for the \p and \P cases. */
       
  1708 
       
  1709     case OP_TYPESTAR:
       
  1710     case OP_TYPEMINSTAR:
       
  1711     case OP_TYPEPOSSTAR:
       
  1712     case OP_TYPEQUERY:
       
  1713     case OP_TYPEMINQUERY:
       
  1714     case OP_TYPEPOSQUERY:
       
  1715     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
       
  1716     break;
       
  1717 
       
  1718     /* Same for these */
       
  1719 
       
  1720     case OP_TYPEUPTO:
       
  1721     case OP_TYPEMINUPTO:
       
  1722     case OP_TYPEPOSUPTO:
       
  1723     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
       
  1724     break;
       
  1725 
       
  1726     /* End of branch */
       
  1727 
       
  1728     case OP_KET:
       
  1729     case OP_KETRMAX:
       
  1730     case OP_KETRMIN:
       
  1731     case OP_ALT:
       
  1732     return TRUE;
       
  1733 
       
  1734     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
       
  1735     MINUPTO, and POSUPTO may be followed by a multibyte character */
       
  1736 
       
  1737 #ifdef SUPPORT_UTF8
       
  1738     case OP_STAR:
       
  1739     case OP_MINSTAR:
       
  1740     case OP_POSSTAR:
       
  1741     case OP_QUERY:
       
  1742     case OP_MINQUERY:
       
  1743     case OP_POSQUERY:
       
  1744     case OP_UPTO:
       
  1745     case OP_MINUPTO:
       
  1746     case OP_POSUPTO:
       
  1747     if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
       
  1748     break;
       
  1749 #endif
       
  1750     }
       
  1751   }
       
  1752 
       
  1753 return TRUE;
       
  1754 }
       
  1755 
       
  1756 
       
  1757 
       
  1758 /*************************************************
       
  1759 *    Scan compiled regex for non-emptiness       *
       
  1760 *************************************************/
       
  1761 
       
  1762 /* This function is called to check for left recursive calls. We want to check
       
  1763 the current branch of the current pattern to see if it could match the empty
       
  1764 string. If it could, we must look outwards for branches at other levels,
       
  1765 stopping when we pass beyond the bracket which is the subject of the recursion.
       
  1766 
       
  1767 Arguments:
       
  1768   code        points to start of the recursion
       
  1769   endcode     points to where to stop (current RECURSE item)
       
  1770   bcptr       points to the chain of current (unclosed) branch starts
       
  1771   utf8        TRUE if in UTF-8 mode
       
  1772 
       
  1773 Returns:      TRUE if what is matched could be empty
       
  1774 */
       
  1775 
       
  1776 static BOOL
       
  1777 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
       
  1778   BOOL utf8)
       
  1779 {
       
  1780 while (bcptr != NULL && bcptr->current >= code)
       
  1781   {
       
  1782   if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
       
  1783   bcptr = bcptr->outer;
       
  1784   }
       
  1785 return TRUE;
       
  1786 }
       
  1787 
       
  1788 
       
  1789 
       
  1790 /*************************************************
       
  1791 *           Check for POSIX class syntax         *
       
  1792 *************************************************/
       
  1793 
       
  1794 /* This function is called when the sequence "[:" or "[." or "[=" is
       
  1795 encountered in a character class. It checks whether this is followed by a
       
  1796 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
       
  1797 reach an unescaped ']' without the special preceding character, return FALSE.
       
  1798 
       
  1799 Originally, this function only recognized a sequence of letters between the
       
  1800 terminators, but it seems that Perl recognizes any sequence of characters,
       
  1801 though of course unknown POSIX names are subsequently rejected. Perl gives an
       
  1802 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
       
  1803 didn't consider this to be a POSIX class. Likewise for [:1234:].
       
  1804 
       
  1805 The problem in trying to be exactly like Perl is in the handling of escapes. We
       
  1806 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
       
  1807 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
       
  1808 below handles the special case of \], but does not try to do any other escape
       
  1809 processing. This makes it different from Perl for cases such as [:l\ower:]
       
  1810 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
       
  1811 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
       
  1812 I think.
       
  1813 
       
  1814 Arguments:
       
  1815   ptr      pointer to the initial [
       
  1816   endptr   where to return the end pointer
       
  1817 
       
  1818 Returns:   TRUE or FALSE
       
  1819 */
       
  1820 
       
  1821 static BOOL
       
  1822 check_posix_syntax(const uschar *ptr, const uschar **endptr)
       
  1823 {
       
  1824 int terminator;          /* Don't combine these lines; the Solaris cc */
       
  1825 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
       
  1826 for (++ptr; *ptr != 0; ptr++)
       
  1827   {
       
  1828   if (*ptr == '\\' && ptr[1] == ']') ptr++; else
       
  1829     {
       
  1830     if (*ptr == ']') return FALSE;
       
  1831     if (*ptr == terminator && ptr[1] == ']')
       
  1832       {
       
  1833       *endptr = ptr;
       
  1834       return TRUE;
       
  1835       }
       
  1836     }
       
  1837   }
       
  1838 return FALSE;
       
  1839 }
       
  1840 
       
  1841 
       
  1842 
       
  1843 
       
  1844 /*************************************************
       
  1845 *          Check POSIX class name                *
       
  1846 *************************************************/
       
  1847 
       
  1848 /* This function is called to check the name given in a POSIX-style class entry
       
  1849 such as [:alnum:].
       
  1850 
       
  1851 Arguments:
       
  1852   ptr        points to the first letter
       
  1853   len        the length of the name
       
  1854 
       
  1855 Returns:     a value representing the name, or -1 if unknown
       
  1856 */
       
  1857 
       
  1858 static int
       
  1859 check_posix_name(const uschar *ptr, int len)
       
  1860 {
       
  1861 const char *pn = posix_names;
       
  1862 register int yield = 0;
       
  1863 while (posix_name_lengths[yield] != 0)
       
  1864   {
       
  1865   if (len == posix_name_lengths[yield] &&
       
  1866     strncmp((const char *)ptr, pn, len) == 0) return yield;
       
  1867   pn += posix_name_lengths[yield] + 1;
       
  1868   yield++;
       
  1869   }
       
  1870 return -1;
       
  1871 }
       
  1872 
       
  1873 
       
  1874 /*************************************************
       
  1875 *    Adjust OP_RECURSE items in repeated group   *
       
  1876 *************************************************/
       
  1877 
       
  1878 /* OP_RECURSE items contain an offset from the start of the regex to the group
       
  1879 that is referenced. This means that groups can be replicated for fixed
       
  1880 repetition simply by copying (because the recursion is allowed to refer to
       
  1881 earlier groups that are outside the current group). However, when a group is
       
  1882 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
       
  1883 inserted before it, after it has been compiled. This means that any OP_RECURSE
       
  1884 items within it that refer to the group itself or any contained groups have to
       
  1885 have their offsets adjusted. That is one of the jobs of this function. Before it
       
  1886 is called, the partially compiled regex must be temporarily terminated with
       
  1887 OP_END.
       
  1888 
       
  1889 This function has been extended with the possibility of forward references for
       
  1890 recursions and subroutine calls. It must also check the list of such references
       
  1891 for the group we are dealing with. If it finds that one of the recursions in
       
  1892 the current group is on this list, it adjusts the offset in the list, not the
       
  1893 value in the reference (which is a group number).
       
  1894 
       
  1895 Arguments:
       
  1896   group      points to the start of the group
       
  1897   adjust     the amount by which the group is to be moved
       
  1898   utf8       TRUE in UTF-8 mode
       
  1899   cd         contains pointers to tables etc.
       
  1900   save_hwm   the hwm forward reference pointer at the start of the group
       
  1901 
       
  1902 Returns:     nothing
       
  1903 */
       
  1904 
       
  1905 static void
       
  1906 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
       
  1907   uschar *save_hwm)
       
  1908 {
       
  1909 uschar *ptr = group;
       
  1910 
       
  1911 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
       
  1912   {
       
  1913   int offset;
       
  1914   uschar *hc;
       
  1915 
       
  1916   /* See if this recursion is on the forward reference list. If so, adjust the
       
  1917   reference. */
       
  1918 
       
  1919   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
       
  1920     {
       
  1921     offset = GET(hc, 0);
       
  1922     if (cd->start_code + offset == ptr + 1)
       
  1923       {
       
  1924       PUT(hc, 0, offset + adjust);
       
  1925       break;
       
  1926       }
       
  1927     }
       
  1928 
       
  1929   /* Otherwise, adjust the recursion offset if it's after the start of this
       
  1930   group. */
       
  1931 
       
  1932   if (hc >= cd->hwm)
       
  1933     {
       
  1934     offset = GET(ptr, 1);
       
  1935     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
       
  1936     }
       
  1937 
       
  1938   ptr += 1 + LINK_SIZE;
       
  1939   }
       
  1940 }
       
  1941 
       
  1942 
       
  1943 
       
  1944 /*************************************************
       
  1945 *        Insert an automatic callout point       *
       
  1946 *************************************************/
       
  1947 
       
  1948 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
       
  1949 callout points before each pattern item.
       
  1950 
       
  1951 Arguments:
       
  1952   code           current code pointer
       
  1953   ptr            current pattern pointer
       
  1954   cd             pointers to tables etc
       
  1955 
       
  1956 Returns:         new code pointer
       
  1957 */
       
  1958 
       
  1959 static uschar *
       
  1960 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
       
  1961 {
       
  1962 *code++ = OP_CALLOUT;
       
  1963 *code++ = 255;
       
  1964 PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
       
  1965 PUT(code, LINK_SIZE, 0);                /* Default length */
       
  1966 return code + 2*LINK_SIZE;
       
  1967 }
       
  1968 
       
  1969 
       
  1970 
       
  1971 /*************************************************
       
  1972 *         Complete a callout item                *
       
  1973 *************************************************/
       
  1974 
       
  1975 /* A callout item contains the length of the next item in the pattern, which
       
  1976 we can't fill in till after we have reached the relevant point. This is used
       
  1977 for both automatic and manual callouts.
       
  1978 
       
  1979 Arguments:
       
  1980   previous_callout   points to previous callout item
       
  1981   ptr                current pattern pointer
       
  1982   cd                 pointers to tables etc
       
  1983 
       
  1984 Returns:             nothing
       
  1985 */
       
  1986 
       
  1987 static void
       
  1988 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
       
  1989 {
       
  1990 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
       
  1991 PUT(previous_callout, 2 + LINK_SIZE, length);
       
  1992 }
       
  1993 
       
  1994 
       
  1995 
       
  1996 #ifdef SUPPORT_UCP
       
  1997 /*************************************************
       
  1998 *           Get othercase range                  *
       
  1999 *************************************************/
       
  2000 
       
  2001 /* This function is passed the start and end of a class range, in UTF-8 mode
       
  2002 with UCP support. It searches up the characters, looking for internal ranges of
       
  2003 characters in the "other" case. Each call returns the next one, updating the
       
  2004 start address.
       
  2005 
       
  2006 Arguments:
       
  2007   cptr        points to starting character value; updated
       
  2008   d           end value
       
  2009   ocptr       where to put start of othercase range
       
  2010   odptr       where to put end of othercase range
       
  2011 
       
  2012 Yield:        TRUE when range returned; FALSE when no more
       
  2013 */
       
  2014 
       
  2015 static BOOL
       
  2016 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
       
  2017   unsigned int *odptr)
       
  2018 {
       
  2019 unsigned int c, othercase, next;
       
  2020 
       
  2021 for (c = *cptr; c <= d; c++)
       
  2022   { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
       
  2023 
       
  2024 if (c > d) return FALSE;
       
  2025 
       
  2026 *ocptr = othercase;
       
  2027 next = othercase + 1;
       
  2028 
       
  2029 for (++c; c <= d; c++)
       
  2030   {
       
  2031   if (UCD_OTHERCASE(c) != next) break;
       
  2032   next++;
       
  2033   }
       
  2034 
       
  2035 *odptr = next - 1;
       
  2036 *cptr = c;
       
  2037 
       
  2038 return TRUE;
       
  2039 }
       
  2040 #endif  /* SUPPORT_UCP */
       
  2041 
       
  2042 
       
  2043 
       
  2044 /*************************************************
       
  2045 *     Check if auto-possessifying is possible    *
       
  2046 *************************************************/
       
  2047 
       
  2048 /* This function is called for unlimited repeats of certain items, to see
       
  2049 whether the next thing could possibly match the repeated item. If not, it makes
       
  2050 sense to automatically possessify the repeated item.
       
  2051 
       
  2052 Arguments:
       
  2053   op_code       the repeated op code
       
  2054   item          data for this item, depends on the opcode
       
  2055   utf8          TRUE in UTF-8 mode
       
  2056   utf8_char     used for utf8 character bytes, NULL if not relevant
       
  2057   ptr           next character in pattern
       
  2058   options       options bits
       
  2059   cd            contains pointers to tables etc.
       
  2060 
       
  2061 Returns:        TRUE if possessifying is wanted
       
  2062 */
       
  2063 
       
  2064 static BOOL
       
  2065 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
       
  2066   const uschar *ptr, int options, compile_data *cd)
       
  2067 {
       
  2068 int next;
       
  2069 
       
  2070 /* Skip whitespace and comments in extended mode */
       
  2071 
       
  2072 if ((options & PCRE_EXTENDED) != 0)
       
  2073   {
       
  2074   for (;;)
       
  2075     {
       
  2076     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
       
  2077     if (*ptr == '#')
       
  2078       {
       
  2079       while (*(++ptr) != 0)
       
  2080         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
       
  2081       }
       
  2082     else break;
       
  2083     }
       
  2084   }
       
  2085 
       
  2086 /* If the next item is one that we can handle, get its value. A non-negative
       
  2087 value is a character, a negative value is an escape value. */
       
  2088 
       
  2089 if (*ptr == '\\')
       
  2090   {
       
  2091   int temperrorcode = 0;
       
  2092   next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
       
  2093   if (temperrorcode != 0) return FALSE;
       
  2094   ptr++;    /* Point after the escape sequence */
       
  2095   }
       
  2096 
       
  2097 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
       
  2098   {
       
  2099 #ifdef SUPPORT_UTF8
       
  2100   if (utf8) { GETCHARINC(next, ptr); } else
       
  2101 #endif
       
  2102   next = *ptr++;
       
  2103   }
       
  2104 
       
  2105 else return FALSE;
       
  2106 
       
  2107 /* Skip whitespace and comments in extended mode */
       
  2108 
       
  2109 if ((options & PCRE_EXTENDED) != 0)
       
  2110   {
       
  2111   for (;;)
       
  2112     {
       
  2113     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
       
  2114     if (*ptr == '#')
       
  2115       {
       
  2116       while (*(++ptr) != 0)
       
  2117         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
       
  2118       }
       
  2119     else break;
       
  2120     }
       
  2121   }
       
  2122 
       
  2123 /* If the next thing is itself optional, we have to give up. */
       
  2124 
       
  2125 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
       
  2126   return FALSE;
       
  2127 
       
  2128 /* Now compare the next item with the previous opcode. If the previous is a
       
  2129 positive single character match, "item" either contains the character or, if
       
  2130 "item" is greater than 127 in utf8 mode, the character's bytes are in
       
  2131 utf8_char. */
       
  2132 
       
  2133 
       
  2134 /* Handle cases when the next item is a character. */
       
  2135 
       
  2136 if (next >= 0) switch(op_code)
       
  2137   {
       
  2138   case OP_CHAR:
       
  2139 #ifdef SUPPORT_UTF8
       
  2140   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
       
  2141 #else
       
  2142   (void)(utf8_char);  /* Keep compiler happy by referencing function argument */
       
  2143 #endif
       
  2144   return item != next;
       
  2145 
       
  2146   /* For CHARNC (caseless character) we must check the other case. If we have
       
  2147   Unicode property support, we can use it to test the other case of
       
  2148   high-valued characters. */
       
  2149 
       
  2150   case OP_CHARNC:
       
  2151 #ifdef SUPPORT_UTF8
       
  2152   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
       
  2153 #endif
       
  2154   if (item == next) return FALSE;
       
  2155 #ifdef SUPPORT_UTF8
       
  2156   if (utf8)
       
  2157     {
       
  2158     unsigned int othercase;
       
  2159     if (next < 128) othercase = cd->fcc[next]; else
       
  2160 #ifdef SUPPORT_UCP
       
  2161     othercase = UCD_OTHERCASE((unsigned int)next);
       
  2162 #else
       
  2163     othercase = NOTACHAR;
       
  2164 #endif
       
  2165     return (unsigned int)item != othercase;
       
  2166     }
       
  2167   else
       
  2168 #endif  /* SUPPORT_UTF8 */
       
  2169   return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
       
  2170 
       
  2171   /* For OP_NOT, "item" must be a single-byte character. */
       
  2172 
       
  2173   case OP_NOT:
       
  2174   if (item == next) return TRUE;
       
  2175   if ((options & PCRE_CASELESS) == 0) return FALSE;
       
  2176 #ifdef SUPPORT_UTF8
       
  2177   if (utf8)
       
  2178     {
       
  2179     unsigned int othercase;
       
  2180     if (next < 128) othercase = cd->fcc[next]; else
       
  2181 #ifdef SUPPORT_UCP
       
  2182     othercase = UCD_OTHERCASE(next);
       
  2183 #else
       
  2184     othercase = NOTACHAR;
       
  2185 #endif
       
  2186     return (unsigned int)item == othercase;
       
  2187     }
       
  2188   else
       
  2189 #endif  /* SUPPORT_UTF8 */
       
  2190   return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
       
  2191 
       
  2192   case OP_DIGIT:
       
  2193   return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
       
  2194 
       
  2195   case OP_NOT_DIGIT:
       
  2196   return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
       
  2197 
       
  2198   case OP_WHITESPACE:
       
  2199   return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
       
  2200 
       
  2201   case OP_NOT_WHITESPACE:
       
  2202   return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
       
  2203 
       
  2204   case OP_WORDCHAR:
       
  2205   return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
       
  2206 
       
  2207   case OP_NOT_WORDCHAR:
       
  2208   return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
       
  2209 
       
  2210   case OP_HSPACE:
       
  2211   case OP_NOT_HSPACE:
       
  2212   switch(next)
       
  2213     {
       
  2214     case 0x09:
       
  2215     case 0x20:
       
  2216     case 0xa0:
       
  2217     case 0x1680:
       
  2218     case 0x180e:
       
  2219     case 0x2000:
       
  2220     case 0x2001:
       
  2221     case 0x2002:
       
  2222     case 0x2003:
       
  2223     case 0x2004:
       
  2224     case 0x2005:
       
  2225     case 0x2006:
       
  2226     case 0x2007:
       
  2227     case 0x2008:
       
  2228     case 0x2009:
       
  2229     case 0x200A:
       
  2230     case 0x202f:
       
  2231     case 0x205f:
       
  2232     case 0x3000:
       
  2233     return op_code != OP_HSPACE;
       
  2234     default:
       
  2235     return op_code == OP_HSPACE;
       
  2236     }
       
  2237 
       
  2238   case OP_VSPACE:
       
  2239   case OP_NOT_VSPACE:
       
  2240   switch(next)
       
  2241     {
       
  2242     case 0x0a:
       
  2243     case 0x0b:
       
  2244     case 0x0c:
       
  2245     case 0x0d:
       
  2246     case 0x85:
       
  2247     case 0x2028:
       
  2248     case 0x2029:
       
  2249     return op_code != OP_VSPACE;
       
  2250     default:
       
  2251     return op_code == OP_VSPACE;
       
  2252     }
       
  2253 
       
  2254   default:
       
  2255   return FALSE;
       
  2256   }
       
  2257 
       
  2258 
       
  2259 /* Handle the case when the next item is \d, \s, etc. */
       
  2260 
       
  2261 switch(op_code)
       
  2262   {
       
  2263   case OP_CHAR:
       
  2264   case OP_CHARNC:
       
  2265 #ifdef SUPPORT_UTF8
       
  2266   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
       
  2267 #endif
       
  2268   switch(-next)
       
  2269     {
       
  2270     case ESC_d:
       
  2271     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
       
  2272 
       
  2273     case ESC_D:
       
  2274     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
       
  2275 
       
  2276     case ESC_s:
       
  2277     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
       
  2278 
       
  2279     case ESC_S:
       
  2280     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
       
  2281 
       
  2282     case ESC_w:
       
  2283     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
       
  2284 
       
  2285     case ESC_W:
       
  2286     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
       
  2287 
       
  2288     case ESC_h:
       
  2289     case ESC_H:
       
  2290     switch(item)
       
  2291       {
       
  2292       case 0x09:
       
  2293       case 0x20:
       
  2294       case 0xa0:
       
  2295       case 0x1680:
       
  2296       case 0x180e:
       
  2297       case 0x2000:
       
  2298       case 0x2001:
       
  2299       case 0x2002:
       
  2300       case 0x2003:
       
  2301       case 0x2004:
       
  2302       case 0x2005:
       
  2303       case 0x2006:
       
  2304       case 0x2007:
       
  2305       case 0x2008:
       
  2306       case 0x2009:
       
  2307       case 0x200A:
       
  2308       case 0x202f:
       
  2309       case 0x205f:
       
  2310       case 0x3000:
       
  2311       return -next != ESC_h;
       
  2312       default:
       
  2313       return -next == ESC_h;
       
  2314       }
       
  2315 
       
  2316     case ESC_v:
       
  2317     case ESC_V:
       
  2318     switch(item)
       
  2319       {
       
  2320       case 0x0a:
       
  2321       case 0x0b:
       
  2322       case 0x0c:
       
  2323       case 0x0d:
       
  2324       case 0x85:
       
  2325       case 0x2028:
       
  2326       case 0x2029:
       
  2327       return -next != ESC_v;
       
  2328       default:
       
  2329       return -next == ESC_v;
       
  2330       }
       
  2331 
       
  2332     default:
       
  2333     return FALSE;
       
  2334     }
       
  2335 
       
  2336   case OP_DIGIT:
       
  2337   return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
       
  2338          next == -ESC_h || next == -ESC_v;
       
  2339 
       
  2340   case OP_NOT_DIGIT:
       
  2341   return next == -ESC_d;
       
  2342 
       
  2343   case OP_WHITESPACE:
       
  2344   return next == -ESC_S || next == -ESC_d || next == -ESC_w;
       
  2345 
       
  2346   case OP_NOT_WHITESPACE:
       
  2347   return next == -ESC_s || next == -ESC_h || next == -ESC_v;
       
  2348 
       
  2349   case OP_HSPACE:
       
  2350   return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
       
  2351 
       
  2352   case OP_NOT_HSPACE:
       
  2353   return next == -ESC_h;
       
  2354 
       
  2355   /* Can't have \S in here because VT matches \S (Perl anomaly) */
       
  2356   case OP_VSPACE:
       
  2357   return next == -ESC_V || next == -ESC_d || next == -ESC_w;
       
  2358 
       
  2359   case OP_NOT_VSPACE:
       
  2360   return next == -ESC_v;
       
  2361 
       
  2362   case OP_WORDCHAR:
       
  2363   return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
       
  2364 
       
  2365   case OP_NOT_WORDCHAR:
       
  2366   return next == -ESC_w || next == -ESC_d;
       
  2367 
       
  2368   default:
       
  2369   return FALSE;
       
  2370   }
       
  2371 
       
  2372 /* Control does not reach here */
       
  2373 }
       
  2374 
       
  2375 
       
  2376 
       
  2377 /*************************************************
       
  2378 *           Compile one branch                   *
       
  2379 *************************************************/
       
  2380 
       
  2381 /* Scan the pattern, compiling it into a vector. If the options are
       
  2382 changed during the branch, the pointer is used to change the external options
       
  2383 bits. This function is used during the pre-compile phase when we are trying
       
  2384 to find out the amount of memory needed, as well as during the real compile
       
  2385 phase. The value of lengthptr distinguishes the two phases.
       
  2386 
       
  2387 Arguments:
       
  2388   optionsptr     pointer to the option bits
       
  2389   codeptr        points to the pointer to the current code point
       
  2390   ptrptr         points to the current pattern pointer
       
  2391   errorcodeptr   points to error code variable
       
  2392   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
       
  2393   reqbyteptr     set to the last literal character required, else < 0
       
  2394   bcptr          points to current branch chain
       
  2395   cd             contains pointers to tables etc.
       
  2396   lengthptr      NULL during the real compile phase
       
  2397                  points to length accumulator during pre-compile phase
       
  2398 
       
  2399 Returns:         TRUE on success
       
  2400                  FALSE, with *errorcodeptr set non-zero on error
       
  2401 */
       
  2402 
       
  2403 static BOOL
       
  2404 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
       
  2405   int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
       
  2406   compile_data *cd, int *lengthptr)
       
  2407 {
       
  2408 int repeat_type, op_type;
       
  2409 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
       
  2410 int bravalue = 0;
       
  2411 int greedy_default, greedy_non_default;
       
  2412 int firstbyte, reqbyte;
       
  2413 int zeroreqbyte, zerofirstbyte;
       
  2414 int req_caseopt, reqvary, tempreqvary;
       
  2415 int options = *optionsptr;
       
  2416 int after_manual_callout = 0;
       
  2417 int length_prevgroup = 0;
       
  2418 register int c;
       
  2419 register uschar *code = *codeptr;
       
  2420 uschar *last_code = code;
       
  2421 uschar *orig_code = code;
       
  2422 uschar *tempcode;
       
  2423 BOOL inescq = FALSE;
       
  2424 BOOL groupsetfirstbyte = FALSE;
       
  2425 const uschar *ptr = *ptrptr;
       
  2426 const uschar *tempptr;
       
  2427 uschar *previous = NULL;
       
  2428 uschar *previous_callout = NULL;
       
  2429 uschar *save_hwm = NULL;
       
  2430 uschar classbits[32];
       
  2431 
       
  2432 #ifdef SUPPORT_UTF8
       
  2433 BOOL class_utf8;
       
  2434 BOOL utf8 = (options & PCRE_UTF8) != 0;
       
  2435 uschar *class_utf8data;
       
  2436 uschar *class_utf8data_base;
       
  2437 uschar utf8_char[6];
       
  2438 #else
       
  2439 BOOL utf8 = FALSE;
       
  2440 uschar *utf8_char = NULL;
       
  2441 #endif
       
  2442 
       
  2443 #ifdef DEBUG
       
  2444 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
       
  2445 #endif
       
  2446 
       
  2447 /* Set up the default and non-default settings for greediness */
       
  2448 
       
  2449 greedy_default = ((options & PCRE_UNGREEDY) != 0);
       
  2450 greedy_non_default = greedy_default ^ 1;
       
  2451 
       
  2452 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
       
  2453 matching encountered yet". It gets changed to REQ_NONE if we hit something that
       
  2454 matches a non-fixed char first char; reqbyte just remains unset if we never
       
  2455 find one.
       
  2456 
       
  2457 When we hit a repeat whose minimum is zero, we may have to adjust these values
       
  2458 to take the zero repeat into account. This is implemented by setting them to
       
  2459 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
       
  2460 item types that can be repeated set these backoff variables appropriately. */
       
  2461 
       
  2462 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
       
  2463 
       
  2464 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
       
  2465 according to the current setting of the caseless flag. REQ_CASELESS is a bit
       
  2466 value > 255. It is added into the firstbyte or reqbyte variables to record the
       
  2467 case status of the value. This is used only for ASCII characters. */
       
  2468 
       
  2469 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
       
  2470 
       
  2471 /* Switch on next character until the end of the branch */
       
  2472 
       
  2473 for (;; ptr++)
       
  2474   {
       
  2475   BOOL negate_class;
       
  2476   BOOL should_flip_negation;
       
  2477   BOOL possessive_quantifier;
       
  2478   BOOL is_quantifier;
       
  2479   BOOL is_recurse;
       
  2480   BOOL reset_bracount;
       
  2481   int class_charcount;
       
  2482   int class_lastchar;
       
  2483   int newoptions;
       
  2484   int recno;
       
  2485   int refsign;
       
  2486   int skipbytes;
       
  2487   int subreqbyte;
       
  2488   int subfirstbyte;
       
  2489   int terminator;
       
  2490   int mclength;
       
  2491   uschar mcbuffer[8];
       
  2492 
       
  2493   /* Get next byte in the pattern */
       
  2494 
       
  2495   c = *ptr;
       
  2496 
       
  2497   /* If we are in the pre-compile phase, accumulate the length used for the
       
  2498   previous cycle of this loop. */
       
  2499 
       
  2500   if (lengthptr != NULL)
       
  2501     {
       
  2502 #ifdef DEBUG
       
  2503     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
       
  2504 #endif
       
  2505     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
       
  2506       {
       
  2507       *errorcodeptr = ERR52;
       
  2508       goto FAILED;
       
  2509       }
       
  2510 
       
  2511     /* There is at least one situation where code goes backwards: this is the
       
  2512     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
       
  2513     the class is simply eliminated. However, it is created first, so we have to
       
  2514     allow memory for it. Therefore, don't ever reduce the length at this point.
       
  2515     */
       
  2516 
       
  2517     if (code < last_code) code = last_code;
       
  2518 
       
  2519     /* Paranoid check for integer overflow */
       
  2520 
       
  2521     if (OFLOW_MAX - *lengthptr < code - last_code)
       
  2522       {
       
  2523       *errorcodeptr = ERR20;
       
  2524       goto FAILED;
       
  2525       }
       
  2526 
       
  2527     *lengthptr += code - last_code;
       
  2528     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
       
  2529 
       
  2530     /* If "previous" is set and it is not at the start of the work space, move
       
  2531     it back to there, in order to avoid filling up the work space. Otherwise,
       
  2532     if "previous" is NULL, reset the current code pointer to the start. */
       
  2533 
       
  2534     if (previous != NULL)
       
  2535       {
       
  2536       if (previous > orig_code)
       
  2537         {
       
  2538         memmove(orig_code, previous, code - previous);
       
  2539         code -= previous - orig_code;
       
  2540         previous = orig_code;
       
  2541         }
       
  2542       }
       
  2543     else code = orig_code;
       
  2544 
       
  2545     /* Remember where this code item starts so we can pick up the length
       
  2546     next time round. */
       
  2547 
       
  2548     last_code = code;
       
  2549     }
       
  2550 
       
  2551   /* In the real compile phase, just check the workspace used by the forward
       
  2552   reference list. */
       
  2553 
       
  2554   else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
       
  2555     {
       
  2556     *errorcodeptr = ERR52;
       
  2557     goto FAILED;
       
  2558     }
       
  2559 
       
  2560   /* If in \Q...\E, check for the end; if not, we have a literal */
       
  2561 
       
  2562   if (inescq && c != 0)
       
  2563     {
       
  2564     if (c == '\\' && ptr[1] == 'E')
       
  2565       {
       
  2566       inescq = FALSE;
       
  2567       ptr++;
       
  2568       continue;
       
  2569       }
       
  2570     else
       
  2571       {
       
  2572       if (previous_callout != NULL)
       
  2573         {
       
  2574         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
       
  2575           complete_callout(previous_callout, ptr, cd);
       
  2576         previous_callout = NULL;
       
  2577         }
       
  2578       if ((options & PCRE_AUTO_CALLOUT) != 0)
       
  2579         {
       
  2580         previous_callout = code;
       
  2581         code = auto_callout(code, ptr, cd);
       
  2582         }
       
  2583       goto NORMAL_CHAR;
       
  2584       }
       
  2585     }
       
  2586 
       
  2587   /* Fill in length of a previous callout, except when the next thing is
       
  2588   a quantifier. */
       
  2589 
       
  2590   is_quantifier = c == '*' || c == '+' || c == '?' ||
       
  2591     (c == '{' && is_counted_repeat(ptr+1));
       
  2592 
       
  2593   if (!is_quantifier && previous_callout != NULL &&
       
  2594        after_manual_callout-- <= 0)
       
  2595     {
       
  2596     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
       
  2597       complete_callout(previous_callout, ptr, cd);
       
  2598     previous_callout = NULL;
       
  2599     }
       
  2600 
       
  2601   /* In extended mode, skip white space and comments */
       
  2602 
       
  2603   if ((options & PCRE_EXTENDED) != 0)
       
  2604     {
       
  2605     if ((cd->ctypes[c] & ctype_space) != 0) continue;
       
  2606     if (c == '#')
       
  2607       {
       
  2608       while (*(++ptr) != 0)
       
  2609         {
       
  2610         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
       
  2611         }
       
  2612       if (*ptr != 0) continue;
       
  2613 
       
  2614       /* Else fall through to handle end of string */
       
  2615       c = 0;
       
  2616       }
       
  2617     }
       
  2618 
       
  2619   /* No auto callout for quantifiers. */
       
  2620 
       
  2621   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
       
  2622     {
       
  2623     previous_callout = code;
       
  2624     code = auto_callout(code, ptr, cd);
       
  2625     }
       
  2626 
       
  2627   switch(c)
       
  2628     {
       
  2629     /* ===================================================================*/
       
  2630     case 0:                        /* The branch terminates at string end */
       
  2631     case '|':                      /* or | or ) */
       
  2632     case ')':
       
  2633     *firstbyteptr = firstbyte;
       
  2634     *reqbyteptr = reqbyte;
       
  2635     *codeptr = code;
       
  2636     *ptrptr = ptr;
       
  2637     if (lengthptr != NULL)
       
  2638       {
       
  2639       if (OFLOW_MAX - *lengthptr < code - last_code)
       
  2640         {
       
  2641         *errorcodeptr = ERR20;
       
  2642         goto FAILED;
       
  2643         }
       
  2644       *lengthptr += code - last_code;   /* To include callout length */
       
  2645       DPRINTF((">> end branch\n"));
       
  2646       }
       
  2647     return TRUE;
       
  2648 
       
  2649 
       
  2650     /* ===================================================================*/
       
  2651     /* Handle single-character metacharacters. In multiline mode, ^ disables
       
  2652     the setting of any following char as a first character. */
       
  2653 
       
  2654     case '^':
       
  2655     if ((options & PCRE_MULTILINE) != 0)
       
  2656       {
       
  2657       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
       
  2658       }
       
  2659     previous = NULL;
       
  2660     *code++ = OP_CIRC;
       
  2661     break;
       
  2662 
       
  2663     case '$':
       
  2664     previous = NULL;
       
  2665     *code++ = OP_DOLL;
       
  2666     break;
       
  2667 
       
  2668     /* There can never be a first char if '.' is first, whatever happens about
       
  2669     repeats. The value of reqbyte doesn't change either. */
       
  2670 
       
  2671     case '.':
       
  2672     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
       
  2673     zerofirstbyte = firstbyte;
       
  2674     zeroreqbyte = reqbyte;
       
  2675     previous = code;
       
  2676     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
       
  2677     break;
       
  2678 
       
  2679 
       
  2680     /* ===================================================================*/
       
  2681     /* Character classes. If the included characters are all < 256, we build a
       
  2682     32-byte bitmap of the permitted characters, except in the special case
       
  2683     where there is only one such character. For negated classes, we build the
       
  2684     map as usual, then invert it at the end. However, we use a different opcode
       
  2685     so that data characters > 255 can be handled correctly.
       
  2686 
       
  2687     If the class contains characters outside the 0-255 range, a different
       
  2688     opcode is compiled. It may optionally have a bit map for characters < 256,
       
  2689     but those above are explicitly listed afterwards. A flag byte tells
       
  2690     whether the bitmap is present, and whether this is a negated class or not.
       
  2691 
       
  2692     In JavaScript compatibility mode, an isolated ']' causes an error. In
       
  2693     default (Perl) mode, it is treated as a data character. */
       
  2694 
       
  2695     case ']':
       
  2696     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
       
  2697       {
       
  2698       *errorcodeptr = ERR64;
       
  2699       goto FAILED;
       
  2700       }
       
  2701     goto NORMAL_CHAR;
       
  2702 
       
  2703     case '[':
       
  2704     previous = code;
       
  2705 
       
  2706     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
       
  2707     they are encountered at the top level, so we'll do that too. */
       
  2708 
       
  2709     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
       
  2710         check_posix_syntax(ptr, &tempptr))
       
  2711       {
       
  2712       *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
       
  2713       goto FAILED;
       
  2714       }
       
  2715 
       
  2716     /* If the first character is '^', set the negation flag and skip it. Also,
       
  2717     if the first few characters (either before or after ^) are \Q\E or \E we
       
  2718     skip them too. This makes for compatibility with Perl. */
       
  2719 
       
  2720     negate_class = FALSE;
       
  2721     for (;;)
       
  2722       {
       
  2723       c = *(++ptr);
       
  2724       if (c == '\\')
       
  2725         {
       
  2726         if (ptr[1] == 'E') ptr++;
       
  2727           else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
       
  2728             else break;
       
  2729         }
       
  2730       else if (!negate_class && c == '^')
       
  2731         negate_class = TRUE;
       
  2732       else break;
       
  2733       }
       
  2734 
       
  2735     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
       
  2736     an initial ']' is taken as a data character -- the code below handles
       
  2737     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
       
  2738     [^] must match any character, so generate OP_ALLANY. */
       
  2739 
       
  2740     if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
       
  2741       {
       
  2742       *code++ = negate_class? OP_ALLANY : OP_FAIL;
       
  2743       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
       
  2744       zerofirstbyte = firstbyte;
       
  2745       break;
       
  2746       }
       
  2747 
       
  2748     /* If a class contains a negative special such as \S, we need to flip the
       
  2749     negation flag at the end, so that support for characters > 255 works
       
  2750     correctly (they are all included in the class). */
       
  2751 
       
  2752     should_flip_negation = FALSE;
       
  2753 
       
  2754     /* Keep a count of chars with values < 256 so that we can optimize the case
       
  2755     of just a single character (as long as it's < 256). However, For higher
       
  2756     valued UTF-8 characters, we don't yet do any optimization. */
       
  2757 
       
  2758     class_charcount = 0;
       
  2759     class_lastchar = -1;
       
  2760 
       
  2761     /* Initialize the 32-char bit map to all zeros. We build the map in a
       
  2762     temporary bit of memory, in case the class contains only 1 character (less
       
  2763     than 256), because in that case the compiled code doesn't use the bit map.
       
  2764     */
       
  2765 
       
  2766     memset(classbits, 0, 32 * sizeof(uschar));
       
  2767 
       
  2768 #ifdef SUPPORT_UTF8
       
  2769     class_utf8 = FALSE;                       /* No chars >= 256 */
       
  2770     class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
       
  2771     class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
       
  2772 #endif
       
  2773 
       
  2774     /* Process characters until ] is reached. By writing this as a "do" it
       
  2775     means that an initial ] is taken as a data character. At the start of the
       
  2776     loop, c contains the first byte of the character. */
       
  2777 
       
  2778     if (c != 0) do
       
  2779       {
       
  2780       const uschar *oldptr;
       
  2781 
       
  2782 #ifdef SUPPORT_UTF8
       
  2783       if (utf8 && c > 127)
       
  2784         {                           /* Braces are required because the */
       
  2785         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
       
  2786         }
       
  2787 
       
  2788       /* In the pre-compile phase, accumulate the length of any UTF-8 extra
       
  2789       data and reset the pointer. This is so that very large classes that
       
  2790       contain a zillion UTF-8 characters no longer overwrite the work space
       
  2791       (which is on the stack). */
       
  2792 
       
  2793       if (lengthptr != NULL)
       
  2794         {
       
  2795         *lengthptr += class_utf8data - class_utf8data_base;
       
  2796         class_utf8data = class_utf8data_base;
       
  2797         }
       
  2798 
       
  2799 #endif
       
  2800 
       
  2801       /* Inside \Q...\E everything is literal except \E */
       
  2802 
       
  2803       if (inescq)
       
  2804         {
       
  2805         if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
       
  2806           {
       
  2807           inescq = FALSE;                   /* Reset literal state */
       
  2808           ptr++;                            /* Skip the 'E' */
       
  2809           continue;                         /* Carry on with next */
       
  2810           }
       
  2811         goto CHECK_RANGE;                   /* Could be range if \E follows */
       
  2812         }
       
  2813 
       
  2814       /* Handle POSIX class names. Perl allows a negation extension of the
       
  2815       form [:^name:]. A square bracket that doesn't match the syntax is
       
  2816       treated as a literal. We also recognize the POSIX constructions
       
  2817       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
       
  2818       5.6 and 5.8 do. */
       
  2819 
       
  2820       if (c == '[' &&
       
  2821           (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
       
  2822           check_posix_syntax(ptr, &tempptr))
       
  2823         {
       
  2824         BOOL local_negate = FALSE;
       
  2825         int posix_class, taboffset, tabopt;
       
  2826         register const uschar *cbits = cd->cbits;
       
  2827         uschar pbits[32];
       
  2828 
       
  2829         if (ptr[1] != ':')
       
  2830           {
       
  2831           *errorcodeptr = ERR31;
       
  2832           goto FAILED;
       
  2833           }
       
  2834 
       
  2835         ptr += 2;
       
  2836         if (*ptr == '^')
       
  2837           {
       
  2838           local_negate = TRUE;
       
  2839           should_flip_negation = TRUE;  /* Note negative special */
       
  2840           ptr++;
       
  2841           }
       
  2842 
       
  2843         posix_class = check_posix_name(ptr, tempptr - ptr);
       
  2844         if (posix_class < 0)
       
  2845           {
       
  2846           *errorcodeptr = ERR30;
       
  2847           goto FAILED;
       
  2848           }
       
  2849 
       
  2850         /* If matching is caseless, upper and lower are converted to
       
  2851         alpha. This relies on the fact that the class table starts with
       
  2852         alpha, lower, upper as the first 3 entries. */
       
  2853 
       
  2854         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
       
  2855           posix_class = 0;
       
  2856 
       
  2857         /* We build the bit map for the POSIX class in a chunk of local store
       
  2858         because we may be adding and subtracting from it, and we don't want to
       
  2859         subtract bits that may be in the main map already. At the end we or the
       
  2860         result into the bit map that is being built. */
       
  2861 
       
  2862         posix_class *= 3;
       
  2863 
       
  2864         /* Copy in the first table (always present) */
       
  2865 
       
  2866         memcpy(pbits, cbits + posix_class_maps[posix_class],
       
  2867           32 * sizeof(uschar));
       
  2868 
       
  2869         /* If there is a second table, add or remove it as required. */
       
  2870 
       
  2871         taboffset = posix_class_maps[posix_class + 1];
       
  2872         tabopt = posix_class_maps[posix_class + 2];
       
  2873 
       
  2874         if (taboffset >= 0)
       
  2875           {
       
  2876           if (tabopt >= 0)
       
  2877             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
       
  2878           else
       
  2879             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
       
  2880           }
       
  2881 
       
  2882         /* Now see if we need to remove any special characters. An option
       
  2883         value of 1 removes vertical space and 2 removes underscore. */
       
  2884 
       
  2885         if (tabopt < 0) tabopt = -tabopt;
       
  2886         if (tabopt == 1) pbits[1] &= ~0x3c;
       
  2887           else if (tabopt == 2) pbits[11] &= 0x7f;
       
  2888 
       
  2889         /* Add the POSIX table or its complement into the main table that is
       
  2890         being built and we are done. */
       
  2891 
       
  2892         if (local_negate)
       
  2893           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
       
  2894         else
       
  2895           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
       
  2896 
       
  2897         ptr = tempptr + 1;
       
  2898         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
       
  2899         continue;    /* End of POSIX syntax handling */
       
  2900         }
       
  2901 
       
  2902       /* Backslash may introduce a single character, or it may introduce one
       
  2903       of the specials, which just set a flag. The sequence \b is a special
       
  2904       case. Inside a class (and only there) it is treated as backspace.
       
  2905       Elsewhere it marks a word boundary. Other escapes have preset maps ready
       
  2906       to 'or' into the one we are building. We assume they have more than one
       
  2907       character in them, so set class_charcount bigger than one. */
       
  2908 
       
  2909       if (c == '\\')
       
  2910         {
       
  2911         c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
       
  2912         if (*errorcodeptr != 0) goto FAILED;
       
  2913 
       
  2914         if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
       
  2915         else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
       
  2916         else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
       
  2917         else if (-c == ESC_Q)            /* Handle start of quoted string */
       
  2918           {
       
  2919           if (ptr[1] == '\\' && ptr[2] == 'E')
       
  2920             {
       
  2921             ptr += 2; /* avoid empty string */
       
  2922             }
       
  2923           else inescq = TRUE;
       
  2924           continue;
       
  2925           }
       
  2926         else if (-c == ESC_E) continue;  /* Ignore orphan \E */
       
  2927 
       
  2928         if (c < 0)
       
  2929           {
       
  2930           register const uschar *cbits = cd->cbits;
       
  2931           class_charcount += 2;     /* Greater than 1 is what matters */
       
  2932 
       
  2933           /* Save time by not doing this in the pre-compile phase. */
       
  2934 
       
  2935           if (lengthptr == NULL) switch (-c)
       
  2936             {
       
  2937             case ESC_d:
       
  2938             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
       
  2939             continue;
       
  2940 
       
  2941             case ESC_D:
       
  2942             should_flip_negation = TRUE;
       
  2943             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
       
  2944             continue;
       
  2945 
       
  2946             case ESC_w:
       
  2947             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
       
  2948             continue;
       
  2949 
       
  2950             case ESC_W:
       
  2951             should_flip_negation = TRUE;
       
  2952             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
       
  2953             continue;
       
  2954 
       
  2955             case ESC_s:
       
  2956             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
       
  2957             classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
       
  2958             continue;
       
  2959 
       
  2960             case ESC_S:
       
  2961             should_flip_negation = TRUE;
       
  2962             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
       
  2963             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
       
  2964             continue;
       
  2965 
       
  2966             default:    /* Not recognized; fall through */
       
  2967             break;      /* Need "default" setting to stop compiler warning. */
       
  2968             }
       
  2969 
       
  2970           /* In the pre-compile phase, just do the recognition. */
       
  2971 
       
  2972           else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
       
  2973                    c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
       
  2974 
       
  2975           /* We need to deal with \H, \h, \V, and \v in both phases because
       
  2976           they use extra memory. */
       
  2977 
       
  2978           if (-c == ESC_h)
       
  2979             {
       
  2980             SETBIT(classbits, 0x09); /* VT */
       
  2981             SETBIT(classbits, 0x20); /* SPACE */
       
  2982             SETBIT(classbits, 0xa0); /* NSBP */
       
  2983 #ifdef SUPPORT_UTF8
       
  2984             if (utf8)
       
  2985               {
       
  2986               class_utf8 = TRUE;
       
  2987               *class_utf8data++ = XCL_SINGLE;
       
  2988               class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
       
  2989               *class_utf8data++ = XCL_SINGLE;
       
  2990               class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
       
  2991               *class_utf8data++ = XCL_RANGE;
       
  2992               class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
       
  2993               class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
       
  2994               *class_utf8data++ = XCL_SINGLE;
       
  2995               class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
       
  2996               *class_utf8data++ = XCL_SINGLE;
       
  2997               class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
       
  2998               *class_utf8data++ = XCL_SINGLE;
       
  2999               class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
       
  3000               }
       
  3001 #endif
       
  3002             continue;
       
  3003             }
       
  3004 
       
  3005           if (-c == ESC_H)
       
  3006             {
       
  3007             for (c = 0; c < 32; c++)
       
  3008               {
       
  3009               int x = 0xff;
       
  3010               switch (c)
       
  3011                 {
       
  3012                 case 0x09/8: x ^= 1 << (0x09%8); break;
       
  3013                 case 0x20/8: x ^= 1 << (0x20%8); break;
       
  3014                 case 0xa0/8: x ^= 1 << (0xa0%8); break;
       
  3015                 default: break;
       
  3016                 }
       
  3017               classbits[c] |= x;
       
  3018               }
       
  3019 
       
  3020 #ifdef SUPPORT_UTF8
       
  3021             if (utf8)
       
  3022               {
       
  3023               class_utf8 = TRUE;
       
  3024               *class_utf8data++ = XCL_RANGE;
       
  3025               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
       
  3026               class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
       
  3027               *class_utf8data++ = XCL_RANGE;
       
  3028               class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
       
  3029               class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
       
  3030               *class_utf8data++ = XCL_RANGE;
       
  3031               class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
       
  3032               class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
       
  3033               *class_utf8data++ = XCL_RANGE;
       
  3034               class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
       
  3035               class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
       
  3036               *class_utf8data++ = XCL_RANGE;
       
  3037               class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
       
  3038               class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
       
  3039               *class_utf8data++ = XCL_RANGE;
       
  3040               class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
       
  3041               class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
       
  3042               *class_utf8data++ = XCL_RANGE;
       
  3043               class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
       
  3044               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
       
  3045               }
       
  3046 #endif
       
  3047             continue;
       
  3048             }
       
  3049 
       
  3050           if (-c == ESC_v)
       
  3051             {
       
  3052             SETBIT(classbits, 0x0a); /* LF */
       
  3053             SETBIT(classbits, 0x0b); /* VT */
       
  3054             SETBIT(classbits, 0x0c); /* FF */
       
  3055             SETBIT(classbits, 0x0d); /* CR */
       
  3056             SETBIT(classbits, 0x85); /* NEL */
       
  3057 #ifdef SUPPORT_UTF8
       
  3058             if (utf8)
       
  3059               {
       
  3060               class_utf8 = TRUE;
       
  3061               *class_utf8data++ = XCL_RANGE;
       
  3062               class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
       
  3063               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
       
  3064               }
       
  3065 #endif
       
  3066             continue;
       
  3067             }
       
  3068 
       
  3069           if (-c == ESC_V)
       
  3070             {
       
  3071             for (c = 0; c < 32; c++)
       
  3072               {
       
  3073               int x = 0xff;
       
  3074               switch (c)
       
  3075                 {
       
  3076                 case 0x0a/8: x ^= 1 << (0x0a%8);
       
  3077                              x ^= 1 << (0x0b%8);
       
  3078                              x ^= 1 << (0x0c%8);
       
  3079                              x ^= 1 << (0x0d%8);
       
  3080                              break;
       
  3081                 case 0x85/8: x ^= 1 << (0x85%8); break;
       
  3082                 default: break;
       
  3083                 }
       
  3084               classbits[c] |= x;
       
  3085               }
       
  3086 
       
  3087 #ifdef SUPPORT_UTF8
       
  3088             if (utf8)
       
  3089               {
       
  3090               class_utf8 = TRUE;
       
  3091               *class_utf8data++ = XCL_RANGE;
       
  3092               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
       
  3093               class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
       
  3094               *class_utf8data++ = XCL_RANGE;
       
  3095               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
       
  3096               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
       
  3097               }
       
  3098 #endif
       
  3099             continue;
       
  3100             }
       
  3101 
       
  3102           /* We need to deal with \P and \p in both phases. */
       
  3103 
       
  3104 #ifdef SUPPORT_UCP
       
  3105           if (-c == ESC_p || -c == ESC_P)
       
  3106             {
       
  3107             BOOL negated;
       
  3108             int pdata;
       
  3109             int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
       
  3110             if (ptype < 0) goto FAILED;
       
  3111             class_utf8 = TRUE;
       
  3112             *class_utf8data++ = ((-c == ESC_p) != negated)?
       
  3113               XCL_PROP : XCL_NOTPROP;
       
  3114             *class_utf8data++ = ptype;
       
  3115             *class_utf8data++ = pdata;
       
  3116             class_charcount -= 2;   /* Not a < 256 character */
       
  3117             continue;
       
  3118             }
       
  3119 #endif
       
  3120           /* Unrecognized escapes are faulted if PCRE is running in its
       
  3121           strict mode. By default, for compatibility with Perl, they are
       
  3122           treated as literals. */
       
  3123 
       
  3124           if ((options & PCRE_EXTRA) != 0)
       
  3125             {
       
  3126             *errorcodeptr = ERR7;
       
  3127             goto FAILED;
       
  3128             }
       
  3129 
       
  3130           class_charcount -= 2;  /* Undo the default count from above */
       
  3131           c = *ptr;              /* Get the final character and fall through */
       
  3132           }
       
  3133 
       
  3134         /* Fall through if we have a single character (c >= 0). This may be
       
  3135         greater than 256 in UTF-8 mode. */
       
  3136 
       
  3137         }   /* End of backslash handling */
       
  3138 
       
  3139       /* A single character may be followed by '-' to form a range. However,
       
  3140       Perl does not permit ']' to be the end of the range. A '-' character
       
  3141       at the end is treated as a literal. Perl ignores orphaned \E sequences
       
  3142       entirely. The code for handling \Q and \E is messy. */
       
  3143 
       
  3144       CHECK_RANGE:
       
  3145       while (ptr[1] == '\\' && ptr[2] == 'E')
       
  3146         {
       
  3147         inescq = FALSE;
       
  3148         ptr += 2;
       
  3149         }
       
  3150 
       
  3151       oldptr = ptr;
       
  3152 
       
  3153       /* Remember \r or \n */
       
  3154 
       
  3155       if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
       
  3156 
       
  3157       /* Check for range */
       
  3158 
       
  3159       if (!inescq && ptr[1] == '-')
       
  3160         {
       
  3161         int d;
       
  3162         ptr += 2;
       
  3163         while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
       
  3164 
       
  3165         /* If we hit \Q (not followed by \E) at this point, go into escaped
       
  3166         mode. */
       
  3167 
       
  3168         while (*ptr == '\\' && ptr[1] == 'Q')
       
  3169           {
       
  3170           ptr += 2;
       
  3171           if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
       
  3172           inescq = TRUE;
       
  3173           break;
       
  3174           }
       
  3175 
       
  3176         if (*ptr == 0 || (!inescq && *ptr == ']'))
       
  3177           {
       
  3178           ptr = oldptr;
       
  3179           goto LONE_SINGLE_CHARACTER;
       
  3180           }
       
  3181 
       
  3182 #ifdef SUPPORT_UTF8
       
  3183         if (utf8)
       
  3184           {                           /* Braces are required because the */
       
  3185           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
       
  3186           }
       
  3187         else
       
  3188 #endif
       
  3189         d = *ptr;  /* Not UTF-8 mode */
       
  3190 
       
  3191         /* The second part of a range can be a single-character escape, but
       
  3192         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
       
  3193         in such circumstances. */
       
  3194 
       
  3195         if (!inescq && d == '\\')
       
  3196           {
       
  3197           d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
       
  3198           if (*errorcodeptr != 0) goto FAILED;
       
  3199 
       
  3200           /* \b is backspace; \X is literal X; \R is literal R; any other
       
  3201           special means the '-' was literal */
       
  3202 
       
  3203           if (d < 0)
       
  3204             {
       
  3205             if (d == -ESC_b) d = '\b';
       
  3206             else if (d == -ESC_X) d = 'X';
       
  3207             else if (d == -ESC_R) d = 'R'; else
       
  3208               {
       
  3209               ptr = oldptr;
       
  3210               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
       
  3211               }
       
  3212             }
       
  3213           }
       
  3214 
       
  3215         /* Check that the two values are in the correct order. Optimize
       
  3216         one-character ranges */
       
  3217 
       
  3218         if (d < c)
       
  3219           {
       
  3220           *errorcodeptr = ERR8;
       
  3221           goto FAILED;
       
  3222           }
       
  3223 
       
  3224         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
       
  3225 
       
  3226         /* Remember \r or \n */
       
  3227 
       
  3228         if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
       
  3229 
       
  3230         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
       
  3231         matching, we have to use an XCLASS with extra data items. Caseless
       
  3232         matching for characters > 127 is available only if UCP support is
       
  3233         available. */
       
  3234 
       
  3235 #ifdef SUPPORT_UTF8
       
  3236         if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
       
  3237           {
       
  3238           class_utf8 = TRUE;
       
  3239 
       
  3240           /* With UCP support, we can find the other case equivalents of
       
  3241           the relevant characters. There may be several ranges. Optimize how
       
  3242           they fit with the basic range. */
       
  3243 
       
  3244 #ifdef SUPPORT_UCP
       
  3245           if ((options & PCRE_CASELESS) != 0)
       
  3246             {
       
  3247             unsigned int occ, ocd;
       
  3248             unsigned int cc = c;
       
  3249             unsigned int origd = d;
       
  3250             while (get_othercase_range(&cc, origd, &occ, &ocd))
       
  3251               {
       
  3252               if (occ >= (unsigned int)c &&
       
  3253                   ocd <= (unsigned int)d)
       
  3254                 continue;                          /* Skip embedded ranges */
       
  3255 
       
  3256               if (occ < (unsigned int)c  &&
       
  3257                   ocd >= (unsigned int)c - 1)      /* Extend the basic range */
       
  3258                 {                                  /* if there is overlap,   */
       
  3259                 c = occ;                           /* noting that if occ < c */
       
  3260                 continue;                          /* we can't have ocd > d  */
       
  3261                 }                                  /* because a subrange is  */
       
  3262               if (ocd > (unsigned int)d &&
       
  3263                   occ <= (unsigned int)d + 1)      /* always shorter than    */
       
  3264                 {                                  /* the basic range.       */
       
  3265                 d = ocd;
       
  3266                 continue;
       
  3267                 }
       
  3268 
       
  3269               if (occ == ocd)
       
  3270                 {
       
  3271                 *class_utf8data++ = XCL_SINGLE;
       
  3272                 }
       
  3273               else
       
  3274                 {
       
  3275                 *class_utf8data++ = XCL_RANGE;
       
  3276                 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
       
  3277                 }
       
  3278               class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
       
  3279               }
       
  3280             }
       
  3281 #endif  /* SUPPORT_UCP */
       
  3282 
       
  3283           /* Now record the original range, possibly modified for UCP caseless
       
  3284           overlapping ranges. */
       
  3285 
       
  3286           *class_utf8data++ = XCL_RANGE;
       
  3287           class_utf8data += _pcre_ord2utf8(c, class_utf8data);
       
  3288           class_utf8data += _pcre_ord2utf8(d, class_utf8data);
       
  3289 
       
  3290           /* With UCP support, we are done. Without UCP support, there is no
       
  3291           caseless matching for UTF-8 characters > 127; we can use the bit map
       
  3292           for the smaller ones. */
       
  3293 
       
  3294 #ifdef SUPPORT_UCP
       
  3295           continue;    /* With next character in the class */
       
  3296 #else
       
  3297           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
       
  3298 
       
  3299           /* Adjust upper limit and fall through to set up the map */
       
  3300 
       
  3301           d = 127;
       
  3302 
       
  3303 #endif  /* SUPPORT_UCP */
       
  3304           }
       
  3305 #endif  /* SUPPORT_UTF8 */
       
  3306 
       
  3307         /* We use the bit map for all cases when not in UTF-8 mode; else
       
  3308         ranges that lie entirely within 0-127 when there is UCP support; else
       
  3309         for partial ranges without UCP support. */
       
  3310 
       
  3311         class_charcount += d - c + 1;
       
  3312         class_lastchar = d;
       
  3313 
       
  3314         /* We can save a bit of time by skipping this in the pre-compile. */
       
  3315 
       
  3316         if (lengthptr == NULL) for (; c <= d; c++)
       
  3317           {
       
  3318           classbits[c/8] |= (1 << (c&7));
       
  3319           if ((options & PCRE_CASELESS) != 0)
       
  3320             {
       
  3321             int uc = cd->fcc[c];           /* flip case */
       
  3322             classbits[uc/8] |= (1 << (uc&7));
       
  3323             }
       
  3324           }
       
  3325 
       
  3326         continue;   /* Go get the next char in the class */
       
  3327         }
       
  3328 
       
  3329       /* Handle a lone single character - we can get here for a normal
       
  3330       non-escape char, or after \ that introduces a single character or for an
       
  3331       apparent range that isn't. */
       
  3332 
       
  3333       LONE_SINGLE_CHARACTER:
       
  3334 
       
  3335       /* Handle a character that cannot go in the bit map */
       
  3336 
       
  3337 #ifdef SUPPORT_UTF8
       
  3338       if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
       
  3339         {
       
  3340         class_utf8 = TRUE;
       
  3341         *class_utf8data++ = XCL_SINGLE;
       
  3342         class_utf8data += _pcre_ord2utf8(c, class_utf8data);
       
  3343 
       
  3344 #ifdef SUPPORT_UCP
       
  3345         if ((options & PCRE_CASELESS) != 0)
       
  3346           {
       
  3347           unsigned int othercase;
       
  3348           if ((othercase = UCD_OTHERCASE(c)) != c)
       
  3349             {
       
  3350             *class_utf8data++ = XCL_SINGLE;
       
  3351             class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
       
  3352             }
       
  3353           }
       
  3354 #endif  /* SUPPORT_UCP */
       
  3355 
       
  3356         }
       
  3357       else
       
  3358 #endif  /* SUPPORT_UTF8 */
       
  3359 
       
  3360       /* Handle a single-byte character */
       
  3361         {
       
  3362         classbits[c/8] |= (1 << (c&7));
       
  3363         if ((options & PCRE_CASELESS) != 0)
       
  3364           {
       
  3365           c = cd->fcc[c];   /* flip case */
       
  3366           classbits[c/8] |= (1 << (c&7));
       
  3367           }
       
  3368         class_charcount++;
       
  3369         class_lastchar = c;
       
  3370         }
       
  3371       }
       
  3372 
       
  3373     /* Loop until ']' reached. This "while" is the end of the "do" above. */
       
  3374 
       
  3375     while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
       
  3376 
       
  3377     if (c == 0)                          /* Missing terminating ']' */
       
  3378       {
       
  3379       *errorcodeptr = ERR6;
       
  3380       goto FAILED;
       
  3381       }
       
  3382 
       
  3383 
       
  3384 /* This code has been disabled because it would mean that \s counts as
       
  3385 an explicit \r or \n reference, and that's not really what is wanted. Now
       
  3386 we set the flag only if there is a literal "\r" or "\n" in the class. */
       
  3387 
       
  3388 #if 0
       
  3389     /* Remember whether \r or \n are in this class */
       
  3390 
       
  3391     if (negate_class)
       
  3392       {
       
  3393       if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
       
  3394       }
       
  3395     else
       
  3396       {
       
  3397       if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
       
  3398       }
       
  3399 #endif
       
  3400 
       
  3401 
       
  3402     /* If class_charcount is 1, we saw precisely one character whose value is
       
  3403     less than 256. As long as there were no characters >= 128 and there was no
       
  3404     use of \p or \P, in other words, no use of any XCLASS features, we can
       
  3405     optimize.
       
  3406 
       
  3407     In UTF-8 mode, we can optimize the negative case only if there were no
       
  3408     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
       
  3409     operate on single-bytes only. This is an historical hangover. Maybe one day
       
  3410     we can tidy these opcodes to handle multi-byte characters.
       
  3411 
       
  3412     The optimization throws away the bit map. We turn the item into a
       
  3413     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
       
  3414     that OP_NOT does not support multibyte characters. In the positive case, it
       
  3415     can cause firstbyte to be set. Otherwise, there can be no first char if
       
  3416     this item is first, whatever repeat count may follow. In the case of
       
  3417     reqbyte, save the previous value for reinstating. */
       
  3418 
       
  3419 #ifdef SUPPORT_UTF8
       
  3420     if (class_charcount == 1 && !class_utf8 &&
       
  3421       (!utf8 || !negate_class || class_lastchar < 128))
       
  3422 #else
       
  3423     if (class_charcount == 1)
       
  3424 #endif
       
  3425       {
       
  3426       zeroreqbyte = reqbyte;
       
  3427 
       
  3428       /* The OP_NOT opcode works on one-byte characters only. */
       
  3429 
       
  3430       if (negate_class)
       
  3431         {
       
  3432         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
       
  3433         zerofirstbyte = firstbyte;
       
  3434         *code++ = OP_NOT;
       
  3435         *code++ = class_lastchar;
       
  3436         break;
       
  3437         }
       
  3438 
       
  3439       /* For a single, positive character, get the value into mcbuffer, and
       
  3440       then we can handle this with the normal one-character code. */
       
  3441 
       
  3442 #ifdef SUPPORT_UTF8
       
  3443       if (utf8 && class_lastchar > 127)
       
  3444         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
       
  3445       else
       
  3446 #endif
       
  3447         {
       
  3448         mcbuffer[0] = class_lastchar;
       
  3449         mclength = 1;
       
  3450         }
       
  3451       goto ONE_CHAR;
       
  3452       }       /* End of 1-char optimization */
       
  3453 
       
  3454     /* The general case - not the one-char optimization. If this is the first
       
  3455     thing in the branch, there can be no first char setting, whatever the
       
  3456     repeat count. Any reqbyte setting must remain unchanged after any kind of
       
  3457     repeat. */
       
  3458 
       
  3459     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
       
  3460     zerofirstbyte = firstbyte;
       
  3461     zeroreqbyte = reqbyte;
       
  3462 
       
  3463     /* If there are characters with values > 255, we have to compile an
       
  3464     extended class, with its own opcode, unless there was a negated special
       
  3465     such as \S in the class, because in that case all characters > 255 are in
       
  3466     the class, so any that were explicitly given as well can be ignored. If
       
  3467     (when there are explicit characters > 255 that must be listed) there are no
       
  3468     characters < 256, we can omit the bitmap in the actual compiled code. */
       
  3469 
       
  3470 #ifdef SUPPORT_UTF8
       
  3471     if (class_utf8 && !should_flip_negation)
       
  3472       {
       
  3473       *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
       
  3474       *code++ = OP_XCLASS;
       
  3475       code += LINK_SIZE;
       
  3476       *code = negate_class? XCL_NOT : 0;
       
  3477 
       
  3478       /* If the map is required, move up the extra data to make room for it;
       
  3479       otherwise just move the code pointer to the end of the extra data. */
       
  3480 
       
  3481       if (class_charcount > 0)
       
  3482         {
       
  3483         *code++ |= XCL_MAP;
       
  3484         memmove(code + 32, code, class_utf8data - code);
       
  3485         memcpy(code, classbits, 32);
       
  3486         code = class_utf8data + 32;
       
  3487         }
       
  3488       else code = class_utf8data;
       
  3489 
       
  3490       /* Now fill in the complete length of the item */
       
  3491 
       
  3492       PUT(previous, 1, code - previous);
       
  3493       break;   /* End of class handling */
       
  3494       }
       
  3495 #endif
       
  3496 
       
  3497     /* If there are no characters > 255, set the opcode to OP_CLASS or
       
  3498     OP_NCLASS, depending on whether the whole class was negated and whether
       
  3499     there were negative specials such as \S in the class. Then copy the 32-byte
       
  3500     map into the code vector, negating it if necessary. */
       
  3501 
       
  3502     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
       
  3503     if (negate_class)
       
  3504       {
       
  3505       if (lengthptr == NULL)    /* Save time in the pre-compile phase */
       
  3506         for (c = 0; c < 32; c++) code[c] = ~classbits[c];
       
  3507       }
       
  3508     else
       
  3509       {
       
  3510       memcpy(code, classbits, 32);
       
  3511       }
       
  3512     code += 32;
       
  3513     break;
       
  3514 
       
  3515 
       
  3516     /* ===================================================================*/
       
  3517     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
       
  3518     has been tested above. */
       
  3519 
       
  3520     case '{':
       
  3521     if (!is_quantifier) goto NORMAL_CHAR;
       
  3522     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
       
  3523     if (*errorcodeptr != 0) goto FAILED;
       
  3524     goto REPEAT;
       
  3525 
       
  3526     case '*':
       
  3527     repeat_min = 0;
       
  3528     repeat_max = -1;
       
  3529     goto REPEAT;
       
  3530 
       
  3531     case '+':
       
  3532     repeat_min = 1;
       
  3533     repeat_max = -1;
       
  3534     goto REPEAT;
       
  3535 
       
  3536     case '?':
       
  3537     repeat_min = 0;
       
  3538     repeat_max = 1;
       
  3539 
       
  3540     REPEAT:
       
  3541     if (previous == NULL)
       
  3542       {
       
  3543       *errorcodeptr = ERR9;
       
  3544       goto FAILED;
       
  3545       }
       
  3546 
       
  3547     if (repeat_min == 0)
       
  3548       {
       
  3549       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
       
  3550       reqbyte = zeroreqbyte;        /* Ditto */
       
  3551       }
       
  3552 
       
  3553     /* Remember whether this is a variable length repeat */
       
  3554 
       
  3555     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
       
  3556 
       
  3557     op_type = 0;                    /* Default single-char op codes */
       
  3558     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
       
  3559 
       
  3560     /* Save start of previous item, in case we have to move it up to make space
       
  3561     for an inserted OP_ONCE for the additional '+' extension. */
       
  3562 
       
  3563     tempcode = previous;
       
  3564 
       
  3565     /* If the next character is '+', we have a possessive quantifier. This
       
  3566     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
       
  3567     If the next character is '?' this is a minimizing repeat, by default,
       
  3568     but if PCRE_UNGREEDY is set, it works the other way round. We change the
       
  3569     repeat type to the non-default. */
       
  3570 
       
  3571     if (ptr[1] == '+')
       
  3572       {
       
  3573       repeat_type = 0;                  /* Force greedy */
       
  3574       possessive_quantifier = TRUE;
       
  3575       ptr++;
       
  3576       }
       
  3577     else if (ptr[1] == '?')
       
  3578       {
       
  3579       repeat_type = greedy_non_default;
       
  3580       ptr++;
       
  3581       }
       
  3582     else repeat_type = greedy_default;
       
  3583 
       
  3584     /* If previous was a character match, abolish the item and generate a
       
  3585     repeat item instead. If a char item has a minimum of more than one, ensure
       
  3586     that it is set in reqbyte - it might not be if a sequence such as x{3} is
       
  3587     the first thing in a branch because the x will have gone into firstbyte
       
  3588     instead.  */
       
  3589 
       
  3590     if (*previous == OP_CHAR || *previous == OP_CHARNC)
       
  3591       {
       
  3592       /* Deal with UTF-8 characters that take up more than one byte. It's
       
  3593       easier to write this out separately than try to macrify it. Use c to
       
  3594       hold the length of the character in bytes, plus 0x80 to flag that it's a
       
  3595       length rather than a small character. */
       
  3596 
       
  3597 #ifdef SUPPORT_UTF8
       
  3598       if (utf8 && (code[-1] & 0x80) != 0)
       
  3599         {
       
  3600         uschar *lastchar = code - 1;
       
  3601         while((*lastchar & 0xc0) == 0x80) lastchar--;
       
  3602         c = code - lastchar;            /* Length of UTF-8 character */
       
  3603         memcpy(utf8_char, lastchar, c); /* Save the char */
       
  3604         c |= 0x80;                      /* Flag c as a length */
       
  3605         }
       
  3606       else
       
  3607 #endif
       
  3608 
       
  3609       /* Handle the case of a single byte - either with no UTF8 support, or
       
  3610       with UTF-8 disabled, or for a UTF-8 character < 128. */
       
  3611 
       
  3612         {
       
  3613         c = code[-1];
       
  3614         if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
       
  3615         }
       
  3616 
       
  3617       /* If the repetition is unlimited, it pays to see if the next thing on
       
  3618       the line is something that cannot possibly match this character. If so,
       
  3619       automatically possessifying this item gains some performance in the case
       
  3620       where the match fails. */
       
  3621 
       
  3622       if (!possessive_quantifier &&
       
  3623           repeat_max < 0 &&
       
  3624           check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
       
  3625             options, cd))
       
  3626         {
       
  3627         repeat_type = 0;    /* Force greedy */
       
  3628         possessive_quantifier = TRUE;
       
  3629         }
       
  3630 
       
  3631       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
       
  3632       }
       
  3633 
       
  3634     /* If previous was a single negated character ([^a] or similar), we use
       
  3635     one of the special opcodes, replacing it. The code is shared with single-
       
  3636     character repeats by setting op_type to add a suitable offset into
       
  3637     repeat_type. We can also test for auto-possessification. OP_NOT is
       
  3638     currently used only for single-byte chars. */
       
  3639 
       
  3640     else if (*previous == OP_NOT)
       
  3641       {
       
  3642       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
       
  3643       c = previous[1];
       
  3644       if (!possessive_quantifier &&
       
  3645           repeat_max < 0 &&
       
  3646           check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
       
  3647         {
       
  3648         repeat_type = 0;    /* Force greedy */
       
  3649         possessive_quantifier = TRUE;
       
  3650         }
       
  3651       goto OUTPUT_SINGLE_REPEAT;
       
  3652       }
       
  3653 
       
  3654     /* If previous was a character type match (\d or similar), abolish it and
       
  3655     create a suitable repeat item. The code is shared with single-character
       
  3656     repeats by setting op_type to add a suitable offset into repeat_type. Note
       
  3657     that the Unicode property types will be present only when SUPPORT_UCP is
       
  3658     defined, but we don't wrap the little bits of code here because it just
       
  3659     makes it horribly messy. */
       
  3660 
       
  3661     else if (*previous < OP_EODN)
       
  3662       {
       
  3663       uschar *oldcode;
       
  3664       int prop_type, prop_value;
       
  3665       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
       
  3666       c = *previous;
       
  3667 
       
  3668       if (!possessive_quantifier &&
       
  3669           repeat_max < 0 &&
       
  3670           check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
       
  3671         {
       
  3672         repeat_type = 0;    /* Force greedy */
       
  3673         possessive_quantifier = TRUE;
       
  3674         }
       
  3675 
       
  3676       OUTPUT_SINGLE_REPEAT:
       
  3677       if (*previous == OP_PROP || *previous == OP_NOTPROP)
       
  3678         {
       
  3679         prop_type = previous[1];
       
  3680         prop_value = previous[2];
       
  3681         }
       
  3682       else prop_type = prop_value = -1;
       
  3683 
       
  3684       oldcode = code;
       
  3685       code = previous;                  /* Usually overwrite previous item */
       
  3686 
       
  3687       /* If the maximum is zero then the minimum must also be zero; Perl allows
       
  3688       this case, so we do too - by simply omitting the item altogether. */
       
  3689 
       
  3690       if (repeat_max == 0) goto END_REPEAT;
       
  3691 
       
  3692       /* All real repeats make it impossible to handle partial matching (maybe
       
  3693       one day we will be able to remove this restriction). */
       
  3694 
       
  3695       if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
       
  3696 
       
  3697       /* Combine the op_type with the repeat_type */
       
  3698 
       
  3699       repeat_type += op_type;
       
  3700 
       
  3701       /* A minimum of zero is handled either as the special case * or ?, or as
       
  3702       an UPTO, with the maximum given. */
       
  3703 
       
  3704       if (repeat_min == 0)
       
  3705         {
       
  3706         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
       
  3707           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
       
  3708         else
       
  3709           {
       
  3710           *code++ = OP_UPTO + repeat_type;
       
  3711           PUT2INC(code, 0, repeat_max);
       
  3712           }
       
  3713         }
       
  3714 
       
  3715       /* A repeat minimum of 1 is optimized into some special cases. If the
       
  3716       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
       
  3717       left in place and, if the maximum is greater than 1, we use OP_UPTO with
       
  3718       one less than the maximum. */
       
  3719 
       
  3720       else if (repeat_min == 1)
       
  3721         {
       
  3722         if (repeat_max == -1)
       
  3723           *code++ = OP_PLUS + repeat_type;
       
  3724         else
       
  3725           {
       
  3726           code = oldcode;                 /* leave previous item in place */
       
  3727           if (repeat_max == 1) goto END_REPEAT;
       
  3728           *code++ = OP_UPTO + repeat_type;
       
  3729           PUT2INC(code, 0, repeat_max - 1);
       
  3730           }
       
  3731         }
       
  3732 
       
  3733       /* The case {n,n} is just an EXACT, while the general case {n,m} is
       
  3734       handled as an EXACT followed by an UPTO. */
       
  3735 
       
  3736       else
       
  3737         {
       
  3738         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
       
  3739         PUT2INC(code, 0, repeat_min);
       
  3740 
       
  3741         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
       
  3742         we have to insert the character for the previous code. For a repeated
       
  3743         Unicode property match, there are two extra bytes that define the
       
  3744         required property. In UTF-8 mode, long characters have their length in
       
  3745         c, with the 0x80 bit as a flag. */
       
  3746 
       
  3747         if (repeat_max < 0)
       
  3748           {
       
  3749 #ifdef SUPPORT_UTF8
       
  3750           if (utf8 && c >= 128)
       
  3751             {
       
  3752             memcpy(code, utf8_char, c & 7);
       
  3753             code += c & 7;
       
  3754             }
       
  3755           else
       
  3756 #endif
       
  3757             {
       
  3758             *code++ = c;
       
  3759             if (prop_type >= 0)
       
  3760               {
       
  3761               *code++ = prop_type;
       
  3762               *code++ = prop_value;
       
  3763               }
       
  3764             }
       
  3765           *code++ = OP_STAR + repeat_type;
       
  3766           }
       
  3767 
       
  3768         /* Else insert an UPTO if the max is greater than the min, again
       
  3769         preceded by the character, for the previously inserted code. If the
       
  3770         UPTO is just for 1 instance, we can use QUERY instead. */
       
  3771 
       
  3772         else if (repeat_max != repeat_min)
       
  3773           {
       
  3774 #ifdef SUPPORT_UTF8
       
  3775           if (utf8 && c >= 128)
       
  3776             {
       
  3777             memcpy(code, utf8_char, c & 7);
       
  3778             code += c & 7;
       
  3779             }
       
  3780           else
       
  3781 #endif
       
  3782           *code++ = c;
       
  3783           if (prop_type >= 0)
       
  3784             {
       
  3785             *code++ = prop_type;
       
  3786             *code++ = prop_value;
       
  3787             }
       
  3788           repeat_max -= repeat_min;
       
  3789 
       
  3790           if (repeat_max == 1)
       
  3791             {
       
  3792             *code++ = OP_QUERY + repeat_type;
       
  3793             }
       
  3794           else
       
  3795             {
       
  3796             *code++ = OP_UPTO + repeat_type;
       
  3797             PUT2INC(code, 0, repeat_max);
       
  3798             }
       
  3799           }
       
  3800         }
       
  3801 
       
  3802       /* The character or character type itself comes last in all cases. */
       
  3803 
       
  3804 #ifdef SUPPORT_UTF8
       
  3805       if (utf8 && c >= 128)
       
  3806         {
       
  3807         memcpy(code, utf8_char, c & 7);
       
  3808         code += c & 7;
       
  3809         }
       
  3810       else
       
  3811 #endif
       
  3812       *code++ = c;
       
  3813 
       
  3814       /* For a repeated Unicode property match, there are two extra bytes that
       
  3815       define the required property. */
       
  3816 
       
  3817 #ifdef SUPPORT_UCP
       
  3818       if (prop_type >= 0)
       
  3819         {
       
  3820         *code++ = prop_type;
       
  3821         *code++ = prop_value;
       
  3822         }
       
  3823 #endif
       
  3824       }
       
  3825 
       
  3826     /* If previous was a character class or a back reference, we put the repeat
       
  3827     stuff after it, but just skip the item if the repeat was {0,0}. */
       
  3828 
       
  3829     else if (*previous == OP_CLASS ||
       
  3830              *previous == OP_NCLASS ||
       
  3831 #ifdef SUPPORT_UTF8
       
  3832              *previous == OP_XCLASS ||
       
  3833 #endif
       
  3834              *previous == OP_REF)
       
  3835       {
       
  3836       if (repeat_max == 0)
       
  3837         {
       
  3838         code = previous;
       
  3839         goto END_REPEAT;
       
  3840         }
       
  3841 
       
  3842       /* All real repeats make it impossible to handle partial matching (maybe
       
  3843       one day we will be able to remove this restriction). */
       
  3844 
       
  3845       if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
       
  3846 
       
  3847       if (repeat_min == 0 && repeat_max == -1)
       
  3848         *code++ = OP_CRSTAR + repeat_type;
       
  3849       else if (repeat_min == 1 && repeat_max == -1)
       
  3850         *code++ = OP_CRPLUS + repeat_type;
       
  3851       else if (repeat_min == 0 && repeat_max == 1)
       
  3852         *code++ = OP_CRQUERY + repeat_type;
       
  3853       else
       
  3854         {
       
  3855         *code++ = OP_CRRANGE + repeat_type;
       
  3856         PUT2INC(code, 0, repeat_min);
       
  3857         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
       
  3858         PUT2INC(code, 0, repeat_max);
       
  3859         }
       
  3860       }
       
  3861 
       
  3862     /* If previous was a bracket group, we may have to replicate it in certain
       
  3863     cases. */
       
  3864 
       
  3865     else if (*previous == OP_BRA  || *previous == OP_CBRA ||
       
  3866              *previous == OP_ONCE || *previous == OP_COND)
       
  3867       {
       
  3868       register int i;
       
  3869       int ketoffset = 0;
       
  3870       int len = code - previous;
       
  3871       uschar *bralink = NULL;
       
  3872 
       
  3873       /* Repeating a DEFINE group is pointless */
       
  3874 
       
  3875       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
       
  3876         {
       
  3877         *errorcodeptr = ERR55;
       
  3878         goto FAILED;
       
  3879         }
       
  3880 
       
  3881       /* If the maximum repeat count is unlimited, find the end of the bracket
       
  3882       by scanning through from the start, and compute the offset back to it
       
  3883       from the current code pointer. There may be an OP_OPT setting following
       
  3884       the final KET, so we can't find the end just by going back from the code
       
  3885       pointer. */
       
  3886 
       
  3887       if (repeat_max == -1)
       
  3888         {
       
  3889         register uschar *ket = previous;
       
  3890         do ket += GET(ket, 1); while (*ket != OP_KET);
       
  3891         ketoffset = code - ket;
       
  3892         }
       
  3893 
       
  3894       /* The case of a zero minimum is special because of the need to stick
       
  3895       OP_BRAZERO in front of it, and because the group appears once in the
       
  3896       data, whereas in other cases it appears the minimum number of times. For
       
  3897       this reason, it is simplest to treat this case separately, as otherwise
       
  3898       the code gets far too messy. There are several special subcases when the
       
  3899       minimum is zero. */
       
  3900 
       
  3901       if (repeat_min == 0)
       
  3902         {
       
  3903         /* If the maximum is also zero, we used to just omit the group from the
       
  3904         output altogether, like this:
       
  3905 
       
  3906         ** if (repeat_max == 0)
       
  3907         **   {
       
  3908         **   code = previous;
       
  3909         **   goto END_REPEAT;
       
  3910         **   }
       
  3911 
       
  3912         However, that fails when a group is referenced as a subroutine from
       
  3913         elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
       
  3914         so that it is skipped on execution. As we don't have a list of which
       
  3915         groups are referenced, we cannot do this selectively.
       
  3916 
       
  3917         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
       
  3918         and do no more at this point. However, we do need to adjust any
       
  3919         OP_RECURSE calls inside the group that refer to the group itself or any
       
  3920         internal or forward referenced group, because the offset is from the
       
  3921         start of the whole regex. Temporarily terminate the pattern while doing
       
  3922         this. */
       
  3923 
       
  3924         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
       
  3925           {
       
  3926           *code = OP_END;
       
  3927           adjust_recurse(previous, 1, utf8, cd, save_hwm);
       
  3928           memmove(previous+1, previous, len);
       
  3929           code++;
       
  3930           if (repeat_max == 0)
       
  3931             {
       
  3932             *previous++ = OP_SKIPZERO;
       
  3933             goto END_REPEAT;
       
  3934             }
       
  3935           *previous++ = OP_BRAZERO + repeat_type;
       
  3936           }
       
  3937 
       
  3938         /* If the maximum is greater than 1 and limited, we have to replicate
       
  3939         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
       
  3940         The first one has to be handled carefully because it's the original
       
  3941         copy, which has to be moved up. The remainder can be handled by code
       
  3942         that is common with the non-zero minimum case below. We have to
       
  3943         adjust the value of repeat_max, since one less copy is required. Once
       
  3944         again, we may have to adjust any OP_RECURSE calls inside the group. */
       
  3945 
       
  3946         else
       
  3947           {
       
  3948           int offset;
       
  3949           *code = OP_END;
       
  3950           adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
       
  3951           memmove(previous + 2 + LINK_SIZE, previous, len);
       
  3952           code += 2 + LINK_SIZE;
       
  3953           *previous++ = OP_BRAZERO + repeat_type;
       
  3954           *previous++ = OP_BRA;
       
  3955 
       
  3956           /* We chain together the bracket offset fields that have to be
       
  3957           filled in later when the ends of the brackets are reached. */
       
  3958 
       
  3959           offset = (bralink == NULL)? 0 : previous - bralink;
       
  3960           bralink = previous;
       
  3961           PUTINC(previous, 0, offset);
       
  3962           }
       
  3963 
       
  3964         repeat_max--;
       
  3965         }
       
  3966 
       
  3967       /* If the minimum is greater than zero, replicate the group as many
       
  3968       times as necessary, and adjust the maximum to the number of subsequent
       
  3969       copies that we need. If we set a first char from the group, and didn't
       
  3970       set a required char, copy the latter from the former. If there are any
       
  3971       forward reference subroutine calls in the group, there will be entries on
       
  3972       the workspace list; replicate these with an appropriate increment. */
       
  3973 
       
  3974       else
       
  3975         {
       
  3976         if (repeat_min > 1)
       
  3977           {
       
  3978           /* In the pre-compile phase, we don't actually do the replication. We
       
  3979           just adjust the length as if we had. Do some paranoid checks for
       
  3980           potential integer overflow. */
       
  3981 
       
  3982           if (lengthptr != NULL)
       
  3983             {
       
  3984             int delta = (repeat_min - 1)*length_prevgroup;
       
  3985             if ((double)(repeat_min - 1)*(double)length_prevgroup >
       
  3986                                                             (double)INT_MAX ||
       
  3987                 OFLOW_MAX - *lengthptr < delta)
       
  3988               {
       
  3989               *errorcodeptr = ERR20;
       
  3990               goto FAILED;
       
  3991               }
       
  3992             *lengthptr += delta;
       
  3993             }
       
  3994 
       
  3995           /* This is compiling for real */
       
  3996 
       
  3997           else
       
  3998             {
       
  3999             if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
       
  4000             for (i = 1; i < repeat_min; i++)
       
  4001               {
       
  4002               uschar *hc;
       
  4003               uschar *this_hwm = cd->hwm;
       
  4004               memcpy(code, previous, len);
       
  4005               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
       
  4006                 {
       
  4007                 PUT(cd->hwm, 0, GET(hc, 0) + len);
       
  4008                 cd->hwm += LINK_SIZE;
       
  4009                 }
       
  4010               save_hwm = this_hwm;
       
  4011               code += len;
       
  4012               }
       
  4013             }
       
  4014           }
       
  4015 
       
  4016         if (repeat_max > 0) repeat_max -= repeat_min;
       
  4017         }
       
  4018 
       
  4019       /* This code is common to both the zero and non-zero minimum cases. If
       
  4020       the maximum is limited, it replicates the group in a nested fashion,
       
  4021       remembering the bracket starts on a stack. In the case of a zero minimum,
       
  4022       the first one was set up above. In all cases the repeat_max now specifies
       
  4023       the number of additional copies needed. Again, we must remember to
       
  4024       replicate entries on the forward reference list. */
       
  4025 
       
  4026       if (repeat_max >= 0)
       
  4027         {
       
  4028         /* In the pre-compile phase, we don't actually do the replication. We
       
  4029         just adjust the length as if we had. For each repetition we must add 1
       
  4030         to the length for BRAZERO and for all but the last repetition we must
       
  4031         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
       
  4032         paranoid checks to avoid integer overflow. */
       
  4033 
       
  4034         if (lengthptr != NULL && repeat_max > 0)
       
  4035           {
       
  4036           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
       
  4037                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
       
  4038           if ((double)repeat_max *
       
  4039                 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
       
  4040                   > (double)INT_MAX ||
       
  4041               OFLOW_MAX - *lengthptr < delta)
       
  4042             {
       
  4043             *errorcodeptr = ERR20;
       
  4044             goto FAILED;
       
  4045             }
       
  4046           *lengthptr += delta;
       
  4047           }
       
  4048 
       
  4049         /* This is compiling for real */
       
  4050 
       
  4051         else for (i = repeat_max - 1; i >= 0; i--)
       
  4052           {
       
  4053           uschar *hc;
       
  4054           uschar *this_hwm = cd->hwm;
       
  4055 
       
  4056           *code++ = OP_BRAZERO + repeat_type;
       
  4057 
       
  4058           /* All but the final copy start a new nesting, maintaining the
       
  4059           chain of brackets outstanding. */
       
  4060 
       
  4061           if (i != 0)
       
  4062             {
       
  4063             int offset;
       
  4064             *code++ = OP_BRA;
       
  4065             offset = (bralink == NULL)? 0 : code - bralink;
       
  4066             bralink = code;
       
  4067             PUTINC(code, 0, offset);
       
  4068             }
       
  4069 
       
  4070           memcpy(code, previous, len);
       
  4071           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
       
  4072             {
       
  4073             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
       
  4074             cd->hwm += LINK_SIZE;
       
  4075             }
       
  4076           save_hwm = this_hwm;
       
  4077           code += len;
       
  4078           }
       
  4079 
       
  4080         /* Now chain through the pending brackets, and fill in their length
       
  4081         fields (which are holding the chain links pro tem). */
       
  4082 
       
  4083         while (bralink != NULL)
       
  4084           {
       
  4085           int oldlinkoffset;
       
  4086           int offset = code - bralink + 1;
       
  4087           uschar *bra = code - offset;
       
  4088           oldlinkoffset = GET(bra, 1);
       
  4089           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
       
  4090           *code++ = OP_KET;
       
  4091           PUTINC(code, 0, offset);
       
  4092           PUT(bra, 1, offset);
       
  4093           }
       
  4094         }
       
  4095 
       
  4096       /* If the maximum is unlimited, set a repeater in the final copy. We
       
  4097       can't just offset backwards from the current code point, because we
       
  4098       don't know if there's been an options resetting after the ket. The
       
  4099       correct offset was computed above.
       
  4100 
       
  4101       Then, when we are doing the actual compile phase, check to see whether
       
  4102       this group is a non-atomic one that could match an empty string. If so,
       
  4103       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
       
  4104       that runtime checking can be done. [This check is also applied to
       
  4105       atomic groups at runtime, but in a different way.] */
       
  4106 
       
  4107       else
       
  4108         {
       
  4109         uschar *ketcode = code - ketoffset;
       
  4110         uschar *bracode = ketcode - GET(ketcode, 1);
       
  4111         *ketcode = OP_KETRMAX + repeat_type;
       
  4112         if (lengthptr == NULL && *bracode != OP_ONCE)
       
  4113           {
       
  4114           uschar *scode = bracode;
       
  4115           do
       
  4116             {
       
  4117             if (could_be_empty_branch(scode, ketcode, utf8))
       
  4118               {
       
  4119               *bracode += OP_SBRA - OP_BRA;
       
  4120               break;
       
  4121               }
       
  4122             scode += GET(scode, 1);
       
  4123             }
       
  4124           while (*scode == OP_ALT);
       
  4125           }
       
  4126         }
       
  4127       }
       
  4128 
       
  4129     /* If previous is OP_FAIL, it was generated by an empty class [] in
       
  4130     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
       
  4131     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
       
  4132     error above. We can just ignore the repeat in JS case. */
       
  4133 
       
  4134     else if (*previous == OP_FAIL) goto END_REPEAT;
       
  4135 
       
  4136     /* Else there's some kind of shambles */
       
  4137 
       
  4138     else
       
  4139       {
       
  4140       *errorcodeptr = ERR11;
       
  4141       goto FAILED;
       
  4142       }
       
  4143 
       
  4144     /* If the character following a repeat is '+', or if certain optimization
       
  4145     tests above succeeded, possessive_quantifier is TRUE. For some of the
       
  4146     simpler opcodes, there is a special alternative opcode for this. For
       
  4147     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
       
  4148     The '+' notation is just syntactic sugar, taken from Sun's Java package,
       
  4149     but the special opcodes can optimize it a bit. The repeated item starts at
       
  4150     tempcode, not at previous, which might be the first part of a string whose
       
  4151     (former) last char we repeated.
       
  4152 
       
  4153     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
       
  4154     an 'upto' may follow. We skip over an 'exact' item, and then test the
       
  4155     length of what remains before proceeding. */
       
  4156 
       
  4157     if (possessive_quantifier)
       
  4158       {
       
  4159       int len;
       
  4160       if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
       
  4161           *tempcode == OP_NOTEXACT)
       
  4162         tempcode += _pcre_OP_lengths[*tempcode] +
       
  4163           ((*tempcode == OP_TYPEEXACT &&
       
  4164              (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
       
  4165       len = code - tempcode;
       
  4166       if (len > 0) switch (*tempcode)
       
  4167         {
       
  4168         case OP_STAR:  *tempcode = OP_POSSTAR; break;
       
  4169         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
       
  4170         case OP_QUERY: *tempcode = OP_POSQUERY; break;
       
  4171         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
       
  4172 
       
  4173         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
       
  4174         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
       
  4175         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
       
  4176         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
       
  4177 
       
  4178         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
       
  4179         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
       
  4180         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
       
  4181         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
       
  4182 
       
  4183         default:
       
  4184         memmove(tempcode + 1+LINK_SIZE, tempcode, len);
       
  4185         code += 1 + LINK_SIZE;
       
  4186         len += 1 + LINK_SIZE;
       
  4187         tempcode[0] = OP_ONCE;
       
  4188         *code++ = OP_KET;
       
  4189         PUTINC(code, 0, len);
       
  4190         PUT(tempcode, 1, len);
       
  4191         break;
       
  4192         }
       
  4193       }
       
  4194 
       
  4195     /* In all cases we no longer have a previous item. We also set the
       
  4196     "follows varying string" flag for subsequently encountered reqbytes if
       
  4197     it isn't already set and we have just passed a varying length item. */
       
  4198 
       
  4199     END_REPEAT:
       
  4200     previous = NULL;
       
  4201     cd->req_varyopt |= reqvary;
       
  4202     break;
       
  4203 
       
  4204 
       
  4205     /* ===================================================================*/
       
  4206     /* Start of nested parenthesized sub-expression, or comment or lookahead or
       
  4207     lookbehind or option setting or condition or all the other extended
       
  4208     parenthesis forms.  */
       
  4209 
       
  4210     case '(':
       
  4211     newoptions = options;
       
  4212     skipbytes = 0;
       
  4213     bravalue = OP_CBRA;
       
  4214     save_hwm = cd->hwm;
       
  4215     reset_bracount = FALSE;
       
  4216 
       
  4217     /* First deal with various "verbs" that can be introduced by '*'. */
       
  4218 
       
  4219     if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
       
  4220       {
       
  4221       int i, namelen;
       
  4222       const char *vn = verbnames;
       
  4223       const uschar *name = ++ptr;
       
  4224       previous = NULL;
       
  4225       while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
       
  4226       if (*ptr == ':')
       
  4227         {
       
  4228         *errorcodeptr = ERR59;   /* Not supported */
       
  4229         goto FAILED;
       
  4230         }
       
  4231       if (*ptr != ')')
       
  4232         {
       
  4233         *errorcodeptr = ERR60;
       
  4234         goto FAILED;
       
  4235         }
       
  4236       namelen = ptr - name;
       
  4237       for (i = 0; i < verbcount; i++)
       
  4238         {
       
  4239         if (namelen == verbs[i].len &&
       
  4240             strncmp((char *)name, vn, namelen) == 0)
       
  4241           {
       
  4242           *code = verbs[i].op;
       
  4243           if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
       
  4244           break;
       
  4245           }
       
  4246         vn += verbs[i].len + 1;
       
  4247         }
       
  4248       if (i < verbcount) continue;
       
  4249       *errorcodeptr = ERR60;
       
  4250       goto FAILED;
       
  4251       }
       
  4252 
       
  4253     /* Deal with the extended parentheses; all are introduced by '?', and the
       
  4254     appearance of any of them means that this is not a capturing group. */
       
  4255 
       
  4256     else if (*ptr == '?')
       
  4257       {
       
  4258       int i, set, unset, namelen;
       
  4259       int *optset;
       
  4260       const uschar *name;
       
  4261       uschar *slot;
       
  4262 
       
  4263       switch (*(++ptr))
       
  4264         {
       
  4265         case '#':                 /* Comment; skip to ket */
       
  4266         ptr++;
       
  4267         while (*ptr != 0 && *ptr != ')') ptr++;
       
  4268         if (*ptr == 0)
       
  4269           {
       
  4270           *errorcodeptr = ERR18;
       
  4271           goto FAILED;
       
  4272           }
       
  4273         continue;
       
  4274 
       
  4275 
       
  4276         /* ------------------------------------------------------------ */
       
  4277         case '|':                 /* Reset capture count for each branch */
       
  4278         reset_bracount = TRUE;
       
  4279         /* Fall through */
       
  4280 
       
  4281         /* ------------------------------------------------------------ */
       
  4282         case ':':                 /* Non-capturing bracket */
       
  4283         bravalue = OP_BRA;
       
  4284         ptr++;
       
  4285         break;
       
  4286 
       
  4287 
       
  4288         /* ------------------------------------------------------------ */
       
  4289         case '(':
       
  4290         bravalue = OP_COND;       /* Conditional group */
       
  4291 
       
  4292         /* A condition can be an assertion, a number (referring to a numbered
       
  4293         group), a name (referring to a named group), or 'R', referring to
       
  4294         recursion. R<digits> and R&name are also permitted for recursion tests.
       
  4295 
       
  4296         There are several syntaxes for testing a named group: (?(name)) is used
       
  4297         by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
       
  4298 
       
  4299         There are two unfortunate ambiguities, caused by history. (a) 'R' can
       
  4300         be the recursive thing or the name 'R' (and similarly for 'R' followed
       
  4301         by digits), and (b) a number could be a name that consists of digits.
       
  4302         In both cases, we look for a name first; if not found, we try the other
       
  4303         cases. */
       
  4304 
       
  4305         /* For conditions that are assertions, check the syntax, and then exit
       
  4306         the switch. This will take control down to where bracketed groups,
       
  4307         including assertions, are processed. */
       
  4308 
       
  4309         if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
       
  4310           break;
       
  4311 
       
  4312         /* Most other conditions use OP_CREF (a couple change to OP_RREF
       
  4313         below), and all need to skip 3 bytes at the start of the group. */
       
  4314 
       
  4315         code[1+LINK_SIZE] = OP_CREF;
       
  4316         skipbytes = 3;
       
  4317         refsign = -1;
       
  4318 
       
  4319         /* Check for a test for recursion in a named group. */
       
  4320 
       
  4321         if (ptr[1] == 'R' && ptr[2] == '&')
       
  4322           {
       
  4323           terminator = -1;
       
  4324           ptr += 2;
       
  4325           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
       
  4326           }
       
  4327 
       
  4328         /* Check for a test for a named group's having been set, using the Perl
       
  4329         syntax (?(<name>) or (?('name') */
       
  4330 
       
  4331         else if (ptr[1] == '<')
       
  4332           {
       
  4333           terminator = '>';
       
  4334           ptr++;
       
  4335           }
       
  4336         else if (ptr[1] == '\'')
       
  4337           {
       
  4338           terminator = '\'';
       
  4339           ptr++;
       
  4340           }
       
  4341         else
       
  4342           {
       
  4343           terminator = 0;
       
  4344           if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
       
  4345           }
       
  4346 
       
  4347         /* We now expect to read a name; anything else is an error */
       
  4348 
       
  4349         if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
       
  4350           {
       
  4351           ptr += 1;  /* To get the right offset */
       
  4352           *errorcodeptr = ERR28;
       
  4353           goto FAILED;
       
  4354           }
       
  4355 
       
  4356         /* Read the name, but also get it as a number if it's all digits */
       
  4357 
       
  4358         recno = 0;
       
  4359         name = ++ptr;
       
  4360         while ((cd->ctypes[*ptr] & ctype_word) != 0)
       
  4361           {
       
  4362           if (recno >= 0)
       
  4363             recno = ((digitab[*ptr] & ctype_digit) != 0)?
       
  4364               recno * 10 + *ptr - '0' : -1;
       
  4365           ptr++;
       
  4366           }
       
  4367         namelen = ptr - name;
       
  4368 
       
  4369         if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
       
  4370           {
       
  4371           ptr--;      /* Error offset */
       
  4372           *errorcodeptr = ERR26;
       
  4373           goto FAILED;
       
  4374           }
       
  4375 
       
  4376         /* Do no further checking in the pre-compile phase. */
       
  4377 
       
  4378         if (lengthptr != NULL) break;
       
  4379 
       
  4380         /* In the real compile we do the work of looking for the actual
       
  4381         reference. If the string started with "+" or "-" we require the rest to
       
  4382         be digits, in which case recno will be set. */
       
  4383 
       
  4384         if (refsign > 0)
       
  4385           {
       
  4386           if (recno <= 0)
       
  4387             {
       
  4388             *errorcodeptr = ERR58;
       
  4389             goto FAILED;
       
  4390             }
       
  4391           recno = (refsign == '-')?
       
  4392             cd->bracount - recno + 1 : recno +cd->bracount;
       
  4393           if (recno <= 0 || recno > cd->final_bracount)
       
  4394             {
       
  4395             *errorcodeptr = ERR15;
       
  4396             goto FAILED;
       
  4397             }
       
  4398           PUT2(code, 2+LINK_SIZE, recno);
       
  4399           break;
       
  4400           }
       
  4401 
       
  4402         /* Otherwise (did not start with "+" or "-"), start by looking for the
       
  4403         name. */
       
  4404 
       
  4405         slot = cd->name_table;
       
  4406         for (i = 0; i < cd->names_found; i++)
       
  4407           {
       
  4408           if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
       
  4409           slot += cd->name_entry_size;
       
  4410           }
       
  4411 
       
  4412         /* Found a previous named subpattern */
       
  4413 
       
  4414         if (i < cd->names_found)
       
  4415           {
       
  4416           recno = GET2(slot, 0);
       
  4417           PUT2(code, 2+LINK_SIZE, recno);
       
  4418           }
       
  4419 
       
  4420         /* Search the pattern for a forward reference */
       
  4421 
       
  4422         else if ((i = find_parens(ptr, cd, name, namelen,
       
  4423                         (options & PCRE_EXTENDED) != 0)) > 0)
       
  4424           {
       
  4425           PUT2(code, 2+LINK_SIZE, i);
       
  4426           }
       
  4427 
       
  4428         /* If terminator == 0 it means that the name followed directly after
       
  4429         the opening parenthesis [e.g. (?(abc)...] and in this case there are
       
  4430         some further alternatives to try. For the cases where terminator != 0
       
  4431         [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
       
  4432         now checked all the possibilities, so give an error. */
       
  4433 
       
  4434         else if (terminator != 0)
       
  4435           {
       
  4436           *errorcodeptr = ERR15;
       
  4437           goto FAILED;
       
  4438           }
       
  4439 
       
  4440         /* Check for (?(R) for recursion. Allow digits after R to specify a
       
  4441         specific group number. */
       
  4442 
       
  4443         else if (*name == 'R')
       
  4444           {
       
  4445           recno = 0;
       
  4446           for (i = 1; i < namelen; i++)
       
  4447             {
       
  4448             if ((digitab[name[i]] & ctype_digit) == 0)
       
  4449               {
       
  4450               *errorcodeptr = ERR15;
       
  4451               goto FAILED;
       
  4452               }
       
  4453             recno = recno * 10 + name[i] - '0';
       
  4454             }
       
  4455           if (recno == 0) recno = RREF_ANY;
       
  4456           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
       
  4457           PUT2(code, 2+LINK_SIZE, recno);
       
  4458           }
       
  4459 
       
  4460         /* Similarly, check for the (?(DEFINE) "condition", which is always
       
  4461         false. */
       
  4462 
       
  4463         else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
       
  4464           {
       
  4465           code[1+LINK_SIZE] = OP_DEF;
       
  4466           skipbytes = 1;
       
  4467           }
       
  4468 
       
  4469         /* Check for the "name" actually being a subpattern number. We are
       
  4470         in the second pass here, so final_bracount is set. */
       
  4471 
       
  4472         else if (recno > 0 && recno <= cd->final_bracount)
       
  4473           {
       
  4474           PUT2(code, 2+LINK_SIZE, recno);
       
  4475           }
       
  4476 
       
  4477         /* Either an unidentified subpattern, or a reference to (?(0) */
       
  4478 
       
  4479         else
       
  4480           {
       
  4481           *errorcodeptr = (recno == 0)? ERR35: ERR15;
       
  4482           goto FAILED;
       
  4483           }
       
  4484         break;
       
  4485 
       
  4486 
       
  4487         /* ------------------------------------------------------------ */
       
  4488         case '=':                 /* Positive lookahead */
       
  4489         bravalue = OP_ASSERT;
       
  4490         ptr++;
       
  4491         break;
       
  4492 
       
  4493 
       
  4494         /* ------------------------------------------------------------ */
       
  4495         case '!':                 /* Negative lookahead */
       
  4496         ptr++;
       
  4497         if (*ptr == ')')          /* Optimize (?!) */
       
  4498           {
       
  4499           *code++ = OP_FAIL;
       
  4500           previous = NULL;
       
  4501           continue;
       
  4502           }
       
  4503         bravalue = OP_ASSERT_NOT;
       
  4504         break;
       
  4505 
       
  4506 
       
  4507         /* ------------------------------------------------------------ */
       
  4508         case '<':                 /* Lookbehind or named define */
       
  4509         switch (ptr[1])
       
  4510           {
       
  4511           case '=':               /* Positive lookbehind */
       
  4512           bravalue = OP_ASSERTBACK;
       
  4513           ptr += 2;
       
  4514           break;
       
  4515 
       
  4516           case '!':               /* Negative lookbehind */
       
  4517           bravalue = OP_ASSERTBACK_NOT;
       
  4518           ptr += 2;
       
  4519           break;
       
  4520 
       
  4521           default:                /* Could be name define, else bad */
       
  4522           if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
       
  4523           ptr++;                  /* Correct offset for error */
       
  4524           *errorcodeptr = ERR24;
       
  4525           goto FAILED;
       
  4526           }
       
  4527         break;
       
  4528 
       
  4529 
       
  4530         /* ------------------------------------------------------------ */
       
  4531         case '>':                 /* One-time brackets */
       
  4532         bravalue = OP_ONCE;
       
  4533         ptr++;
       
  4534         break;
       
  4535 
       
  4536 
       
  4537         /* ------------------------------------------------------------ */
       
  4538         case 'C':                 /* Callout - may be followed by digits; */
       
  4539         previous_callout = code;  /* Save for later completion */
       
  4540         after_manual_callout = 1; /* Skip one item before completing */
       
  4541         *code++ = OP_CALLOUT;
       
  4542           {
       
  4543           int n = 0;
       
  4544           while ((digitab[*(++ptr)] & ctype_digit) != 0)
       
  4545             n = n * 10 + *ptr - '0';
       
  4546           if (*ptr != ')')
       
  4547             {
       
  4548             *errorcodeptr = ERR39;
       
  4549             goto FAILED;
       
  4550             }
       
  4551           if (n > 255)
       
  4552             {
       
  4553             *errorcodeptr = ERR38;
       
  4554             goto FAILED;
       
  4555             }
       
  4556           *code++ = n;
       
  4557           PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
       
  4558           PUT(code, LINK_SIZE, 0);                    /* Default length */
       
  4559           code += 2 * LINK_SIZE;
       
  4560           }
       
  4561         previous = NULL;
       
  4562         continue;
       
  4563 
       
  4564 
       
  4565         /* ------------------------------------------------------------ */
       
  4566         case 'P':                 /* Python-style named subpattern handling */
       
  4567         if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
       
  4568           {
       
  4569           is_recurse = *ptr == '>';
       
  4570           terminator = ')';
       
  4571           goto NAMED_REF_OR_RECURSE;
       
  4572           }
       
  4573         else if (*ptr != '<')    /* Test for Python-style definition */
       
  4574           {
       
  4575           *errorcodeptr = ERR41;
       
  4576           goto FAILED;
       
  4577           }
       
  4578         /* Fall through to handle (?P< as (?< is handled */
       
  4579 
       
  4580 
       
  4581         /* ------------------------------------------------------------ */
       
  4582         DEFINE_NAME:    /* Come here from (?< handling */
       
  4583         case '\'':
       
  4584           {
       
  4585           terminator = (*ptr == '<')? '>' : '\'';
       
  4586           name = ++ptr;
       
  4587 
       
  4588           while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
       
  4589           namelen = ptr - name;
       
  4590 
       
  4591           /* In the pre-compile phase, just do a syntax check. */
       
  4592 
       
  4593           if (lengthptr != NULL)
       
  4594             {
       
  4595             if (*ptr != terminator)
       
  4596               {
       
  4597               *errorcodeptr = ERR42;
       
  4598               goto FAILED;
       
  4599               }
       
  4600             if (cd->names_found >= MAX_NAME_COUNT)
       
  4601               {
       
  4602               *errorcodeptr = ERR49;
       
  4603               goto FAILED;
       
  4604               }
       
  4605             if (namelen + 3 > cd->name_entry_size)
       
  4606               {
       
  4607               cd->name_entry_size = namelen + 3;
       
  4608               if (namelen > MAX_NAME_SIZE)
       
  4609                 {
       
  4610                 *errorcodeptr = ERR48;
       
  4611                 goto FAILED;
       
  4612                 }
       
  4613               }
       
  4614             }
       
  4615 
       
  4616           /* In the real compile, create the entry in the table */
       
  4617 
       
  4618           else
       
  4619             {
       
  4620             slot = cd->name_table;
       
  4621             for (i = 0; i < cd->names_found; i++)
       
  4622               {
       
  4623               int crc = memcmp(name, slot+2, namelen);
       
  4624               if (crc == 0)
       
  4625                 {
       
  4626                 if (slot[2+namelen] == 0)
       
  4627                   {
       
  4628                   if ((options & PCRE_DUPNAMES) == 0)
       
  4629                     {
       
  4630                     *errorcodeptr = ERR43;
       
  4631                     goto FAILED;
       
  4632                     }
       
  4633                   }
       
  4634                 else crc = -1;      /* Current name is substring */
       
  4635                 }
       
  4636               if (crc < 0)
       
  4637                 {
       
  4638                 memmove(slot + cd->name_entry_size, slot,
       
  4639                   (cd->names_found - i) * cd->name_entry_size);
       
  4640                 break;
       
  4641                 }
       
  4642               slot += cd->name_entry_size;
       
  4643               }
       
  4644 
       
  4645             PUT2(slot, 0, cd->bracount + 1);
       
  4646             memcpy(slot + 2, name, namelen);
       
  4647             slot[2+namelen] = 0;
       
  4648             }
       
  4649           }
       
  4650 
       
  4651         /* In both cases, count the number of names we've encountered. */
       
  4652 
       
  4653         ptr++;                    /* Move past > or ' */
       
  4654         cd->names_found++;
       
  4655         goto NUMBERED_GROUP;
       
  4656 
       
  4657 
       
  4658         /* ------------------------------------------------------------ */
       
  4659         case '&':                 /* Perl recursion/subroutine syntax */
       
  4660         terminator = ')';
       
  4661         is_recurse = TRUE;
       
  4662         /* Fall through */
       
  4663 
       
  4664         /* We come here from the Python syntax above that handles both
       
  4665         references (?P=name) and recursion (?P>name), as well as falling
       
  4666         through from the Perl recursion syntax (?&name). We also come here from
       
  4667         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
       
  4668         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
       
  4669 
       
  4670         NAMED_REF_OR_RECURSE:
       
  4671         name = ++ptr;
       
  4672         while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
       
  4673         namelen = ptr - name;
       
  4674 
       
  4675         /* In the pre-compile phase, do a syntax check and set a dummy
       
  4676         reference number. */
       
  4677 
       
  4678         if (lengthptr != NULL)
       
  4679           {
       
  4680           if (namelen == 0)
       
  4681             {
       
  4682             *errorcodeptr = ERR62;
       
  4683             goto FAILED;
       
  4684             }
       
  4685           if (*ptr != terminator)
       
  4686             {
       
  4687             *errorcodeptr = ERR42;
       
  4688             goto FAILED;
       
  4689             }
       
  4690           if (namelen > MAX_NAME_SIZE)
       
  4691             {
       
  4692             *errorcodeptr = ERR48;
       
  4693             goto FAILED;
       
  4694             }
       
  4695           recno = 0;
       
  4696           }
       
  4697 
       
  4698         /* In the real compile, seek the name in the table. We check the name
       
  4699         first, and then check that we have reached the end of the name in the
       
  4700         table. That way, if the name is longer than any in the table,
       
  4701         the comparison will fail without reading beyond the table entry. */
       
  4702 
       
  4703         else
       
  4704           {
       
  4705           slot = cd->name_table;
       
  4706           for (i = 0; i < cd->names_found; i++)
       
  4707             {
       
  4708             if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
       
  4709                 slot[2+namelen] == 0)
       
  4710               break;
       
  4711             slot += cd->name_entry_size;
       
  4712             }
       
  4713 
       
  4714           if (i < cd->names_found)         /* Back reference */
       
  4715             {
       
  4716             recno = GET2(slot, 0);
       
  4717             }
       
  4718           else if ((recno =                /* Forward back reference */
       
  4719                     find_parens(ptr, cd, name, namelen,
       
  4720                       (options & PCRE_EXTENDED) != 0)) <= 0)
       
  4721             {
       
  4722             *errorcodeptr = ERR15;
       
  4723             goto FAILED;
       
  4724             }
       
  4725           }
       
  4726 
       
  4727         /* In both phases, we can now go to the code that handles numerical
       
  4728         recursion or backreferences. */
       
  4729 
       
  4730         if (is_recurse) goto HANDLE_RECURSION;
       
  4731           else goto HANDLE_REFERENCE;
       
  4732 
       
  4733 
       
  4734         /* ------------------------------------------------------------ */
       
  4735         case 'R':                 /* Recursion */
       
  4736         ptr++;                    /* Same as (?0)      */
       
  4737         /* Fall through */
       
  4738 
       
  4739 
       
  4740         /* ------------------------------------------------------------ */
       
  4741         case '-': case '+':
       
  4742         case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
       
  4743         case '5': case '6': case '7': case '8': case '9':   /* subroutine */
       
  4744           {
       
  4745           const uschar *called;
       
  4746           terminator = ')';
       
  4747 
       
  4748           /* Come here from the \g<...> and \g'...' code (Oniguruma
       
  4749           compatibility). However, the syntax has been checked to ensure that
       
  4750           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
       
  4751           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
       
  4752           ever be taken. */
       
  4753 
       
  4754           HANDLE_NUMERICAL_RECURSION:
       
  4755 
       
  4756           if ((refsign = *ptr) == '+')
       
  4757             {
       
  4758             ptr++;
       
  4759             if ((digitab[*ptr] & ctype_digit) == 0)
       
  4760               {
       
  4761               *errorcodeptr = ERR63;
       
  4762               goto FAILED;
       
  4763               }
       
  4764             }
       
  4765           else if (refsign == '-')
       
  4766             {
       
  4767             if ((digitab[ptr[1]] & ctype_digit) == 0)
       
  4768               goto OTHER_CHAR_AFTER_QUERY;
       
  4769             ptr++;
       
  4770             }
       
  4771 
       
  4772           recno = 0;
       
  4773           while((digitab[*ptr] & ctype_digit) != 0)
       
  4774             recno = recno * 10 + *ptr++ - '0';
       
  4775 
       
  4776           if (*ptr != terminator)
       
  4777             {
       
  4778             *errorcodeptr = ERR29;
       
  4779             goto FAILED;
       
  4780             }
       
  4781 
       
  4782           if (refsign == '-')
       
  4783             {
       
  4784             if (recno == 0)
       
  4785               {
       
  4786               *errorcodeptr = ERR58;
       
  4787               goto FAILED;
       
  4788               }
       
  4789             recno = cd->bracount - recno + 1;
       
  4790             if (recno <= 0)
       
  4791               {
       
  4792               *errorcodeptr = ERR15;
       
  4793               goto FAILED;
       
  4794               }
       
  4795             }
       
  4796           else if (refsign == '+')
       
  4797             {
       
  4798             if (recno == 0)
       
  4799               {
       
  4800               *errorcodeptr = ERR58;
       
  4801               goto FAILED;
       
  4802               }
       
  4803             recno += cd->bracount;
       
  4804             }
       
  4805 
       
  4806           /* Come here from code above that handles a named recursion */
       
  4807 
       
  4808           HANDLE_RECURSION:
       
  4809 
       
  4810           previous = code;
       
  4811           called = cd->start_code;
       
  4812 
       
  4813           /* When we are actually compiling, find the bracket that is being
       
  4814           referenced. Temporarily end the regex in case it doesn't exist before
       
  4815           this point. If we end up with a forward reference, first check that
       
  4816           the bracket does occur later so we can give the error (and position)
       
  4817           now. Then remember this forward reference in the workspace so it can
       
  4818           be filled in at the end. */
       
  4819 
       
  4820           if (lengthptr == NULL)
       
  4821             {
       
  4822             *code = OP_END;
       
  4823             if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
       
  4824 
       
  4825             /* Forward reference */
       
  4826 
       
  4827             if (called == NULL)
       
  4828               {
       
  4829               if (find_parens(ptr, cd, NULL, recno,
       
  4830                     (options & PCRE_EXTENDED) != 0) < 0)
       
  4831                 {
       
  4832                 *errorcodeptr = ERR15;
       
  4833                 goto FAILED;
       
  4834                 }
       
  4835               called = cd->start_code + recno;
       
  4836               PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
       
  4837               }
       
  4838 
       
  4839             /* If not a forward reference, and the subpattern is still open,
       
  4840             this is a recursive call. We check to see if this is a left
       
  4841             recursion that could loop for ever, and diagnose that case. */
       
  4842 
       
  4843             else if (GET(called, 1) == 0 &&
       
  4844                      could_be_empty(called, code, bcptr, utf8))
       
  4845               {
       
  4846               *errorcodeptr = ERR40;
       
  4847               goto FAILED;
       
  4848               }
       
  4849             }
       
  4850 
       
  4851           /* Insert the recursion/subroutine item, automatically wrapped inside
       
  4852           "once" brackets. Set up a "previous group" length so that a
       
  4853           subsequent quantifier will work. */
       
  4854 
       
  4855           *code = OP_ONCE;
       
  4856           PUT(code, 1, 2 + 2*LINK_SIZE);
       
  4857           code += 1 + LINK_SIZE;
       
  4858 
       
  4859           *code = OP_RECURSE;
       
  4860           PUT(code, 1, called - cd->start_code);
       
  4861           code += 1 + LINK_SIZE;
       
  4862 
       
  4863           *code = OP_KET;
       
  4864           PUT(code, 1, 2 + 2*LINK_SIZE);
       
  4865           code += 1 + LINK_SIZE;
       
  4866 
       
  4867           length_prevgroup = 3 + 3*LINK_SIZE;
       
  4868           }
       
  4869 
       
  4870         /* Can't determine a first byte now */
       
  4871 
       
  4872         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
       
  4873         continue;
       
  4874 
       
  4875 
       
  4876         /* ------------------------------------------------------------ */
       
  4877         default:              /* Other characters: check option setting */
       
  4878         OTHER_CHAR_AFTER_QUERY:
       
  4879         set = unset = 0;
       
  4880         optset = &set;
       
  4881 
       
  4882         while (*ptr != ')' && *ptr != ':')
       
  4883           {
       
  4884           switch (*ptr++)
       
  4885             {
       
  4886             case '-': optset = &unset; break;
       
  4887 
       
  4888             case 'J':    /* Record that it changed in the external options */
       
  4889             *optset |= PCRE_DUPNAMES;
       
  4890             cd->external_flags |= PCRE_JCHANGED;
       
  4891             break;
       
  4892 
       
  4893             case 'i': *optset |= PCRE_CASELESS; break;
       
  4894             case 'm': *optset |= PCRE_MULTILINE; break;
       
  4895             case 's': *optset |= PCRE_DOTALL; break;
       
  4896             case 'x': *optset |= PCRE_EXTENDED; break;
       
  4897             case 'U': *optset |= PCRE_UNGREEDY; break;
       
  4898             case 'X': *optset |= PCRE_EXTRA; break;
       
  4899 
       
  4900             default:  *errorcodeptr = ERR12;
       
  4901                       ptr--;    /* Correct the offset */
       
  4902                       goto FAILED;
       
  4903             }
       
  4904           }
       
  4905 
       
  4906         /* Set up the changed option bits, but don't change anything yet. */
       
  4907 
       
  4908         newoptions = (options | set) & (~unset);
       
  4909 
       
  4910         /* If the options ended with ')' this is not the start of a nested
       
  4911         group with option changes, so the options change at this level. If this
       
  4912         item is right at the start of the pattern, the options can be
       
  4913         abstracted and made external in the pre-compile phase, and ignored in
       
  4914         the compile phase. This can be helpful when matching -- for instance in
       
  4915         caseless checking of required bytes.
       
  4916 
       
  4917         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
       
  4918         definitely *not* at the start of the pattern because something has been
       
  4919         compiled. In the pre-compile phase, however, the code pointer can have
       
  4920         that value after the start, because it gets reset as code is discarded
       
  4921         during the pre-compile. However, this can happen only at top level - if
       
  4922         we are within parentheses, the starting BRA will still be present. At
       
  4923         any parenthesis level, the length value can be used to test if anything
       
  4924         has been compiled at that level. Thus, a test for both these conditions
       
  4925         is necessary to ensure we correctly detect the start of the pattern in
       
  4926         both phases.
       
  4927 
       
  4928         If we are not at the pattern start, compile code to change the ims
       
  4929         options if this setting actually changes any of them, and reset the
       
  4930         greedy defaults and the case value for firstbyte and reqbyte. */
       
  4931 
       
  4932         if (*ptr == ')')
       
  4933           {
       
  4934           if (code == cd->start_code + 1 + LINK_SIZE &&
       
  4935                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
       
  4936             {
       
  4937             cd->external_options = newoptions;
       
  4938             }
       
  4939          else
       
  4940             {
       
  4941             if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
       
  4942               {
       
  4943               *code++ = OP_OPT;
       
  4944               *code++ = newoptions & PCRE_IMS;
       
  4945               }
       
  4946             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
       
  4947             greedy_non_default = greedy_default ^ 1;
       
  4948             req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
       
  4949             }
       
  4950 
       
  4951           /* Change options at this level, and pass them back for use
       
  4952           in subsequent branches. When not at the start of the pattern, this
       
  4953           information is also necessary so that a resetting item can be
       
  4954           compiled at the end of a group (if we are in a group). */
       
  4955 
       
  4956           *optionsptr = options = newoptions;
       
  4957           previous = NULL;       /* This item can't be repeated */
       
  4958           continue;              /* It is complete */
       
  4959           }
       
  4960 
       
  4961         /* If the options ended with ':' we are heading into a nested group
       
  4962         with possible change of options. Such groups are non-capturing and are
       
  4963         not assertions of any kind. All we need to do is skip over the ':';
       
  4964         the newoptions value is handled below. */
       
  4965 
       
  4966         bravalue = OP_BRA;
       
  4967         ptr++;
       
  4968         }     /* End of switch for character following (? */
       
  4969       }       /* End of (? handling */
       
  4970 
       
  4971     /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
       
  4972     all unadorned brackets become non-capturing and behave like (?:...)
       
  4973     brackets. */
       
  4974 
       
  4975     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
       
  4976       {
       
  4977       bravalue = OP_BRA;
       
  4978       }
       
  4979 
       
  4980     /* Else we have a capturing group. */
       
  4981 
       
  4982     else
       
  4983       {
       
  4984       NUMBERED_GROUP:
       
  4985       cd->bracount += 1;
       
  4986       PUT2(code, 1+LINK_SIZE, cd->bracount);
       
  4987       skipbytes = 2;
       
  4988       }
       
  4989 
       
  4990     /* Process nested bracketed regex. Assertions may not be repeated, but
       
  4991     other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
       
  4992     non-register variable in order to be able to pass its address because some
       
  4993     compilers complain otherwise. Pass in a new setting for the ims options if
       
  4994     they have changed. */
       
  4995 
       
  4996     previous = (bravalue >= OP_ONCE)? code : NULL;
       
  4997     *code = bravalue;
       
  4998     tempcode = code;
       
  4999     tempreqvary = cd->req_varyopt;     /* Save value before bracket */
       
  5000     length_prevgroup = 0;              /* Initialize for pre-compile phase */
       
  5001 
       
  5002     if (!compile_regex(
       
  5003          newoptions,                   /* The complete new option state */
       
  5004          options & PCRE_IMS,           /* The previous ims option state */
       
  5005          &tempcode,                    /* Where to put code (updated) */
       
  5006          &ptr,                         /* Input pointer (updated) */
       
  5007          errorcodeptr,                 /* Where to put an error message */
       
  5008          (bravalue == OP_ASSERTBACK ||
       
  5009           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
       
  5010          reset_bracount,               /* True if (?| group */
       
  5011          skipbytes,                    /* Skip over bracket number */
       
  5012          &subfirstbyte,                /* For possible first char */
       
  5013          &subreqbyte,                  /* For possible last char */
       
  5014          bcptr,                        /* Current branch chain */
       
  5015          cd,                           /* Tables block */
       
  5016          (lengthptr == NULL)? NULL :   /* Actual compile phase */
       
  5017            &length_prevgroup           /* Pre-compile phase */
       
  5018          ))
       
  5019       goto FAILED;
       
  5020 
       
  5021     /* At the end of compiling, code is still pointing to the start of the
       
  5022     group, while tempcode has been updated to point past the end of the group
       
  5023     and any option resetting that may follow it. The pattern pointer (ptr)
       
  5024     is on the bracket. */
       
  5025 
       
  5026     /* If this is a conditional bracket, check that there are no more than
       
  5027     two branches in the group, or just one if it's a DEFINE group. We do this
       
  5028     in the real compile phase, not in the pre-pass, where the whole group may
       
  5029     not be available. */
       
  5030 
       
  5031     if (bravalue == OP_COND && lengthptr == NULL)
       
  5032       {
       
  5033       uschar *tc = code;
       
  5034       int condcount = 0;
       
  5035 
       
  5036       do {
       
  5037          condcount++;
       
  5038          tc += GET(tc,1);
       
  5039          }
       
  5040       while (*tc != OP_KET);
       
  5041 
       
  5042       /* A DEFINE group is never obeyed inline (the "condition" is always
       
  5043       false). It must have only one branch. */
       
  5044 
       
  5045       if (code[LINK_SIZE+1] == OP_DEF)
       
  5046         {
       
  5047         if (condcount > 1)
       
  5048           {
       
  5049           *errorcodeptr = ERR54;
       
  5050           goto FAILED;
       
  5051           }
       
  5052         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
       
  5053         }
       
  5054 
       
  5055       /* A "normal" conditional group. If there is just one branch, we must not
       
  5056       make use of its firstbyte or reqbyte, because this is equivalent to an
       
  5057       empty second branch. */
       
  5058 
       
  5059       else
       
  5060         {
       
  5061         if (condcount > 2)
       
  5062           {
       
  5063           *errorcodeptr = ERR27;
       
  5064           goto FAILED;
       
  5065           }
       
  5066         if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
       
  5067         }
       
  5068       }
       
  5069 
       
  5070     /* Error if hit end of pattern */
       
  5071 
       
  5072     if (*ptr != ')')
       
  5073       {
       
  5074       *errorcodeptr = ERR14;
       
  5075       goto FAILED;
       
  5076       }
       
  5077 
       
  5078     /* In the pre-compile phase, update the length by the length of the group,
       
  5079     less the brackets at either end. Then reduce the compiled code to just a
       
  5080     set of non-capturing brackets so that it doesn't use much memory if it is
       
  5081     duplicated by a quantifier.*/
       
  5082 
       
  5083     if (lengthptr != NULL)
       
  5084       {
       
  5085       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
       
  5086         {
       
  5087         *errorcodeptr = ERR20;
       
  5088         goto FAILED;
       
  5089         }
       
  5090       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
       
  5091       *code++ = OP_BRA;
       
  5092       PUTINC(code, 0, 1 + LINK_SIZE);
       
  5093       *code++ = OP_KET;
       
  5094       PUTINC(code, 0, 1 + LINK_SIZE);
       
  5095       break;    /* No need to waste time with special character handling */
       
  5096       }
       
  5097 
       
  5098     /* Otherwise update the main code pointer to the end of the group. */
       
  5099 
       
  5100     code = tempcode;
       
  5101 
       
  5102     /* For a DEFINE group, required and first character settings are not
       
  5103     relevant. */
       
  5104 
       
  5105     if (bravalue == OP_DEF) break;
       
  5106 
       
  5107     /* Handle updating of the required and first characters for other types of
       
  5108     group. Update for normal brackets of all kinds, and conditions with two
       
  5109     branches (see code above). If the bracket is followed by a quantifier with
       
  5110     zero repeat, we have to back off. Hence the definition of zeroreqbyte and
       
  5111     zerofirstbyte outside the main loop so that they can be accessed for the
       
  5112     back off. */
       
  5113 
       
  5114     zeroreqbyte = reqbyte;
       
  5115     zerofirstbyte = firstbyte;
       
  5116     groupsetfirstbyte = FALSE;
       
  5117 
       
  5118     if (bravalue >= OP_ONCE)
       
  5119       {
       
  5120       /* If we have not yet set a firstbyte in this branch, take it from the
       
  5121       subpattern, remembering that it was set here so that a repeat of more
       
  5122       than one can replicate it as reqbyte if necessary. If the subpattern has
       
  5123       no firstbyte, set "none" for the whole branch. In both cases, a zero
       
  5124       repeat forces firstbyte to "none". */
       
  5125 
       
  5126       if (firstbyte == REQ_UNSET)
       
  5127         {
       
  5128         if (subfirstbyte >= 0)
       
  5129           {
       
  5130           firstbyte = subfirstbyte;
       
  5131           groupsetfirstbyte = TRUE;
       
  5132           }
       
  5133         else firstbyte = REQ_NONE;
       
  5134         zerofirstbyte = REQ_NONE;
       
  5135         }
       
  5136 
       
  5137       /* If firstbyte was previously set, convert the subpattern's firstbyte
       
  5138       into reqbyte if there wasn't one, using the vary flag that was in
       
  5139       existence beforehand. */
       
  5140 
       
  5141       else if (subfirstbyte >= 0 && subreqbyte < 0)
       
  5142         subreqbyte = subfirstbyte | tempreqvary;
       
  5143 
       
  5144       /* If the subpattern set a required byte (or set a first byte that isn't
       
  5145       really the first byte - see above), set it. */
       
  5146 
       
  5147       if (subreqbyte >= 0) reqbyte = subreqbyte;
       
  5148       }
       
  5149 
       
  5150     /* For a forward assertion, we take the reqbyte, if set. This can be
       
  5151     helpful if the pattern that follows the assertion doesn't set a different
       
  5152     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
       
  5153     for an assertion, however, because it has the wrong effect for patterns
       
  5154     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
       
  5155     of a firstbyte. This is overcome by a scan at the end if there's no
       
  5156     firstbyte, looking for an asserted first char. */
       
  5157 
       
  5158     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
       
  5159     break;     /* End of processing '(' */
       
  5160 
       
  5161 
       
  5162     /* ===================================================================*/
       
  5163     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
       
  5164     are arranged to be the negation of the corresponding OP_values. For the
       
  5165     back references, the values are ESC_REF plus the reference number. Only
       
  5166     back references and those types that consume a character may be repeated.
       
  5167     We can test for values between ESC_b and ESC_Z for the latter; this may
       
  5168     have to change if any new ones are ever created. */
       
  5169 
       
  5170     case '\\':
       
  5171     tempptr = ptr;
       
  5172     c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
       
  5173     if (*errorcodeptr != 0) goto FAILED;
       
  5174 
       
  5175     if (c < 0)
       
  5176       {
       
  5177       if (-c == ESC_Q)            /* Handle start of quoted string */
       
  5178         {
       
  5179         if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
       
  5180           else inescq = TRUE;
       
  5181         continue;
       
  5182         }
       
  5183 
       
  5184       if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
       
  5185 
       
  5186       /* For metasequences that actually match a character, we disable the
       
  5187       setting of a first character if it hasn't already been set. */
       
  5188 
       
  5189       if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
       
  5190         firstbyte = REQ_NONE;
       
  5191 
       
  5192       /* Set values to reset to if this is followed by a zero repeat. */
       
  5193 
       
  5194       zerofirstbyte = firstbyte;
       
  5195       zeroreqbyte = reqbyte;
       
  5196 
       
  5197       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
       
  5198       is a subroutine call by number (Oniguruma syntax). In fact, the value
       
  5199       -ESC_g is returned only for these cases. So we don't need to check for <
       
  5200       or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
       
  5201       -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
       
  5202       that is a synonym for a named back reference). */
       
  5203 
       
  5204       if (-c == ESC_g)
       
  5205         {
       
  5206         const uschar *p;
       
  5207         save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
       
  5208         terminator = (*(++ptr) == '<')? '>' : '\'';
       
  5209 
       
  5210         /* These two statements stop the compiler from warning about possibly
       
  5211         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
       
  5212         fact, because we actually check for a number below, the paths that
       
  5213         would actually be in error are never taken. */
       
  5214 
       
  5215         skipbytes = 0;
       
  5216         reset_bracount = FALSE;
       
  5217 
       
  5218         /* Test for a name */
       
  5219 
       
  5220         if (ptr[1] != '+' && ptr[1] != '-')
       
  5221           {
       
  5222           BOOL isnumber = TRUE;
       
  5223           for (p = ptr + 1; *p != 0 && *p != terminator; p++)
       
  5224             {
       
  5225             if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
       
  5226             if ((cd->ctypes[*p] & ctype_word) == 0) break;
       
  5227             }
       
  5228           if (*p != terminator)
       
  5229             {
       
  5230             *errorcodeptr = ERR57;
       
  5231             break;
       
  5232             }
       
  5233           if (isnumber)
       
  5234             {
       
  5235             ptr++;
       
  5236             goto HANDLE_NUMERICAL_RECURSION;
       
  5237             }
       
  5238           is_recurse = TRUE;
       
  5239           goto NAMED_REF_OR_RECURSE;
       
  5240           }
       
  5241 
       
  5242         /* Test a signed number in angle brackets or quotes. */
       
  5243 
       
  5244         p = ptr + 2;
       
  5245         while ((digitab[*p] & ctype_digit) != 0) p++;
       
  5246         if (*p != terminator)
       
  5247           {
       
  5248           *errorcodeptr = ERR57;
       
  5249           break;
       
  5250           }
       
  5251         ptr++;
       
  5252         goto HANDLE_NUMERICAL_RECURSION;
       
  5253         }
       
  5254 
       
  5255       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
       
  5256       We also support \k{name} (.NET syntax) */
       
  5257 
       
  5258       if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
       
  5259         {
       
  5260         is_recurse = FALSE;
       
  5261         terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
       
  5262         goto NAMED_REF_OR_RECURSE;
       
  5263         }
       
  5264 
       
  5265       /* Back references are handled specially; must disable firstbyte if
       
  5266       not set to cope with cases like (?=(\w+))\1: which would otherwise set
       
  5267       ':' later. */
       
  5268 
       
  5269       if (-c >= ESC_REF)
       
  5270         {
       
  5271         recno = -c - ESC_REF;
       
  5272 
       
  5273         HANDLE_REFERENCE:    /* Come here from named backref handling */
       
  5274         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
       
  5275         previous = code;
       
  5276         *code++ = OP_REF;
       
  5277         PUT2INC(code, 0, recno);
       
  5278         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
       
  5279         if (recno > cd->top_backref) cd->top_backref = recno;
       
  5280         }
       
  5281 
       
  5282       /* So are Unicode property matches, if supported. */
       
  5283 
       
  5284 #ifdef SUPPORT_UCP
       
  5285       else if (-c == ESC_P || -c == ESC_p)
       
  5286         {
       
  5287         BOOL negated;
       
  5288         int pdata;
       
  5289         int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
       
  5290         if (ptype < 0) goto FAILED;
       
  5291         previous = code;
       
  5292         *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
       
  5293         *code++ = ptype;
       
  5294         *code++ = pdata;
       
  5295         }
       
  5296 #else
       
  5297 
       
  5298       /* If Unicode properties are not supported, \X, \P, and \p are not
       
  5299       allowed. */
       
  5300 
       
  5301       else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
       
  5302         {
       
  5303         *errorcodeptr = ERR45;
       
  5304         goto FAILED;
       
  5305         }
       
  5306 #endif
       
  5307 
       
  5308       /* For the rest (including \X when Unicode properties are supported), we
       
  5309       can obtain the OP value by negating the escape value. */
       
  5310 
       
  5311       else
       
  5312         {
       
  5313         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
       
  5314         *code++ = -c;
       
  5315         }
       
  5316       continue;
       
  5317       }
       
  5318 
       
  5319     /* We have a data character whose value is in c. In UTF-8 mode it may have
       
  5320     a value > 127. We set its representation in the length/buffer, and then
       
  5321     handle it as a data character. */
       
  5322 
       
  5323 #ifdef SUPPORT_UTF8
       
  5324     if (utf8 && c > 127)
       
  5325       mclength = _pcre_ord2utf8(c, mcbuffer);
       
  5326     else
       
  5327 #endif
       
  5328 
       
  5329      {
       
  5330      mcbuffer[0] = c;
       
  5331      mclength = 1;
       
  5332      }
       
  5333     goto ONE_CHAR;
       
  5334 
       
  5335 
       
  5336     /* ===================================================================*/
       
  5337     /* Handle a literal character. It is guaranteed not to be whitespace or #
       
  5338     when the extended flag is set. If we are in UTF-8 mode, it may be a
       
  5339     multi-byte literal character. */
       
  5340 
       
  5341     default:
       
  5342     NORMAL_CHAR:
       
  5343     mclength = 1;
       
  5344     mcbuffer[0] = c;
       
  5345 
       
  5346 #ifdef SUPPORT_UTF8
       
  5347     if (utf8 && c >= 0xc0)
       
  5348       {
       
  5349       while ((ptr[1] & 0xc0) == 0x80)
       
  5350         mcbuffer[mclength++] = *(++ptr);
       
  5351       }
       
  5352 #endif
       
  5353 
       
  5354     /* At this point we have the character's bytes in mcbuffer, and the length
       
  5355     in mclength. When not in UTF-8 mode, the length is always 1. */
       
  5356 
       
  5357     ONE_CHAR:
       
  5358     previous = code;
       
  5359     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
       
  5360     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
       
  5361 
       
  5362     /* Remember if \r or \n were seen */
       
  5363 
       
  5364     if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
       
  5365       cd->external_flags |= PCRE_HASCRORLF;
       
  5366 
       
  5367     /* Set the first and required bytes appropriately. If no previous first
       
  5368     byte, set it from this character, but revert to none on a zero repeat.
       
  5369     Otherwise, leave the firstbyte value alone, and don't change it on a zero
       
  5370     repeat. */
       
  5371 
       
  5372     if (firstbyte == REQ_UNSET)
       
  5373       {
       
  5374       zerofirstbyte = REQ_NONE;
       
  5375       zeroreqbyte = reqbyte;
       
  5376 
       
  5377       /* If the character is more than one byte long, we can set firstbyte
       
  5378       only if it is not to be matched caselessly. */
       
  5379 
       
  5380       if (mclength == 1 || req_caseopt == 0)
       
  5381         {
       
  5382         firstbyte = mcbuffer[0] | req_caseopt;
       
  5383         if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
       
  5384         }
       
  5385       else firstbyte = reqbyte = REQ_NONE;
       
  5386       }
       
  5387 
       
  5388     /* firstbyte was previously set; we can set reqbyte only if the length is
       
  5389     1 or the matching is caseful. */
       
  5390 
       
  5391     else
       
  5392       {
       
  5393       zerofirstbyte = firstbyte;
       
  5394       zeroreqbyte = reqbyte;
       
  5395       if (mclength == 1 || req_caseopt == 0)
       
  5396         reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
       
  5397       }
       
  5398 
       
  5399     break;            /* End of literal character handling */
       
  5400     }
       
  5401   }                   /* end of big loop */
       
  5402 
       
  5403 
       
  5404 /* Control never reaches here by falling through, only by a goto for all the
       
  5405 error states. Pass back the position in the pattern so that it can be displayed
       
  5406 to the user for diagnosing the error. */
       
  5407 
       
  5408 FAILED:
       
  5409 *ptrptr = ptr;
       
  5410 return FALSE;
       
  5411 }
       
  5412 
       
  5413 
       
  5414 
       
  5415 
       
  5416 /*************************************************
       
  5417 *     Compile sequence of alternatives           *
       
  5418 *************************************************/
       
  5419 
       
  5420 /* On entry, ptr is pointing past the bracket character, but on return it
       
  5421 points to the closing bracket, or vertical bar, or end of string. The code
       
  5422 variable is pointing at the byte into which the BRA operator has been stored.
       
  5423 If the ims options are changed at the start (for a (?ims: group) or during any
       
  5424 branch, we need to insert an OP_OPT item at the start of every following branch
       
  5425 to ensure they get set correctly at run time, and also pass the new options
       
  5426 into every subsequent branch compile.
       
  5427 
       
  5428 This function is used during the pre-compile phase when we are trying to find
       
  5429 out the amount of memory needed, as well as during the real compile phase. The
       
  5430 value of lengthptr distinguishes the two phases.
       
  5431 
       
  5432 Arguments:
       
  5433   options        option bits, including any changes for this subpattern
       
  5434   oldims         previous settings of ims option bits
       
  5435   codeptr        -> the address of the current code pointer
       
  5436   ptrptr         -> the address of the current pattern pointer
       
  5437   errorcodeptr   -> pointer to error code variable
       
  5438   lookbehind     TRUE if this is a lookbehind assertion
       
  5439   reset_bracount TRUE to reset the count for each branch
       
  5440   skipbytes      skip this many bytes at start (for brackets and OP_COND)
       
  5441   firstbyteptr   place to put the first required character, or a negative number
       
  5442   reqbyteptr     place to put the last required character, or a negative number
       
  5443   bcptr          pointer to the chain of currently open branches
       
  5444   cd             points to the data block with tables pointers etc.
       
  5445   lengthptr      NULL during the real compile phase
       
  5446                  points to length accumulator during pre-compile phase
       
  5447 
       
  5448 Returns:         TRUE on success
       
  5449 */
       
  5450 
       
  5451 static BOOL
       
  5452 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
       
  5453   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
       
  5454   int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
       
  5455   int *lengthptr)
       
  5456 {
       
  5457 const uschar *ptr = *ptrptr;
       
  5458 uschar *code = *codeptr;
       
  5459 uschar *last_branch = code;
       
  5460 uschar *start_bracket = code;
       
  5461 uschar *reverse_count = NULL;
       
  5462 int firstbyte, reqbyte;
       
  5463 int branchfirstbyte, branchreqbyte;
       
  5464 int length;
       
  5465 int orig_bracount;
       
  5466 int max_bracount;
       
  5467 branch_chain bc;
       
  5468 
       
  5469 bc.outer = bcptr;
       
  5470 bc.current = code;
       
  5471 
       
  5472 firstbyte = reqbyte = REQ_UNSET;
       
  5473 
       
  5474 /* Accumulate the length for use in the pre-compile phase. Start with the
       
  5475 length of the BRA and KET and any extra bytes that are required at the
       
  5476 beginning. We accumulate in a local variable to save frequent testing of
       
  5477 lenthptr for NULL. We cannot do this by looking at the value of code at the
       
  5478 start and end of each alternative, because compiled items are discarded during
       
  5479 the pre-compile phase so that the work space is not exceeded. */
       
  5480 
       
  5481 length = 2 + 2*LINK_SIZE + skipbytes;
       
  5482 
       
  5483 /* WARNING: If the above line is changed for any reason, you must also change
       
  5484 the code that abstracts option settings at the start of the pattern and makes
       
  5485 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
       
  5486 pre-compile phase to find out whether anything has yet been compiled or not. */
       
  5487 
       
  5488 /* Offset is set zero to mark that this bracket is still open */
       
  5489 
       
  5490 PUT(code, 1, 0);
       
  5491 code += 1 + LINK_SIZE + skipbytes;
       
  5492 
       
  5493 /* Loop for each alternative branch */
       
  5494 
       
  5495 orig_bracount = max_bracount = cd->bracount;
       
  5496 for (;;)
       
  5497   {
       
  5498   /* For a (?| group, reset the capturing bracket count so that each branch
       
  5499   uses the same numbers. */
       
  5500 
       
  5501   if (reset_bracount) cd->bracount = orig_bracount;
       
  5502 
       
  5503   /* Handle a change of ims options at the start of the branch */
       
  5504 
       
  5505   if ((options & PCRE_IMS) != oldims)
       
  5506     {
       
  5507     *code++ = OP_OPT;
       
  5508     *code++ = options & PCRE_IMS;
       
  5509     length += 2;
       
  5510     }
       
  5511 
       
  5512   /* Set up dummy OP_REVERSE if lookbehind assertion */
       
  5513 
       
  5514   if (lookbehind)
       
  5515     {
       
  5516     *code++ = OP_REVERSE;
       
  5517     reverse_count = code;
       
  5518     PUTINC(code, 0, 0);
       
  5519     length += 1 + LINK_SIZE;
       
  5520     }
       
  5521 
       
  5522   /* Now compile the branch; in the pre-compile phase its length gets added
       
  5523   into the length. */
       
  5524 
       
  5525   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
       
  5526         &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
       
  5527     {
       
  5528     *ptrptr = ptr;
       
  5529     return FALSE;
       
  5530     }
       
  5531 
       
  5532   /* Keep the highest bracket count in case (?| was used and some branch
       
  5533   has fewer than the rest. */
       
  5534 
       
  5535   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
       
  5536 
       
  5537   /* In the real compile phase, there is some post-processing to be done. */
       
  5538 
       
  5539   if (lengthptr == NULL)
       
  5540     {
       
  5541     /* If this is the first branch, the firstbyte and reqbyte values for the
       
  5542     branch become the values for the regex. */
       
  5543 
       
  5544     if (*last_branch != OP_ALT)
       
  5545       {
       
  5546       firstbyte = branchfirstbyte;
       
  5547       reqbyte = branchreqbyte;
       
  5548       }
       
  5549 
       
  5550     /* If this is not the first branch, the first char and reqbyte have to
       
  5551     match the values from all the previous branches, except that if the
       
  5552     previous value for reqbyte didn't have REQ_VARY set, it can still match,
       
  5553     and we set REQ_VARY for the regex. */
       
  5554 
       
  5555     else
       
  5556       {
       
  5557       /* If we previously had a firstbyte, but it doesn't match the new branch,
       
  5558       we have to abandon the firstbyte for the regex, but if there was
       
  5559       previously no reqbyte, it takes on the value of the old firstbyte. */
       
  5560 
       
  5561       if (firstbyte >= 0 && firstbyte != branchfirstbyte)
       
  5562         {
       
  5563         if (reqbyte < 0) reqbyte = firstbyte;
       
  5564         firstbyte = REQ_NONE;
       
  5565         }
       
  5566 
       
  5567       /* If we (now or from before) have no firstbyte, a firstbyte from the
       
  5568       branch becomes a reqbyte if there isn't a branch reqbyte. */
       
  5569 
       
  5570       if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
       
  5571           branchreqbyte = branchfirstbyte;
       
  5572 
       
  5573       /* Now ensure that the reqbytes match */
       
  5574 
       
  5575       if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
       
  5576         reqbyte = REQ_NONE;
       
  5577       else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
       
  5578       }
       
  5579 
       
  5580     /* If lookbehind, check that this branch matches a fixed-length string, and
       
  5581     put the length into the OP_REVERSE item. Temporarily mark the end of the
       
  5582     branch with OP_END. */
       
  5583 
       
  5584     if (lookbehind)
       
  5585       {
       
  5586       int fixed_length;
       
  5587       *code = OP_END;
       
  5588       fixed_length = find_fixedlength(last_branch, options);
       
  5589       DPRINTF(("fixed length = %d\n", fixed_length));
       
  5590       if (fixed_length < 0)
       
  5591         {
       
  5592         *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
       
  5593         *ptrptr = ptr;
       
  5594         return FALSE;
       
  5595         }
       
  5596       PUT(reverse_count, 0, fixed_length);
       
  5597       }
       
  5598     }
       
  5599 
       
  5600   /* Reached end of expression, either ')' or end of pattern. In the real
       
  5601   compile phase, go back through the alternative branches and reverse the chain
       
  5602   of offsets, with the field in the BRA item now becoming an offset to the
       
  5603   first alternative. If there are no alternatives, it points to the end of the
       
  5604   group. The length in the terminating ket is always the length of the whole
       
  5605   bracketed item. If any of the ims options were changed inside the group,
       
  5606   compile a resetting op-code following, except at the very end of the pattern.
       
  5607   Return leaving the pointer at the terminating char. */
       
  5608 
       
  5609   if (*ptr != '|')
       
  5610     {
       
  5611     if (lengthptr == NULL)
       
  5612       {
       
  5613       int branch_length = code - last_branch;
       
  5614       do
       
  5615         {
       
  5616         int prev_length = GET(last_branch, 1);
       
  5617         PUT(last_branch, 1, branch_length);
       
  5618         branch_length = prev_length;
       
  5619         last_branch -= branch_length;
       
  5620         }
       
  5621       while (branch_length > 0);
       
  5622       }
       
  5623 
       
  5624     /* Fill in the ket */
       
  5625 
       
  5626     *code = OP_KET;
       
  5627     PUT(code, 1, code - start_bracket);
       
  5628     code += 1 + LINK_SIZE;
       
  5629 
       
  5630     /* Resetting option if needed */
       
  5631 
       
  5632     if ((options & PCRE_IMS) != oldims && *ptr == ')')
       
  5633       {
       
  5634       *code++ = OP_OPT;
       
  5635       *code++ = oldims;
       
  5636       length += 2;
       
  5637       }
       
  5638 
       
  5639     /* Retain the highest bracket number, in case resetting was used. */
       
  5640 
       
  5641     cd->bracount = max_bracount;
       
  5642 
       
  5643     /* Set values to pass back */
       
  5644 
       
  5645     *codeptr = code;
       
  5646     *ptrptr = ptr;
       
  5647     *firstbyteptr = firstbyte;
       
  5648     *reqbyteptr = reqbyte;
       
  5649     if (lengthptr != NULL)
       
  5650       {
       
  5651       if (OFLOW_MAX - *lengthptr < length)
       
  5652         {
       
  5653         *errorcodeptr = ERR20;
       
  5654         return FALSE;
       
  5655         }
       
  5656       *lengthptr += length;
       
  5657       }
       
  5658     return TRUE;
       
  5659     }
       
  5660 
       
  5661   /* Another branch follows. In the pre-compile phase, we can move the code
       
  5662   pointer back to where it was for the start of the first branch. (That is,
       
  5663   pretend that each branch is the only one.)
       
  5664 
       
  5665   In the real compile phase, insert an ALT node. Its length field points back
       
  5666   to the previous branch while the bracket remains open. At the end the chain
       
  5667   is reversed. It's done like this so that the start of the bracket has a
       
  5668   zero offset until it is closed, making it possible to detect recursion. */
       
  5669 
       
  5670   if (lengthptr != NULL)
       
  5671     {
       
  5672     code = *codeptr + 1 + LINK_SIZE + skipbytes;
       
  5673     length += 1 + LINK_SIZE;
       
  5674     }
       
  5675   else
       
  5676     {
       
  5677     *code = OP_ALT;
       
  5678     PUT(code, 1, code - last_branch);
       
  5679     bc.current = last_branch = code;
       
  5680     code += 1 + LINK_SIZE;
       
  5681     }
       
  5682 
       
  5683   ptr++;
       
  5684   }
       
  5685 /* Control never reaches here */
       
  5686 }
       
  5687 
       
  5688 
       
  5689 
       
  5690 
       
  5691 /*************************************************
       
  5692 *          Check for anchored expression         *
       
  5693 *************************************************/
       
  5694 
       
  5695 /* Try to find out if this is an anchored regular expression. Consider each
       
  5696 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
       
  5697 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
       
  5698 it's anchored. However, if this is a multiline pattern, then only OP_SOD
       
  5699 counts, since OP_CIRC can match in the middle.
       
  5700 
       
  5701 We can also consider a regex to be anchored if OP_SOM starts all its branches.
       
  5702 This is the code for \G, which means "match at start of match position, taking
       
  5703 into account the match offset".
       
  5704 
       
  5705 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
       
  5706 because that will try the rest of the pattern at all possible matching points,
       
  5707 so there is no point trying again.... er ....
       
  5708 
       
  5709 .... except when the .* appears inside capturing parentheses, and there is a
       
  5710 subsequent back reference to those parentheses. We haven't enough information
       
  5711 to catch that case precisely.
       
  5712 
       
  5713 At first, the best we could do was to detect when .* was in capturing brackets
       
  5714 and the highest back reference was greater than or equal to that level.
       
  5715 However, by keeping a bitmap of the first 31 back references, we can catch some
       
  5716 of the more common cases more precisely.
       
  5717 
       
  5718 Arguments:
       
  5719   code           points to start of expression (the bracket)
       
  5720   options        points to the options setting
       
  5721   bracket_map    a bitmap of which brackets we are inside while testing; this
       
  5722                   handles up to substring 31; after that we just have to take
       
  5723                   the less precise approach
       
  5724   backref_map    the back reference bitmap
       
  5725 
       
  5726 Returns:     TRUE or FALSE
       
  5727 */
       
  5728 
       
  5729 static BOOL
       
  5730 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
       
  5731   unsigned int backref_map)
       
  5732 {
       
  5733 do {
       
  5734    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
       
  5735      options, PCRE_MULTILINE, FALSE);
       
  5736    register int op = *scode;
       
  5737 
       
  5738    /* Non-capturing brackets */
       
  5739 
       
  5740    if (op == OP_BRA)
       
  5741      {
       
  5742      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
       
  5743      }
       
  5744 
       
  5745    /* Capturing brackets */
       
  5746 
       
  5747    else if (op == OP_CBRA)
       
  5748      {
       
  5749      int n = GET2(scode, 1+LINK_SIZE);
       
  5750      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
       
  5751      if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
       
  5752      }
       
  5753 
       
  5754    /* Other brackets */
       
  5755 
       
  5756    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
       
  5757      {
       
  5758      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
       
  5759      }
       
  5760 
       
  5761    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
       
  5762    it isn't in brackets that are or may be referenced. */
       
  5763 
       
  5764    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
       
  5765              op == OP_TYPEPOSSTAR))
       
  5766      {
       
  5767      if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
       
  5768        return FALSE;
       
  5769      }
       
  5770 
       
  5771    /* Check for explicit anchoring */
       
  5772 
       
  5773    else if (op != OP_SOD && op != OP_SOM &&
       
  5774            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
       
  5775      return FALSE;
       
  5776    code += GET(code, 1);
       
  5777    }
       
  5778 while (*code == OP_ALT);   /* Loop for each alternative */
       
  5779 return TRUE;
       
  5780 }
       
  5781 
       
  5782 
       
  5783 
       
  5784 /*************************************************
       
  5785 *         Check for starting with ^ or .*        *
       
  5786 *************************************************/
       
  5787 
       
  5788 /* This is called to find out if every branch starts with ^ or .* so that
       
  5789 "first char" processing can be done to speed things up in multiline
       
  5790 matching and for non-DOTALL patterns that start with .* (which must start at
       
  5791 the beginning or after \n). As in the case of is_anchored() (see above), we
       
  5792 have to take account of back references to capturing brackets that contain .*
       
  5793 because in that case we can't make the assumption.
       
  5794 
       
  5795 Arguments:
       
  5796   code           points to start of expression (the bracket)
       
  5797   bracket_map    a bitmap of which brackets we are inside while testing; this
       
  5798                   handles up to substring 31; after that we just have to take
       
  5799                   the less precise approach
       
  5800   backref_map    the back reference bitmap
       
  5801 
       
  5802 Returns:         TRUE or FALSE
       
  5803 */
       
  5804 
       
  5805 static BOOL
       
  5806 is_startline(const uschar *code, unsigned int bracket_map,
       
  5807   unsigned int backref_map)
       
  5808 {
       
  5809 do {
       
  5810    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
       
  5811      NULL, 0, FALSE);
       
  5812    register int op = *scode;
       
  5813 
       
  5814    /* Non-capturing brackets */
       
  5815 
       
  5816    if (op == OP_BRA)
       
  5817      {
       
  5818      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
       
  5819      }
       
  5820 
       
  5821    /* Capturing brackets */
       
  5822 
       
  5823    else if (op == OP_CBRA)
       
  5824      {
       
  5825      int n = GET2(scode, 1+LINK_SIZE);
       
  5826      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
       
  5827      if (!is_startline(scode, new_map, backref_map)) return FALSE;
       
  5828      }
       
  5829 
       
  5830    /* Other brackets */
       
  5831 
       
  5832    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
       
  5833      { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
       
  5834 
       
  5835    /* .* means "start at start or after \n" if it isn't in brackets that
       
  5836    may be referenced. */
       
  5837 
       
  5838    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
       
  5839      {
       
  5840      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
       
  5841      }
       
  5842 
       
  5843    /* Check for explicit circumflex */
       
  5844 
       
  5845    else if (op != OP_CIRC) return FALSE;
       
  5846 
       
  5847    /* Move on to the next alternative */
       
  5848 
       
  5849    code += GET(code, 1);
       
  5850    }
       
  5851 while (*code == OP_ALT);  /* Loop for each alternative */
       
  5852 return TRUE;
       
  5853 }
       
  5854 
       
  5855 
       
  5856 
       
  5857 /*************************************************
       
  5858 *       Check for asserted fixed first char      *
       
  5859 *************************************************/
       
  5860 
       
  5861 /* During compilation, the "first char" settings from forward assertions are
       
  5862 discarded, because they can cause conflicts with actual literals that follow.
       
  5863 However, if we end up without a first char setting for an unanchored pattern,
       
  5864 it is worth scanning the regex to see if there is an initial asserted first
       
  5865 char. If all branches start with the same asserted char, or with a bracket all
       
  5866 of whose alternatives start with the same asserted char (recurse ad lib), then
       
  5867 we return that char, otherwise -1.
       
  5868 
       
  5869 Arguments:
       
  5870   code       points to start of expression (the bracket)
       
  5871   options    pointer to the options (used to check casing changes)
       
  5872   inassert   TRUE if in an assertion
       
  5873 
       
  5874 Returns:     -1 or the fixed first char
       
  5875 */
       
  5876 
       
  5877 static int
       
  5878 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
       
  5879 {
       
  5880 register int c = -1;
       
  5881 do {
       
  5882    int d;
       
  5883    const uschar *scode =
       
  5884      first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
       
  5885    register int op = *scode;
       
  5886 
       
  5887    switch(op)
       
  5888      {
       
  5889      default:
       
  5890      return -1;
       
  5891 
       
  5892      case OP_BRA:
       
  5893      case OP_CBRA:
       
  5894      case OP_ASSERT:
       
  5895      case OP_ONCE:
       
  5896      case OP_COND:
       
  5897      if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
       
  5898        return -1;
       
  5899      if (c < 0) c = d; else if (c != d) return -1;
       
  5900      break;
       
  5901 
       
  5902      case OP_EXACT:       /* Fall through */
       
  5903      scode += 2;
       
  5904 
       
  5905      case OP_CHAR:
       
  5906      case OP_CHARNC:
       
  5907      case OP_PLUS:
       
  5908      case OP_MINPLUS:
       
  5909      case OP_POSPLUS:
       
  5910      if (!inassert) return -1;
       
  5911      if (c < 0)
       
  5912        {
       
  5913        c = scode[1];
       
  5914        if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
       
  5915        }
       
  5916      else if (c != scode[1]) return -1;
       
  5917      break;
       
  5918      }
       
  5919 
       
  5920    code += GET(code, 1);
       
  5921    }
       
  5922 while (*code == OP_ALT);
       
  5923 return c;
       
  5924 }
       
  5925 
       
  5926 
       
  5927 
       
  5928 /*************************************************
       
  5929 *        Compile a Regular Expression            *
       
  5930 *************************************************/
       
  5931 
       
  5932 /* This function takes a string and returns a pointer to a block of store
       
  5933 holding a compiled version of the expression. The original API for this
       
  5934 function had no error code return variable; it is retained for backwards
       
  5935 compatibility. The new function is given a new name.
       
  5936 
       
  5937 Arguments:
       
  5938   pattern       the regular expression
       
  5939   options       various option bits
       
  5940   errorcodeptr  pointer to error code variable (pcre_compile2() only)
       
  5941                   can be NULL if you don't want a code value
       
  5942   errorptr      pointer to pointer to error text
       
  5943   erroroffset   ptr offset in pattern where error was detected
       
  5944   tables        pointer to character tables or NULL
       
  5945 
       
  5946 Returns:        pointer to compiled data block, or NULL on error,
       
  5947                 with errorptr and erroroffset set
       
  5948 */
       
  5949 
       
  5950 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
       
  5951 pcre_compile(const char *pattern, int options, const char **errorptr,
       
  5952   int *erroroffset, const unsigned char *tables)
       
  5953 {
       
  5954 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
       
  5955 }
       
  5956 
       
  5957 
       
  5958 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
       
  5959 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
       
  5960   const char **errorptr, int *erroroffset, const unsigned char *tables)
       
  5961 {
       
  5962 real_pcre *re;
       
  5963 int length = 1;  /* For final END opcode */
       
  5964 int firstbyte, reqbyte, newline;
       
  5965 int errorcode = 0;
       
  5966 int skipatstart = 0;
       
  5967 #ifdef SUPPORT_UTF8
       
  5968 BOOL utf8;
       
  5969 #endif
       
  5970 size_t size;
       
  5971 uschar *code;
       
  5972 const uschar *codestart;
       
  5973 const uschar *ptr;
       
  5974 compile_data compile_block;
       
  5975 compile_data *cd = &compile_block;
       
  5976 
       
  5977 /* This space is used for "compiling" into during the first phase, when we are
       
  5978 computing the amount of memory that is needed. Compiled items are thrown away
       
  5979 as soon as possible, so that a fairly large buffer should be sufficient for
       
  5980 this purpose. The same space is used in the second phase for remembering where
       
  5981 to fill in forward references to subpatterns. */
       
  5982 
       
  5983 uschar cworkspace[COMPILE_WORK_SIZE];
       
  5984 
       
  5985 /* Set this early so that early errors get offset 0. */
       
  5986 
       
  5987 ptr = (const uschar *)pattern;
       
  5988 
       
  5989 /* We can't pass back an error message if errorptr is NULL; I guess the best we
       
  5990 can do is just return NULL, but we can set a code value if there is a code
       
  5991 pointer. */
       
  5992 
       
  5993 if (errorptr == NULL)
       
  5994   {
       
  5995   if (errorcodeptr != NULL) *errorcodeptr = 99;
       
  5996   return NULL;
       
  5997   }
       
  5998 
       
  5999 *errorptr = NULL;
       
  6000 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
       
  6001 
       
  6002 /* However, we can give a message for this error */
       
  6003 
       
  6004 if (erroroffset == NULL)
       
  6005   {
       
  6006   errorcode = ERR16;
       
  6007   goto PCRE_EARLY_ERROR_RETURN2;
       
  6008   }
       
  6009 
       
  6010 *erroroffset = 0;
       
  6011 
       
  6012 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
       
  6013 
       
  6014 #ifdef SUPPORT_UTF8
       
  6015 utf8 = (options & PCRE_UTF8) != 0;
       
  6016 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
       
  6017      (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
       
  6018   {
       
  6019   errorcode = ERR44;
       
  6020   goto PCRE_EARLY_ERROR_RETURN2;
       
  6021   }
       
  6022 #else
       
  6023 if ((options & PCRE_UTF8) != 0)
       
  6024   {
       
  6025   errorcode = ERR32;
       
  6026   goto PCRE_EARLY_ERROR_RETURN;
       
  6027   }
       
  6028 #endif
       
  6029 
       
  6030 if ((options & ~PUBLIC_OPTIONS) != 0)
       
  6031   {
       
  6032   errorcode = ERR17;
       
  6033   goto PCRE_EARLY_ERROR_RETURN;
       
  6034   }
       
  6035 
       
  6036 /* Set up pointers to the individual character tables */
       
  6037 
       
  6038 if (tables == NULL) tables = _pcre_default_tables;
       
  6039 cd->lcc = tables + lcc_offset;
       
  6040 cd->fcc = tables + fcc_offset;
       
  6041 cd->cbits = tables + cbits_offset;
       
  6042 cd->ctypes = tables + ctypes_offset;
       
  6043 
       
  6044 /* Check for global one-time settings at the start of the pattern, and remember
       
  6045 the offset for later. */
       
  6046 
       
  6047 while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
       
  6048   {
       
  6049   int newnl = 0;
       
  6050   int newbsr = 0;
       
  6051 
       
  6052   if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
       
  6053     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
       
  6054   else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3)  == 0)
       
  6055     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
       
  6056   else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5)  == 0)
       
  6057     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
       
  6058   else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
       
  6059     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
       
  6060   else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8)  == 0)
       
  6061     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
       
  6062 
       
  6063   else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
       
  6064     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
       
  6065   else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
       
  6066     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
       
  6067 
       
  6068   if (newnl != 0)
       
  6069     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
       
  6070   else if (newbsr != 0)
       
  6071     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
       
  6072   else break;
       
  6073   }
       
  6074 
       
  6075 /* Check validity of \R options. */
       
  6076 
       
  6077 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
       
  6078   {
       
  6079   case 0:
       
  6080   case PCRE_BSR_ANYCRLF:
       
  6081   case PCRE_BSR_UNICODE:
       
  6082   break;
       
  6083   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
       
  6084   }
       
  6085 
       
  6086 /* Handle different types of newline. The three bits give seven cases. The
       
  6087 current code allows for fixed one- or two-byte sequences, plus "any" and
       
  6088 "anycrlf". */
       
  6089 
       
  6090 switch (options & PCRE_NEWLINE_BITS)
       
  6091   {
       
  6092   case 0: newline = NEWLINE; break;   /* Build-time default */
       
  6093   case PCRE_NEWLINE_CR: newline = '\r'; break;
       
  6094   case PCRE_NEWLINE_LF: newline = '\n'; break;
       
  6095   case PCRE_NEWLINE_CR+
       
  6096        PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
       
  6097   case PCRE_NEWLINE_ANY: newline = -1; break;
       
  6098   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
       
  6099   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
       
  6100   }
       
  6101 
       
  6102 if (newline == -2)
       
  6103   {
       
  6104   cd->nltype = NLTYPE_ANYCRLF;
       
  6105   }
       
  6106 else if (newline < 0)
       
  6107   {
       
  6108   cd->nltype = NLTYPE_ANY;
       
  6109   }
       
  6110 else
       
  6111   {
       
  6112   cd->nltype = NLTYPE_FIXED;
       
  6113   if (newline > 255)
       
  6114     {
       
  6115     cd->nllen = 2;
       
  6116     cd->nl[0] = (newline >> 8) & 255;
       
  6117     cd->nl[1] = newline & 255;
       
  6118     }
       
  6119   else
       
  6120     {
       
  6121     cd->nllen = 1;
       
  6122     cd->nl[0] = newline;
       
  6123     }
       
  6124   }
       
  6125 
       
  6126 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
       
  6127 references to help in deciding whether (.*) can be treated as anchored or not.
       
  6128 */
       
  6129 
       
  6130 cd->top_backref = 0;
       
  6131 cd->backref_map = 0;
       
  6132 
       
  6133 /* Reflect pattern for debugging output */
       
  6134 
       
  6135 DPRINTF(("------------------------------------------------------------------\n"));
       
  6136 DPRINTF(("%s\n", pattern));
       
  6137 
       
  6138 /* Pretend to compile the pattern while actually just accumulating the length
       
  6139 of memory required. This behaviour is triggered by passing a non-NULL final
       
  6140 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
       
  6141 to compile parts of the pattern into; the compiled code is discarded when it is
       
  6142 no longer needed, so hopefully this workspace will never overflow, though there
       
  6143 is a test for its doing so. */
       
  6144 
       
  6145 cd->bracount = cd->final_bracount = 0;
       
  6146 cd->names_found = 0;
       
  6147 cd->name_entry_size = 0;
       
  6148 cd->name_table = NULL;
       
  6149 cd->start_workspace = cworkspace;
       
  6150 cd->start_code = cworkspace;
       
  6151 cd->hwm = cworkspace;
       
  6152 cd->start_pattern = (const uschar *)pattern;
       
  6153 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
       
  6154 cd->req_varyopt = 0;
       
  6155 cd->external_options = options;
       
  6156 cd->external_flags = 0;
       
  6157 
       
  6158 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
       
  6159 don't need to look at the result of the function here. The initial options have
       
  6160 been put into the cd block so that they can be changed if an option setting is
       
  6161 found within the regex right at the beginning. Bringing initial option settings
       
  6162 outside can help speed up starting point checks. */
       
  6163 
       
  6164 ptr += skipatstart;
       
  6165 code = cworkspace;
       
  6166 *code = OP_BRA;
       
  6167 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
       
  6168   &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
       
  6169   &length);
       
  6170 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
       
  6171 
       
  6172 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
       
  6173   cd->hwm - cworkspace));
       
  6174 
       
  6175 if (length > MAX_PATTERN_SIZE)
       
  6176   {
       
  6177   errorcode = ERR20;
       
  6178   goto PCRE_EARLY_ERROR_RETURN;
       
  6179   }
       
  6180 
       
  6181 /* Compute the size of data block needed and get it, either from malloc or
       
  6182 externally provided function. Integer overflow should no longer be possible
       
  6183 because nowadays we limit the maximum value of cd->names_found and
       
  6184 cd->name_entry_size. */
       
  6185 
       
  6186 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
       
  6187 re = (real_pcre *)(pcre_malloc)(size);
       
  6188 
       
  6189 if (re == NULL)
       
  6190   {
       
  6191   errorcode = ERR21;
       
  6192   goto PCRE_EARLY_ERROR_RETURN;
       
  6193   }
       
  6194 
       
  6195 /* Put in the magic number, and save the sizes, initial options, internal
       
  6196 flags, and character table pointer. NULL is used for the default character
       
  6197 tables. The nullpad field is at the end; it's there to help in the case when a
       
  6198 regex compiled on a system with 4-byte pointers is run on another with 8-byte
       
  6199 pointers. */
       
  6200 
       
  6201 re->magic_number = MAGIC_NUMBER;
       
  6202 re->size = size;
       
  6203 re->options = cd->external_options;
       
  6204 re->flags = cd->external_flags;
       
  6205 re->dummy1 = 0;
       
  6206 re->first_byte = 0;
       
  6207 re->req_byte = 0;
       
  6208 re->name_table_offset = sizeof(real_pcre);
       
  6209 re->name_entry_size = cd->name_entry_size;
       
  6210 re->name_count = cd->names_found;
       
  6211 re->ref_count = 0;
       
  6212 re->tables = (tables == _pcre_default_tables)? NULL : tables;
       
  6213 re->nullpad = NULL;
       
  6214 
       
  6215 /* The starting points of the name/number translation table and of the code are
       
  6216 passed around in the compile data block. The start/end pattern and initial
       
  6217 options are already set from the pre-compile phase, as is the name_entry_size
       
  6218 field. Reset the bracket count and the names_found field. Also reset the hwm
       
  6219 field; this time it's used for remembering forward references to subpatterns.
       
  6220 */
       
  6221 
       
  6222 cd->final_bracount = cd->bracount;  /* Save for checking forward references */
       
  6223 cd->bracount = 0;
       
  6224 cd->names_found = 0;
       
  6225 cd->name_table = (uschar *)re + re->name_table_offset;
       
  6226 codestart = cd->name_table + re->name_entry_size * re->name_count;
       
  6227 cd->start_code = codestart;
       
  6228 cd->hwm = cworkspace;
       
  6229 cd->req_varyopt = 0;
       
  6230 cd->had_accept = FALSE;
       
  6231 
       
  6232 /* Set up a starting, non-extracting bracket, then compile the expression. On
       
  6233 error, errorcode will be set non-zero, so we don't need to look at the result
       
  6234 of the function here. */
       
  6235 
       
  6236 ptr = (const uschar *)pattern + skipatstart;
       
  6237 code = (uschar *)codestart;
       
  6238 *code = OP_BRA;
       
  6239 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
       
  6240   &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
       
  6241 re->top_bracket = cd->bracount;
       
  6242 re->top_backref = cd->top_backref;
       
  6243 re->flags = cd->external_flags;
       
  6244 
       
  6245 if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */
       
  6246 
       
  6247 /* If not reached end of pattern on success, there's an excess bracket. */
       
  6248 
       
  6249 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
       
  6250 
       
  6251 /* Fill in the terminating state and check for disastrous overflow, but
       
  6252 if debugging, leave the test till after things are printed out. */
       
  6253 
       
  6254 *code++ = OP_END;
       
  6255 
       
  6256 #ifndef DEBUG
       
  6257 if (code - codestart > length) errorcode = ERR23;
       
  6258 #endif
       
  6259 
       
  6260 /* Fill in any forward references that are required. */
       
  6261 
       
  6262 while (errorcode == 0 && cd->hwm > cworkspace)
       
  6263   {
       
  6264   int offset, recno;
       
  6265   const uschar *groupptr;
       
  6266   cd->hwm -= LINK_SIZE;
       
  6267   offset = GET(cd->hwm, 0);
       
  6268   recno = GET(codestart, offset);
       
  6269   groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
       
  6270   if (groupptr == NULL) errorcode = ERR53;
       
  6271     else PUT(((uschar *)codestart), offset, groupptr - codestart);
       
  6272   }
       
  6273 
       
  6274 /* Give an error if there's a back reference to a non-existent capturing
       
  6275 subpattern. */
       
  6276 
       
  6277 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
       
  6278 
       
  6279 /* Failed to compile, or error while post-processing */
       
  6280 
       
  6281 if (errorcode != 0)
       
  6282   {
       
  6283   (pcre_free)(re);
       
  6284   PCRE_EARLY_ERROR_RETURN:
       
  6285   *erroroffset = ptr - (const uschar *)pattern;
       
  6286   PCRE_EARLY_ERROR_RETURN2:
       
  6287   *errorptr = find_error_text(errorcode);
       
  6288   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
       
  6289   return NULL;
       
  6290   }
       
  6291 
       
  6292 /* If the anchored option was not passed, set the flag if we can determine that
       
  6293 the pattern is anchored by virtue of ^ characters or \A or anything else (such
       
  6294 as starting with .* when DOTALL is set).
       
  6295 
       
  6296 Otherwise, if we know what the first byte has to be, save it, because that
       
  6297 speeds up unanchored matches no end. If not, see if we can set the
       
  6298 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
       
  6299 start with ^, and also when all branches start with .* for non-DOTALL matches.
       
  6300 */
       
  6301 
       
  6302 if ((re->options & PCRE_ANCHORED) == 0)
       
  6303   {
       
  6304   int temp_options = re->options;   /* May get changed during these scans */
       
  6305   if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
       
  6306     re->options |= PCRE_ANCHORED;
       
  6307   else
       
  6308     {
       
  6309     if (firstbyte < 0)
       
  6310       firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
       
  6311     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
       
  6312       {
       
  6313       int ch = firstbyte & 255;
       
  6314       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
       
  6315          cd->fcc[ch] == ch)? ch : firstbyte;
       
  6316       re->flags |= PCRE_FIRSTSET;
       
  6317       }
       
  6318     else if (is_startline(codestart, 0, cd->backref_map))
       
  6319       re->flags |= PCRE_STARTLINE;
       
  6320     }
       
  6321   }
       
  6322 
       
  6323 /* For an anchored pattern, we use the "required byte" only if it follows a
       
  6324 variable length item in the regex. Remove the caseless flag for non-caseable
       
  6325 bytes. */
       
  6326 
       
  6327 if (reqbyte >= 0 &&
       
  6328      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
       
  6329   {
       
  6330   int ch = reqbyte & 255;
       
  6331   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
       
  6332     cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
       
  6333   re->flags |= PCRE_REQCHSET;
       
  6334   }
       
  6335 
       
  6336 /* Print out the compiled data if debugging is enabled. This is never the
       
  6337 case when building a production library. */
       
  6338 
       
  6339 #ifdef DEBUG
       
  6340 
       
  6341 printf("Length = %d top_bracket = %d top_backref = %d\n",
       
  6342   length, re->top_bracket, re->top_backref);
       
  6343 
       
  6344 printf("Options=%08x\n", re->options);
       
  6345 
       
  6346 if ((re->flags & PCRE_FIRSTSET) != 0)
       
  6347   {
       
  6348   int ch = re->first_byte & 255;
       
  6349   const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
       
  6350     "" : " (caseless)";
       
  6351   if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
       
  6352     else printf("First char = \\x%02x%s\n", ch, caseless);
       
  6353   }
       
  6354 
       
  6355 if ((re->flags & PCRE_REQCHSET) != 0)
       
  6356   {
       
  6357   int ch = re->req_byte & 255;
       
  6358   const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
       
  6359     "" : " (caseless)";
       
  6360   if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
       
  6361     else printf("Req char = \\x%02x%s\n", ch, caseless);
       
  6362   }
       
  6363 
       
  6364 pcre_printint(re, stdout, TRUE);
       
  6365 
       
  6366 /* This check is done here in the debugging case so that the code that
       
  6367 was compiled can be seen. */
       
  6368 
       
  6369 if (code - codestart > length)
       
  6370   {
       
  6371   (pcre_free)(re);
       
  6372   *errorptr = find_error_text(ERR23);
       
  6373   *erroroffset = ptr - (uschar *)pattern;
       
  6374   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
       
  6375   return NULL;
       
  6376   }
       
  6377 #endif   /* DEBUG */
       
  6378 
       
  6379 return (pcre *)re;
       
  6380 }
       
  6381 
       
  6382 /* End of pcre_compile.c */