|
1 /************************************************* |
|
2 * Perl-Compatible Regular Expressions * |
|
3 *************************************************/ |
|
4 |
|
5 /* PCRE is a library of functions to support regular expressions whose syntax |
|
6 and semantics are as close as possible to those of the Perl 5 language. |
|
7 |
|
8 Written by Philip Hazel |
|
9 Copyright (c) 1997-2008 University of Cambridge |
|
10 |
|
11 ----------------------------------------------------------------------------- |
|
12 Redistribution and use in source and binary forms, with or without |
|
13 modification, are permitted provided that the following conditions are met: |
|
14 |
|
15 * Redistributions of source code must retain the above copyright notice, |
|
16 this list of conditions and the following disclaimer. |
|
17 |
|
18 * Redistributions in binary form must reproduce the above copyright |
|
19 notice, this list of conditions and the following disclaimer in the |
|
20 documentation and/or other materials provided with the distribution. |
|
21 |
|
22 * Neither the name of the University of Cambridge nor the names of its |
|
23 contributors may be used to endorse or promote products derived from |
|
24 this software without specific prior written permission. |
|
25 |
|
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
|
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
|
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
|
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
|
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
|
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
|
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
|
36 POSSIBILITY OF SUCH DAMAGE. |
|
37 ----------------------------------------------------------------------------- |
|
38 */ |
|
39 |
|
40 |
|
41 /* This module contains the external function pcre_compile(), along with |
|
42 supporting internal functions that are not used by other modules. */ |
|
43 |
|
44 |
|
45 #ifdef HAVE_CONFIG_H |
|
46 #include "config.h" |
|
47 #endif |
|
48 |
|
49 #define NLBLOCK cd /* Block containing newline information */ |
|
50 #define PSSTART start_pattern /* Field containing processed string start */ |
|
51 #define PSEND end_pattern /* Field containing processed string end */ |
|
52 |
|
53 #include "pcre_internal.h" |
|
54 |
|
55 |
|
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also |
|
57 used by pcretest. DEBUG is not defined when building a production library. */ |
|
58 |
|
59 #ifdef DEBUG |
|
60 #include "pcre_printint.src" |
|
61 #endif |
|
62 |
|
63 |
|
/* Macro for setting individual bits in class bitmaps. "a" is the bitmap (an
array of bytes) and "b" is the bit number: bit b lives in byte b/8 at position
b%8. Every argument and the whole expansion are parenthesized so that
expression arguments such as (c + 1) select the intended byte and bit, and so
that the macro can be used inside a larger expression. Note that b is evaluated
twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
|
67 |
|
68 /* Maximum length value to check against when making sure that the integer that |
|
69 holds the compiled pattern length does not overflow. We make it a bit less than |
|
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have |
|
71 to check them every time. */ |
|
72 |
|
73 #define OFLOW_MAX (INT_MAX - 20) |
|
74 |
|
75 |
|
76 /************************************************* |
|
77 * Code parameters and static tables * |
|
78 *************************************************/ |
|
79 |
|
80 /* This value specifies the size of stack workspace that is used during the |
|
81 first pre-compile phase that determines how much memory is required. The regex |
|
82 is partly compiled into this space, but the compiled parts are discarded as |
|
83 soon as they can be, so that hopefully there will never be an overrun. The code |
|
84 does, however, check for an overrun. The largest amount I've seen used is 218, |
|
85 so this number is very generous. |
|
86 |
|
87 The same workspace is used during the second, actual compile phase for |
|
88 remembering forward references to groups so that they can be filled in at the |
|
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE |
|
90 is 4 there is plenty of room. */ |
|
91 |
|
92 #define COMPILE_WORK_SIZE (4096) |
|
93 |
|
94 |
|
95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns |
|
96 are simple data values; negative values are for special things like \d and so |
|
97 on. Zero means further processing is needed (for things like \x), or the escape |
|
98 is invalid. */ |
|
99 |
|
/* escapes[] is consulted by check_escape() for the character c that follows a
backslash. In the ASCII build the index is (c - '0') and the table covers the
characters '0' to 'z'; in the EBCDIC build the index is (c - 0x48) and the
leading hex comment on each row gives the code point of its first entry. An
entry written as -ESC_x is the negative code of a special escape, a positive
entry is returned as the data character, and a zero entry means that
check_escape() has to look at the escape more closely (or reject it). */
100 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */ |
|
101 static const short int escapes[] = { |
|
102 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ |
|
103 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ |
|
104 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */ |
|
105 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */ |
|
106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */ |
|
107 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */ |
|
/* The un-negated ESC_e, ESC_f, ESC_n, ESC_r and ESC_tee entries below are
positive, i.e. plain data values - presumably the control characters for the
corresponding escapes; their definitions are outside this file section. */
108 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */ |
|
109 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */ |
|
110 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */ |
|
111 0, 0, -ESC_z /* x - z */ |
|
112 }; |
|
113 |
|
114 #else /* This is the "abnormal" table for EBCDIC systems */ |
|
115 static const short int escapes[] = { |
|
116 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|', |
|
117 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0, |
|
118 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~', |
|
119 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0, |
|
120 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?', |
|
121 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0, |
|
122 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"', |
|
123 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, |
|
124 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0, |
|
125 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p, |
|
126 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0, |
|
127 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0, |
|
128 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0, |
|
129 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, |
|
130 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', |
|
131 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G, |
|
132 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0, |
|
133 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P, |
|
134 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0, |
|
135 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X, |
|
136 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, |
|
137 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, |
|
138 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0 |
|
139 }; |
|
140 #endif |
|
141 |
|
142 |
|
143 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is |
|
144 searched linearly. Put all the names into a single string, in order to reduce |
|
145 the number of relocations when a shared library is dynamically linked. */ |
|
146 |
|
/* One verbitem describes one verb name: "len" is the number of characters in
the name and "op" is the OP_xxx value that the name maps to. */
147 typedef struct verbitem { |
|
148 int len; |
|
149 int op; |
|
150 } verbitem; |
|
151 |
|
/* All verb names packed into one string, each terminated by a zero byte (the
last one by the string literal's own terminator). */
152 static const char verbnames[] = |
|
153 "ACCEPT\0" |
|
154 "COMMIT\0" |
|
155 "F\0" |
|
156 "FAIL\0" |
|
157 "PRUNE\0" |
|
158 "SKIP\0" |
|
159 "THEN"; |
|
160 |
|
/* One entry per name in verbnames, in the same order; each "len" equals the
length of the corresponding name. Both "F" and "FAIL" map to OP_FAIL. */
161 static const verbitem verbs[] = { |
|
162 { 6, OP_ACCEPT }, |
|
163 { 6, OP_COMMIT }, |
|
164 { 1, OP_FAIL }, |
|
165 { 4, OP_FAIL }, |
|
166 { 5, OP_PRUNE }, |
|
167 { 4, OP_SKIP }, |
|
168 { 4, OP_THEN } |
|
169 }; |
|
170 |
|
/* Number of entries in verbs[], computed from the array size. */
171 static const int verbcount = sizeof(verbs)/sizeof(verbitem); |
|
172 |
|
173 |
|
174 /* Tables of names of POSIX character classes and their lengths. The names are |
|
175 now all in a single string, to reduce the number of relocations when a shared |
|
176 library is dynamically loaded. The list of lengths is terminated by a zero |
|
177 length entry. The first three must be alpha, lower, upper, as this is assumed |
|
178 for handling case independence. */ |
|
179 |
|
/* The fourteen POSIX class names, packed into one string with a zero byte
after each name (the last one is closed by the literal's own terminator). */
180 static const char posix_names[] = |
|
181 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0" |
|
182 "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0" |
|
183 "word\0" "xdigit"; |
|
184 |
|
/* Length of each name above, in the same order; the final 0 marks the end of
the list. */
185 static const uschar posix_name_lengths[] = { |
|
186 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; |
|
187 |
|
188 /* Table of class bit maps for each POSIX class. Each class is formed from a |
|
189 base map, with an optional addition or removal of another map. Then, for some |
|
190 classes, there is some additional tweaking: for [:blank:] the vertical space |
|
191 characters are removed, and for [:alpha:] and [:alnum:] the underscore |
|
192 character is removed. The triples in the table consist of the base map offset, |
|
193 second map offset or -1 if no second map, and a non-negative value for map |
|
194 addition or a negative value for map subtraction (if there are two maps). The |
|
195 absolute value of the third field has these meanings: 0 => no tweaking, 1 => |
|
196 remove vertical space characters, 2 => remove underscore. */ |
|
197 |
|
/* Three ints per class, one row per class, in the same order as the names in
posix_names: base map offset, second map offset (or -1 when there is none), and
the add/subtract + tweak selector. For example, "alpha" is the word map with
the digit map subtracted (negative third field) and tweak 2 (underscore
removed), while "ascii" is the print map with the cntrl map added. */
198 static const int posix_class_maps[] = { |
|
199 cbit_word, cbit_digit, -2, /* alpha */ |
|
200 cbit_lower, -1, 0, /* lower */ |
|
201 cbit_upper, -1, 0, /* upper */ |
|
202 cbit_word, -1, 2, /* alnum - word without underscore */ |
|
203 cbit_print, cbit_cntrl, 0, /* ascii */ |
|
204 cbit_space, -1, 1, /* blank - a GNU extension */ |
|
205 cbit_cntrl, -1, 0, /* cntrl */ |
|
206 cbit_digit, -1, 0, /* digit */ |
|
207 cbit_graph, -1, 0, /* graph */ |
|
208 cbit_print, -1, 0, /* print */ |
|
209 cbit_punct, -1, 0, /* punct */ |
|
210 cbit_space, -1, 0, /* space */ |
|
211 cbit_word, -1, 0, /* word - a Perl extension */ |
|
212 cbit_xdigit,-1, 0 /* xdigit */ |
|
213 }; |
|
214 |
|
215 |
|
/* Two-level stringification. STRING(a) turns its argument into a string
literal exactly as written; XSTRING(s) goes through one extra macro call so
that s is macro-expanded first. The error text table uses XSTRING() to embed
the values of MAX_NAME_SIZE and MAX_NAME_COUNT in messages. */
216 #define STRING(a) # a |
|
217 #define XSTRING(s) STRING(s) |
|
218 |
|
219 /* The texts of compile-time error messages. These are "char *" because they |
|
220 are passed to the outside world. Do not ever re-use any error number, because |
|
221 they are documented. Always add a new error instead. Messages marked DEAD below |
|
222 are no longer used. This used to be a table of strings, but in order to reduce |
|
223 the number of relocations needed when a shared library is loaded dynamically, |
|
224 it is now one long string. We cannot use a table of offsets, because the |
|
225 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we |
|
226 simply count through to the one we want - this isn't a performance issue |
|
227 because these strings are used only when there is a compilation error. */ |
|
228 |
|
/* All messages are adjacent string literals, so they are concatenated into a
single array; each message except the last ends with an explicit \0, and the
last is closed by the literal's own terminator. The position of a message in
this sequence is its error number (the numeric comments mark every fifth
entry), which is how find_error_text() locates it. */
229 static const char error_texts[] = |
|
230 "no error\0" |
|
231 "\\ at end of pattern\0" |
|
232 "\\c at end of pattern\0" |
|
233 "unrecognized character follows \\\0" |
|
234 "numbers out of order in {} quantifier\0" |
|
235 /* 5 */ |
|
236 "number too big in {} quantifier\0" |
|
237 "missing terminating ] for character class\0" |
|
238 "invalid escape sequence in character class\0" |
|
239 "range out of order in character class\0" |
|
240 "nothing to repeat\0" |
|
241 /* 10 */ |
|
242 "operand of unlimited repeat could match the empty string\0" /** DEAD **/ |
|
243 "internal error: unexpected repeat\0" |
|
244 "unrecognized character after (? or (?-\0" |
|
245 "POSIX named classes are supported only within a class\0" |
|
246 "missing )\0" |
|
247 /* 15 */ |
|
248 "reference to non-existent subpattern\0" |
|
249 "erroffset passed as NULL\0" |
|
250 "unknown option bit(s) set\0" |
|
251 "missing ) after comment\0" |
|
252 "parentheses nested too deeply\0" /** DEAD **/ |
|
253 /* 20 */ |
|
254 "regular expression is too large\0" |
|
255 "failed to get memory\0" |
|
256 "unmatched parentheses\0" |
|
257 "internal error: code overflow\0" |
|
258 "unrecognized character after (?<\0" |
|
259 /* 25 */ |
|
260 "lookbehind assertion is not fixed length\0" |
|
261 "malformed number or name after (?(\0" |
|
262 "conditional group contains more than two branches\0" |
|
263 "assertion expected after (?(\0" |
|
264 "(?R or (?[+-]digits must be followed by )\0" |
|
265 /* 30 */ |
|
266 "unknown POSIX class name\0" |
|
267 "POSIX collating elements are not supported\0" |
|
268 "this version of PCRE is not compiled with PCRE_UTF8 support\0" |
|
269 "spare error\0" /** DEAD **/ |
|
270 "character value in \\x{...} sequence is too large\0" |
|
271 /* 35 */ |
|
272 "invalid condition (?(0)\0" |
|
273 "\\C not allowed in lookbehind assertion\0" |
|
274 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0" |
|
275 "number after (?C is > 255\0" |
|
276 "closing ) for (?C expected\0" |
|
277 /* 40 */ |
|
278 "recursive call could loop indefinitely\0" |
|
279 "unrecognized character after (?P\0" |
|
280 "syntax error in subpattern name (missing terminator)\0" |
|
281 "two named subpatterns have the same name\0" |
|
282 "invalid UTF-8 string\0" |
|
283 /* 45 */ |
|
284 "support for \\P, \\p, and \\X has not been compiled\0" |
|
285 "malformed \\P or \\p sequence\0" |
|
286 "unknown property name after \\P or \\p\0" |
|
287 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0" |
|
288 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0" |
|
289 /* 50 */ |
|
290 "repeated subpattern is too long\0" /** DEAD **/ |
|
291 "octal value is greater than \\377 (not in UTF-8 mode)\0" |
|
292 "internal error: overran compiling workspace\0" |
|
293 "internal error: previously-checked referenced subpattern not found\0" |
|
294 "DEFINE group contains more than one branch\0" |
|
295 /* 55 */ |
|
296 "repeating a DEFINE group is not allowed\0" |
|
297 "inconsistent NEWLINE options\0" |
|
298 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0" |
|
299 "a numbered reference must not be zero\0" |
|
300 "(*VERB) with an argument is not supported\0" |
|
301 /* 60 */ |
|
302 "(*VERB) not recognized\0" |
|
303 "number is too big\0" |
|
304 "subpattern name expected\0" |
|
305 "digit expected after (?+\0" |
|
306 "] is an invalid data character in JavaScript compatibility mode"; |
|
307 |
|
308 |
|
309 /* Table to identify digits and hex digits. This is used when compiling |
|
310 patterns. Note that the tables in chartables are dependent on the locale, and |
|
311 may mark arbitrary characters as digits - but the PCRE compiling code expects |
|
312 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have |
|
313 a private table here. It costs 256 bytes, but it is a lot faster than doing |
|
314 character value tests (at least in some simple cases I timed), and in some |
|
315 applications one wants PCRE to compile efficiently as well as match |
|
316 efficiently. |
|
317 |
|
318 For convenience, we use the same bit definitions as in chartables: |
|
319 |
|
320 0x04 decimal digit |
|
321 0x08 hexadecimal digit |
|
322 |
|
323 Then we can use ctype_digit and ctype_xdigit in the code. */ |
|
324 |
|
/* digitab[] has 256 entries and is indexed by byte value. An entry is 0x0c
(decimal digit bit 0x04 plus hex digit bit 0x08) for the characters 0-9, 0x08
for the letters a-f and A-F, and zero for every other byte. The two variants
differ only in where those characters sit in the character set. */
325 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */ |
|
326 static const unsigned char digitab[] = |
|
327 { |
|
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ |
|
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ |
|
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */ |
|
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ |
|
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */ |
|
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */ |
|
334 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */ |
|
335 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */ |
|
336 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */ |
|
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */ |
|
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */ |
|
339 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */ |
|
340 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */ |
|
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */ |
|
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */ |
|
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */ |
|
344 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */ |
|
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */ |
|
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */ |
|
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */ |
|
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */ |
|
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */ |
|
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */ |
|
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ |
|
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */ |
|
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */ |
|
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */ |
|
355 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */ |
|
356 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */ |
|
357 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */ |
|
358 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ |
|
359 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ |
|
360 |
|
361 #else /* This is the "abnormal" case, for EBCDIC systems */ |
|
/* In this variant the trailing 0, 10, 20 ... F0 in the row comments is the hex
code point of the first entry of every second row. */
362 static const unsigned char digitab[] = |
|
363 { |
|
364 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */ |
|
365 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ |
|
366 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */ |
|
367 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ |
|
368 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */ |
|
369 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */ |
|
370 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */ |
|
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */ |
|
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */ |
|
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */ |
|
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */ |
|
375 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */ |
|
376 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */ |
|
377 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */ |
|
378 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */ |
|
379 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */ |
|
380 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */ |
|
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */ |
|
382 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */ |
|
383 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */ |
|
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */ |
|
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */ |
|
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */ |
|
387 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ |
|
388 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */ |
|
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */ |
|
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */ |
|
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */ |
|
392 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */ |
|
393 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */ |
|
394 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */ |
|
395 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */ |
|
396 |
|
/* EBCDIC-only character type table, indexed by byte value (256 entries).
check_escape() masks an entry with 0x0E and treats the character as "not
alphanumeric" when the result is zero. In the data below the digits carry 0x1c,
the letters a-f and A-F carry 0x1a and the remaining letters carry 0x12; 0x04
and 0x08 are the decimal-digit and hex-digit bits used by digitab.
NOTE(review): the meaning of the other bits (0x01, 0x02, 0x10, 0x80) is
presumably the same as in the main chartables - confirm against their
definition, which is not part of this file section. */
397 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */ |
|
398 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */ |
|
399 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */ |
|
400 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */ |
|
401 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ |
|
402 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */ |
|
403 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */ |
|
404 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */ |
|
405 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */ |
|
406 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */ |
|
407 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */ |
|
408 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */ |
|
409 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */ |
|
410 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */ |
|
411 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */ |
|
412 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */ |
|
413 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */ |
|
414 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */ |
|
415 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */ |
|
416 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */ |
|
417 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */ |
|
418 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */ |
|
419 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */ |
|
420 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */ |
|
421 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ |
|
422 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */ |
|
423 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */ |
|
424 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */ |
|
425 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */ |
|
426 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */ |
|
427 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */ |
|
428 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */ |
|
429 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */ |
|
430 #endif |
|
431 |
|
432 |
|
433 /* Forward declaration (prototype only) of the static function
compile_regex(), given here so that code which appears before its definition
can call it; this is what allows mutual recursion. The parameters are unnamed
in the prototype. */ |
|
434 |
|
435 static BOOL |
|
436 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int, |
|
437 int *, int *, branch_chain *, compile_data *, int *); |
|
438 |
|
439 |
|
440 |
|
441 /************************************************* |
|
442 * Find an error text * |
|
443 *************************************************/ |
|
444 |
|
445 /* The error texts are now all in one long string, to save on relocations. As |
|
446 some of the text is of unknown length, we can't use a table of offsets. |
|
447 Instead, just count through the strings. This is not a performance issue |
|
448 because it happens only when there has been a compilation error. |
|
449 |
|
450 Argument: the error number |
|
451 Returns: pointer to the error string |
|
452 */ |
|
453 |
|
454 static const char * |
|
455 find_error_text(int n) |
|
456 { |
|
457 const char *s = error_texts; |
|
458 for (; n > 0; n--) while (*s++ != 0) {}; |
|
459 return s; |
|
460 } |
|
461 |
|
462 |
|
463 /************************************************* |
|
464 * Handle escapes * |
|
465 *************************************************/ |
|
466 |
|
467 /* This function is called when a \ has been encountered. It either returns a |
|
468 positive value for a simple escape such as \n, or a negative value which |
|
469 encodes one of the more complicated things such as \d. A backreference to group |
|
470 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When |
|
471 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry, |
|
472 ptr is pointing at the \. On exit, it is on the final character of the escape |
|
473 sequence. |
|
474 |
|
475 Arguments: |
|
476 ptrptr points to the pattern position pointer |
|
477 errorcodeptr points to the errorcode variable |
|
478 bracount number of previous extracting brackets |
|
479 options the options bits |
|
480 isclass TRUE if inside a character class |
|
481 |
|
482 Returns: zero or positive => a data character |
|
483 negative => a special escape sequence |
|
484 on error, errorcodeptr is set |
|
485 */ |
|
486 |
|
487 static int |
|
488 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount, |
|
489 int options, BOOL isclass) |
|
490 { |
|
491 BOOL utf8 = (options & PCRE_UTF8) != 0; |
|
492 const uschar *ptr = *ptrptr + 1; |
|
493 int c, i; |
|
494 |
|
495 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ |
|
496 ptr--; /* Set pointer back to the last byte */ |
|
497 |
|
498 /* If backslash is at the end of the pattern, it's an error. */ |
|
499 |
|
500 if (c == 0) *errorcodeptr = ERR1; |
|
501 |
|
502 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup |
|
503 in a table. A non-zero result is something that can be returned immediately. |
|
504 Otherwise further processing may be required. */ |
|
505 |
|
506 #ifndef EBCDIC /* ASCII coding */ |
|
507 else if (c < '0' || c > 'z') {} /* Not alphanumeric */ |
|
508 else if ((i = escapes[c - '0']) != 0) c = i; |
|
509 |
|
510 #else /* EBCDIC coding */ |
|
511 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */ |
|
512 else if ((i = escapes[c - 0x48]) != 0) c = i; |
|
513 #endif |
|
514 |
|
515 /* Escapes that need further processing, or are illegal. */ |
|
516 |
|
517 else |
|
518 { |
|
519 const uschar *oldptr; |
|
520 BOOL braced, negated; |
|
521 |
|
522 switch (c) |
|
523 { |
|
524 /* A number of Perl escapes are not handled by PCRE. We give an explicit |
|
525 error. */ |
|
526 |
|
527 case 'l': |
|
528 case 'L': |
|
529 case 'N': |
|
530 case 'u': |
|
531 case 'U': |
|
532 *errorcodeptr = ERR37; |
|
533 break; |
|
534 |
|
535 /* \g must be followed by one of a number of specific things: |
|
536 |
|
537 (1) A number, either plain or braced. If positive, it is an absolute |
|
538 backreference. If negative, it is a relative backreference. This is a Perl |
|
539 5.10 feature. |
|
540 |
|
541 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This |
|
542 is part of Perl's movement towards a unified syntax for back references. As |
|
543 this is synonymous with \k{name}, we fudge it up by pretending it really |
|
544 was \k. |
|
545 |
|
546 (3) For Oniguruma compatibility we also support \g followed by a name or a |
|
547 number either in angle brackets or in single quotes. However, these are |
|
548 (possibly recursive) subroutine calls, _not_ backreferences. Just return |
|
549 the -ESC_g code (cf \k). */ |
|
550 |
|
551 case 'g': |
|
552 if (ptr[1] == '<' || ptr[1] == '\'') |
|
553 { |
|
554 c = -ESC_g; |
|
555 break; |
|
556 } |
|
557 |
|
558 /* Handle the Perl-compatible cases */ |
|
559 |
|
560 if (ptr[1] == '{') |
|
561 { |
|
562 const uschar *p; |
|
563 for (p = ptr+2; *p != 0 && *p != '}'; p++) |
|
564 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break; |
|
565 if (*p != 0 && *p != '}') |
|
566 { |
|
567 c = -ESC_k; |
|
568 break; |
|
569 } |
|
570 braced = TRUE; |
|
571 ptr++; |
|
572 } |
|
573 else braced = FALSE; |
|
574 |
|
575 if (ptr[1] == '-') |
|
576 { |
|
577 negated = TRUE; |
|
578 ptr++; |
|
579 } |
|
580 else negated = FALSE; |
|
581 |
|
582 c = 0; |
|
583 while ((digitab[ptr[1]] & ctype_digit) != 0) |
|
584 c = c * 10 + *(++ptr) - '0'; |
|
585 |
|
586 if (c < 0) /* Integer overflow */ |
|
587 { |
|
588 *errorcodeptr = ERR61; |
|
589 break; |
|
590 } |
|
591 |
|
592 if (braced && *(++ptr) != '}') |
|
593 { |
|
594 *errorcodeptr = ERR57; |
|
595 break; |
|
596 } |
|
597 |
|
598 if (c == 0) |
|
599 { |
|
600 *errorcodeptr = ERR58; |
|
601 break; |
|
602 } |
|
603 |
|
604 if (negated) |
|
605 { |
|
606 if (c > bracount) |
|
607 { |
|
608 *errorcodeptr = ERR15; |
|
609 break; |
|
610 } |
|
611 c = bracount - (c - 1); |
|
612 } |
|
613 |
|
614 c = -(ESC_REF + c); |
|
615 break; |
|
616 |
|
617 /* The handling of escape sequences consisting of a string of digits |
|
618 starting with one that is not zero is not straightforward. By experiment, |
|
619 the way Perl works seems to be as follows: |
|
620 |
|
621 Outside a character class, the digits are read as a decimal number. If the |
|
622 number is less than 10, or if there are that many previous extracting |
|
623 left brackets, then it is a back reference. Otherwise, up to three octal |
|
624 digits are read to form an escaped byte. Thus \123 is likely to be octal |
|
625 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal |
|
626 value is greater than 377, the least significant 8 bits are taken. Inside a |
|
627 character class, \ followed by a digit is always an octal number. */ |
|
628 |
|
629 case '1': case '2': case '3': case '4': case '5': |
|
630 case '6': case '7': case '8': case '9': |
|
631 |
|
632 if (!isclass) |
|
633 { |
|
634 oldptr = ptr; |
|
635 c -= '0'; |
|
636 while ((digitab[ptr[1]] & ctype_digit) != 0) |
|
637 c = c * 10 + *(++ptr) - '0'; |
|
638 if (c < 0) /* Integer overflow */ |
|
639 { |
|
640 *errorcodeptr = ERR61; |
|
641 break; |
|
642 } |
|
643 if (c < 10 || c <= bracount) |
|
644 { |
|
645 c = -(ESC_REF + c); |
|
646 break; |
|
647 } |
|
648 ptr = oldptr; /* Put the pointer back and fall through */ |
|
649 } |
|
650 |
|
651 /* Handle an octal number following \. If the first digit is 8 or 9, Perl |
|
652 generates a binary zero byte and treats the digit as a following literal. |
|
653 Thus we have to pull back the pointer by one. */ |
|
654 |
|
655 if ((c = *ptr) >= '8') |
|
656 { |
|
657 ptr--; |
|
658 c = 0; |
|
659 break; |
|
660 } |
|
661 |
|
662 /* \0 always starts an octal number, but we may drop through to here with a |
|
663 larger first octal digit. The original code used just to take the least |
|
664 significant 8 bits of octal numbers (I think this is what early Perls used |
|
665 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more |
|
666 than 3 octal digits. */ |
|
667 |
|
668 case '0': |
|
669 c -= '0'; |
|
670 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7') |
|
671 c = c * 8 + *(++ptr) - '0'; |
|
672 if (!utf8 && c > 255) *errorcodeptr = ERR51; |
|
673 break; |
|
674 |
|
675 /* \x is complicated. \x{ddd} is a character number which can be greater |
|
676 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is |
|
677 treated as a data character. */ |
|
678 |
|
679 case 'x': |
|
680 if (ptr[1] == '{') |
|
681 { |
|
682 const uschar *pt = ptr + 2; |
|
683 int count = 0; |
|
684 |
|
685 c = 0; |
|
686 while ((digitab[*pt] & ctype_xdigit) != 0) |
|
687 { |
|
688 register int cc = *pt++; |
|
689 if (c == 0 && cc == '0') continue; /* Leading zeroes */ |
|
690 count++; |
|
691 |
|
692 #ifndef EBCDIC /* ASCII coding */ |
|
693 if (cc >= 'a') cc -= 32; /* Convert to upper case */ |
|
694 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10)); |
|
695 #else /* EBCDIC coding */ |
|
696 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */ |
|
697 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10)); |
|
698 #endif |
|
699 } |
|
700 |
|
701 if (*pt == '}') |
|
702 { |
|
703 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34; |
|
704 ptr = pt; |
|
705 break; |
|
706 } |
|
707 |
|
708 /* If the sequence of hex digits does not end with '}', then we don't |
|
709 recognize this construct; fall through to the normal \x handling. */ |
|
710 } |
|
711 |
|
712 /* Read just a single-byte hex-defined char */ |
|
713 |
|
714 c = 0; |
|
715 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0) |
|
716 { |
|
717 int cc; /* Some compilers don't like ++ */ |
|
718 cc = *(++ptr); /* in initializers */ |
|
719 #ifndef EBCDIC /* ASCII coding */ |
|
720 if (cc >= 'a') cc -= 32; /* Convert to upper case */ |
|
721 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10)); |
|
722 #else /* EBCDIC coding */ |
|
723 if (cc <= 'z') cc += 64; /* Convert to upper case */ |
|
724 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10)); |
|
725 #endif |
|
726 } |
|
727 break; |
|
728 |
|
729 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped. |
|
730 This coding is ASCII-specific, but then the whole concept of \cx is |
|
731 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */ |
|
732 |
|
733 case 'c': |
|
734 c = *(++ptr); |
|
735 if (c == 0) |
|
736 { |
|
737 *errorcodeptr = ERR2; |
|
738 break; |
|
739 } |
|
740 |
|
741 #ifndef EBCDIC /* ASCII coding */ |
|
742 if (c >= 'a' && c <= 'z') c -= 32; |
|
743 c ^= 0x40; |
|
744 #else /* EBCDIC coding */ |
|
745 if (c >= 'a' && c <= 'z') c += 64; |
|
746 c ^= 0xC0; |
|
747 #endif |
|
748 break; |
|
749 |
|
750 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any |
|
751 other alphanumeric following \ is an error if PCRE_EXTRA was set; |
|
752 otherwise, for Perl compatibility, it is a literal. This code looks a bit |
|
753 odd, but there used to be some cases other than the default, and there may |
|
754 be again in future, so I haven't "optimized" it. */ |
|
755 |
|
756 default: |
|
757 if ((options & PCRE_EXTRA) != 0) switch(c) |
|
758 { |
|
759 default: |
|
760 *errorcodeptr = ERR3; |
|
761 break; |
|
762 } |
|
763 break; |
|
764 } |
|
765 } |
|
766 |
|
767 *ptrptr = ptr; |
|
768 return c; |
|
769 } |
|
770 |
|
771 |
|
772 |
|
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Parse the property name that follows \P or \p and look it up in the table
of property names. This code exists only when PCRE is compiled with support
for Unicode properties. On entry, *ptrptr points at the P or p. On exit, it
points at the final character of the escape sequence.

The name is either the single character after the P or p, or a sequence
enclosed in braces, which may begin with ^ to request negation. A braced name
has to fit, together with its terminating zero, in a 32-byte buffer.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
                   (it is not written if the pattern ends straight after the
                   P or p)
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable; it is set to ERR46 for a
                   malformed sequence and to ERR47 for an unrecognized name

Returns:         the type field of the matching _pcre_utt entry,
                   or -1 for an invalid type
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *p = *ptrptr;
char propname[32];
int ch, len, lo, hi;

ch = *(++p);
if (ch == 0) goto MALFORMED;

*negptr = FALSE;

/* The short form is a single character that is the whole name */

if (ch != '{')
  {
  propname[0] = ch;
  propname[1] = 0;
  }

/* The long form is {name} or {^name}. Copy characters until the closing
brace; running out of pattern or out of buffer space is an error. */

else
  {
  if (p[1] == '^')
    {
    *negptr = TRUE;
    p++;
    }

  len = 0;
  do
    {
    ch = *(++p);
    if (ch == 0) goto MALFORMED;
    if (ch == '}') break;
    propname[len++] = ch;
    }
  while (len < (int)sizeof(propname) - 1);

  if (ch != '}') goto MALFORMED;
  propname[len] = 0;
  }

*ptrptr = p;

/* Binary chop over the name table, using the half-open range [lo, hi) */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(propname, _pcre_utt_names + _pcre_utt[mid].name_offset);

  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  }

/* The name is well formed but is not in the table; *ptrptr has already been
updated above. */

*errorcodeptr = ERR47;
return -1;

MALFORMED:
*errorcodeptr = ERR46;
*ptrptr = p;
return -1;
}
#endif
|
862 |
|
863 |
|
864 |
|
865 |
|
866 /************************************************* |
|
867 * Check for counted repeat * |
|
868 *************************************************/ |
|
869 |
|
870 /* This function is called when a '{' is encountered in a place where it might |
|
871 start a quantifier. It looks ahead to see if it really is a quantifier or not. |
|
872 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd} |
|
873 where the ddds are digits. |
|
874 |
|
875 Arguments: |
|
876 p pointer to the first char after '{' |
|
877 |
|
878 Returns: TRUE or FALSE |
|
879 */ |
|
880 |
|
881 static BOOL |
|
882 is_counted_repeat(const uschar *p) |
|
883 { |
|
884 if ((digitab[*p++] & ctype_digit) == 0) return FALSE; |
|
885 while ((digitab[*p] & ctype_digit) != 0) p++; |
|
886 if (*p == '}') return TRUE; |
|
887 |
|
888 if (*p++ != ',') return FALSE; |
|
889 if (*p == '}') return TRUE; |
|
890 |
|
891 if ((digitab[*p++] & ctype_digit) == 0) return FALSE; |
|
892 while ((digitab[*p] & ctype_digit) != 0) p++; |
|
893 |
|
894 return (*p == '}'); |
|
895 } |
|
896 |
|
897 |
|
898 |
|
899 /************************************************* |
|
900 * Read repeat counts * |
|
901 *************************************************/ |
|
902 |
|
903 /* Read an item of the form {n,m} and return the values. This is called only |
|
904 after is_counted_repeat() has confirmed that a repeat-count quantifier exists, |
|
905 so the syntax is guaranteed to be correct, but we need to check the values. |
|
906 |
|
907 Arguments: |
|
908 p pointer to first char after '{' |
|
909 minp pointer to int for min |
|
910 maxp pointer to int for max |
|
911 returned as -1 if no max |
|
912 errorcodeptr points to error code variable |
|
913 |
|
914 Returns: pointer to '}' on success; |
|
915 current ptr on error, with errorcodeptr set non-zero |
|
916 */ |
|
917 |
|
918 static const uschar * |
|
919 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr) |
|
920 { |
|
921 int min = 0; |
|
922 int max = -1; |
|
923 |
|
924 /* Read the minimum value and do a paranoid check: a negative value indicates |
|
925 an integer overflow. */ |
|
926 |
|
927 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0'; |
|
928 if (min < 0 || min > 65535) |
|
929 { |
|
930 *errorcodeptr = ERR5; |
|
931 return p; |
|
932 } |
|
933 |
|
934 /* Read the maximum value if there is one, and again do a paranoid on its size. |
|
935 Also, max must not be less than min. */ |
|
936 |
|
937 if (*p == '}') max = min; else |
|
938 { |
|
939 if (*(++p) != '}') |
|
940 { |
|
941 max = 0; |
|
942 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0'; |
|
943 if (max < 0 || max > 65535) |
|
944 { |
|
945 *errorcodeptr = ERR5; |
|
946 return p; |
|
947 } |
|
948 if (max < min) |
|
949 { |
|
950 *errorcodeptr = ERR4; |
|
951 return p; |
|
952 } |
|
953 } |
|
954 } |
|
955 |
|
956 /* Fill in the required variables, and pass back the pointer to the terminating |
|
957 '}'. */ |
|
958 |
|
959 *minp = min; |
|
960 *maxp = max; |
|
961 return p; |
|
962 } |
|
963 |
|
964 |
|
965 |
|
966 /************************************************* |
|
967 * Find forward referenced subpattern * |
|
968 *************************************************/ |
|
969 |
|
970 /* This function scans along a pattern's text looking for capturing |
|
971 subpatterns, and counting them. If it finds a named pattern that matches the |
|
972 name it is given, it returns its number. Alternatively, if the name is NULL, it |
|
973 returns when it reaches a given numbered subpattern. This is used for forward |
|
974 references to subpatterns. We know that if (?P< is encountered, the name will |
|
975 be terminated by '>' because that is checked in the first pass. |
|
976 |
|
977 Arguments: |
|
978 ptr current position in the pattern |
|
979 cd compile background data |
|
980 name name to seek, or NULL if seeking a numbered subpattern |
|
981 lorn name length, or subpattern number if name is NULL |
|
982 xmode TRUE if we are in /x mode |
|
983 |
|
984 Returns: the number of the named subpattern, or -1 if not found |
|
985 */ |
|
986 |
|
987 static int |
|
988 find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn, |
|
989 BOOL xmode) |
|
990 { |
|
991 const uschar *thisname; |
|
992 int count = cd->bracount; |
|
993 |
|
994 for (; *ptr != 0; ptr++) |
|
995 { |
|
996 int term; |
|
997 |
|
998 /* Skip over backslashed characters and also entire \Q...\E */ |
|
999 |
|
1000 if (*ptr == '\\') |
|
1001 { |
|
1002 if (*(++ptr) == 0) return -1; |
|
1003 if (*ptr == 'Q') for (;;) |
|
1004 { |
|
1005 while (*(++ptr) != 0 && *ptr != '\\') {}; |
|
1006 if (*ptr == 0) return -1; |
|
1007 if (*(++ptr) == 'E') break; |
|
1008 } |
|
1009 continue; |
|
1010 } |
|
1011 |
|
1012 /* Skip over character classes; this logic must be similar to the way they |
|
1013 are handled for real. If the first character is '^', skip it. Also, if the |
|
1014 first few characters (either before or after ^) are \Q\E or \E we skip them |
|
1015 too. This makes for compatibility with Perl. */ |
|
1016 |
|
1017 if (*ptr == '[') |
|
1018 { |
|
1019 BOOL negate_class = FALSE; |
|
1020 for (;;) |
|
1021 { |
|
1022 int c = *(++ptr); |
|
1023 if (c == '\\') |
|
1024 { |
|
1025 if (ptr[1] == 'E') ptr++; |
|
1026 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3; |
|
1027 else break; |
|
1028 } |
|
1029 else if (!negate_class && c == '^') |
|
1030 negate_class = TRUE; |
|
1031 else break; |
|
1032 } |
|
1033 |
|
1034 /* If the next character is ']', it is a data character that must be |
|
1035 skipped, except in JavaScript compatibility mode. */ |
|
1036 |
|
1037 if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0) |
|
1038 ptr++; |
|
1039 |
|
1040 while (*(++ptr) != ']') |
|
1041 { |
|
1042 if (*ptr == 0) return -1; |
|
1043 if (*ptr == '\\') |
|
1044 { |
|
1045 if (*(++ptr) == 0) return -1; |
|
1046 if (*ptr == 'Q') for (;;) |
|
1047 { |
|
1048 while (*(++ptr) != 0 && *ptr != '\\') {}; |
|
1049 if (*ptr == 0) return -1; |
|
1050 if (*(++ptr) == 'E') break; |
|
1051 } |
|
1052 continue; |
|
1053 } |
|
1054 } |
|
1055 continue; |
|
1056 } |
|
1057 |
|
1058 /* Skip comments in /x mode */ |
|
1059 |
|
1060 if (xmode && *ptr == '#') |
|
1061 { |
|
1062 while (*(++ptr) != 0 && *ptr != '\n') {}; |
|
1063 if (*ptr == 0) return -1; |
|
1064 continue; |
|
1065 } |
|
1066 |
|
1067 /* An opening parens must now be a real metacharacter */ |
|
1068 |
|
1069 if (*ptr != '(') continue; |
|
1070 if (ptr[1] != '?' && ptr[1] != '*') |
|
1071 { |
|
1072 count++; |
|
1073 if (name == NULL && count == lorn) return count; |
|
1074 continue; |
|
1075 } |
|
1076 |
|
1077 ptr += 2; |
|
1078 if (*ptr == 'P') ptr++; /* Allow optional P */ |
|
1079 |
|
1080 /* We have to disambiguate (?<! and (?<= from (?<name> */ |
|
1081 |
|
1082 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') && |
|
1083 *ptr != '\'') |
|
1084 continue; |
|
1085 |
|
1086 count++; |
|
1087 |
|
1088 if (name == NULL && count == lorn) return count; |
|
1089 term = *ptr++; |
|
1090 if (term == '<') term = '>'; |
|
1091 thisname = ptr; |
|
1092 while (*ptr != term) ptr++; |
|
1093 if (name != NULL && lorn == ptr - thisname && |
|
1094 strncmp((const char *)name, (const char *)thisname, lorn) == 0) |
|
1095 return count; |
|
1096 } |
|
1097 |
|
1098 return -1; |
|
1099 } |
|
1100 |
|
1101 |
|
1102 |
|
1103 /************************************************* |
|
1104 * Find first significant op code * |
|
1105 *************************************************/ |
|
1106 |
|
1107 /* This is called by several functions that scan a compiled expression looking |
|
1108 for a fixed first character, or an anchoring op code etc. It skips over things |
|
1109 that do not influence this. For some calls, a change of option is important. |
|
1110 For some calls, it makes sense to skip negative forward and all backward |
|
1111 assertions, and also the \b assertion; for others it does not. |
|
1112 |
|
1113 Arguments: |
|
1114 code pointer to the start of the group |
|
1115 options pointer to external options |
|
1116 optbit the option bit whose changing is significant, or |
|
1117 zero if none are |
|
1118 skipassert TRUE if certain assertions are to be skipped |
|
1119 |
|
1120 Returns: pointer to the first significant opcode |
|
1121 */ |
|
1122 |
|
1123 static const uschar* |
|
1124 first_significant_code(const uschar *code, int *options, int optbit, |
|
1125 BOOL skipassert) |
|
1126 { |
|
1127 for (;;) |
|
1128 { |
|
1129 switch ((int)*code) |
|
1130 { |
|
1131 case OP_OPT: |
|
1132 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit)) |
|
1133 *options = (int)code[1]; |
|
1134 code += 2; |
|
1135 break; |
|
1136 |
|
1137 case OP_ASSERT_NOT: |
|
1138 case OP_ASSERTBACK: |
|
1139 case OP_ASSERTBACK_NOT: |
|
1140 if (!skipassert) return code; |
|
1141 do code += GET(code, 1); while (*code == OP_ALT); |
|
1142 code += _pcre_OP_lengths[*code]; |
|
1143 break; |
|
1144 |
|
1145 case OP_WORD_BOUNDARY: |
|
1146 case OP_NOT_WORD_BOUNDARY: |
|
1147 if (!skipassert) return code; |
|
1148 /* Fall through */ |
|
1149 |
|
1150 case OP_CALLOUT: |
|
1151 case OP_CREF: |
|
1152 case OP_RREF: |
|
1153 case OP_DEF: |
|
1154 code += _pcre_OP_lengths[*code]; |
|
1155 break; |
|
1156 |
|
1157 default: |
|
1158 return code; |
|
1159 } |
|
1160 } |
|
1161 /* Control never reaches here */ |
|
1162 } |
|
1163 |
|
1164 |
|
1165 |
|
1166 |
|
1167 /************************************************* |
|
1168 * Find the fixed length of a pattern * |
|
1169 *************************************************/ |
|
1170 |
|
1171 /* Scan a pattern and compute the fixed length of subject that will match it, |
|
1172 if the length is fixed. This is needed for dealing with backward assertions. |
|
1173 In UTF8 mode, the result is in characters rather than bytes. |
|
1174 |
|
1175 Arguments: |
|
1176 code points to the start of the pattern (the bracket) |
|
1177 options the compiling options |
|
1178 |
|
1179 Returns: the fixed length, or -1 if there is no fixed length, |
|
1180 or -2 if \C was encountered |
|
1181 */ |
|
1182 |
|
1183 static int |
|
1184 find_fixedlength(uschar *code, int options) |
|
1185 { |
|
1186 int length = -1; |
|
1187 |
|
1188 register int branchlength = 0; |
|
1189 register uschar *cc = code + 1 + LINK_SIZE; |
|
1190 |
|
1191 /* Scan along the opcodes for this branch. If we get to the end of the |
|
1192 branch, check the length against that of the other branches. */ |
|
1193 |
|
1194 for (;;) |
|
1195 { |
|
1196 int d; |
|
1197 register int op = *cc; |
|
1198 switch (op) |
|
1199 { |
|
1200 case OP_CBRA: |
|
1201 case OP_BRA: |
|
1202 case OP_ONCE: |
|
1203 case OP_COND: |
|
1204 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options); |
|
1205 if (d < 0) return d; |
|
1206 branchlength += d; |
|
1207 do cc += GET(cc, 1); while (*cc == OP_ALT); |
|
1208 cc += 1 + LINK_SIZE; |
|
1209 break; |
|
1210 |
|
1211 /* Reached end of a branch; if it's a ket it is the end of a nested |
|
1212 call. If it's ALT it is an alternation in a nested call. If it is |
|
1213 END it's the end of the outer call. All can be handled by the same code. */ |
|
1214 |
|
1215 case OP_ALT: |
|
1216 case OP_KET: |
|
1217 case OP_KETRMAX: |
|
1218 case OP_KETRMIN: |
|
1219 case OP_END: |
|
1220 if (length < 0) length = branchlength; |
|
1221 else if (length != branchlength) return -1; |
|
1222 if (*cc != OP_ALT) return length; |
|
1223 cc += 1 + LINK_SIZE; |
|
1224 branchlength = 0; |
|
1225 break; |
|
1226 |
|
1227 /* Skip over assertive subpatterns */ |
|
1228 |
|
1229 case OP_ASSERT: |
|
1230 case OP_ASSERT_NOT: |
|
1231 case OP_ASSERTBACK: |
|
1232 case OP_ASSERTBACK_NOT: |
|
1233 do cc += GET(cc, 1); while (*cc == OP_ALT); |
|
1234 /* Fall through */ |
|
1235 |
|
1236 /* Skip over things that don't match chars */ |
|
1237 |
|
1238 case OP_REVERSE: |
|
1239 case OP_CREF: |
|
1240 case OP_RREF: |
|
1241 case OP_DEF: |
|
1242 case OP_OPT: |
|
1243 case OP_CALLOUT: |
|
1244 case OP_SOD: |
|
1245 case OP_SOM: |
|
1246 case OP_EOD: |
|
1247 case OP_EODN: |
|
1248 case OP_CIRC: |
|
1249 case OP_DOLL: |
|
1250 case OP_NOT_WORD_BOUNDARY: |
|
1251 case OP_WORD_BOUNDARY: |
|
1252 cc += _pcre_OP_lengths[*cc]; |
|
1253 break; |
|
1254 |
|
1255 /* Handle literal characters */ |
|
1256 |
|
1257 case OP_CHAR: |
|
1258 case OP_CHARNC: |
|
1259 case OP_NOT: |
|
1260 branchlength++; |
|
1261 cc += 2; |
|
1262 #ifdef SUPPORT_UTF8 |
|
1263 if ((options & PCRE_UTF8) != 0) |
|
1264 { |
|
1265 while ((*cc & 0xc0) == 0x80) cc++; |
|
1266 } |
|
1267 #endif |
|
1268 break; |
|
1269 |
|
1270 /* Handle exact repetitions. The count is already in characters, but we |
|
1271 need to skip over a multibyte character in UTF8 mode. */ |
|
1272 |
|
1273 case OP_EXACT: |
|
1274 branchlength += GET2(cc,1); |
|
1275 cc += 4; |
|
1276 #ifdef SUPPORT_UTF8 |
|
1277 if ((options & PCRE_UTF8) != 0) |
|
1278 { |
|
1279 while((*cc & 0x80) == 0x80) cc++; |
|
1280 } |
|
1281 #endif |
|
1282 break; |
|
1283 |
|
1284 case OP_TYPEEXACT: |
|
1285 branchlength += GET2(cc,1); |
|
1286 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2; |
|
1287 cc += 4; |
|
1288 break; |
|
1289 |
|
1290 /* Handle single-char matchers */ |
|
1291 |
|
1292 case OP_PROP: |
|
1293 case OP_NOTPROP: |
|
1294 cc += 2; |
|
1295 /* Fall through */ |
|
1296 |
|
1297 case OP_NOT_DIGIT: |
|
1298 case OP_DIGIT: |
|
1299 case OP_NOT_WHITESPACE: |
|
1300 case OP_WHITESPACE: |
|
1301 case OP_NOT_WORDCHAR: |
|
1302 case OP_WORDCHAR: |
|
1303 case OP_ANY: |
|
1304 case OP_ALLANY: |
|
1305 branchlength++; |
|
1306 cc++; |
|
1307 break; |
|
1308 |
|
1309 /* The single-byte matcher isn't allowed */ |
|
1310 |
|
1311 case OP_ANYBYTE: |
|
1312 return -2; |
|
1313 |
|
1314 /* Check a class for variable quantification */ |
|
1315 |
|
1316 #ifdef SUPPORT_UTF8 |
|
1317 case OP_XCLASS: |
|
1318 cc += GET(cc, 1) - 33; |
|
1319 /* Fall through */ |
|
1320 #endif |
|
1321 |
|
1322 case OP_CLASS: |
|
1323 case OP_NCLASS: |
|
1324 cc += 33; |
|
1325 |
|
1326 switch (*cc) |
|
1327 { |
|
1328 case OP_CRSTAR: |
|
1329 case OP_CRMINSTAR: |
|
1330 case OP_CRQUERY: |
|
1331 case OP_CRMINQUERY: |
|
1332 return -1; |
|
1333 |
|
1334 case OP_CRRANGE: |
|
1335 case OP_CRMINRANGE: |
|
1336 if (GET2(cc,1) != GET2(cc,3)) return -1; |
|
1337 branchlength += GET2(cc,1); |
|
1338 cc += 5; |
|
1339 break; |
|
1340 |
|
1341 default: |
|
1342 branchlength++; |
|
1343 } |
|
1344 break; |
|
1345 |
|
1346 /* Anything else is variable length */ |
|
1347 |
|
1348 default: |
|
1349 return -1; |
|
1350 } |
|
1351 } |
|
1352 /* Control never gets here */ |
|
1353 } |
|
1354 |
|
1355 |
|
1356 |
|
1357 |
|
1358 /************************************************* |
|
1359 * Scan compiled regex for numbered bracket * |
|
1360 *************************************************/ |
|
1361 |
|
1362 /* This little function scans through a compiled pattern until it finds a |
|
1363 capturing bracket with the given number. |
|
1364 |
|
1365 Arguments: |
|
1366 code points to start of expression |
|
1367 utf8 TRUE in UTF-8 mode |
|
1368 number the required bracket number |
|
1369 |
|
1370 Returns: pointer to the opcode for the bracket, or NULL if not found |
|
1371 */ |
|
1372 |
|
1373 static const uschar * |
|
1374 find_bracket(const uschar *code, BOOL utf8, int number) |
|
1375 { |
|
1376 for (;;) |
|
1377 { |
|
1378 register int c = *code; |
|
1379 if (c == OP_END) return NULL; |
|
1380 |
|
1381 /* XCLASS is used for classes that cannot be represented just by a bit |
|
1382 map. This includes negated single high-valued characters. The length in |
|
1383 the table is zero; the actual length is stored in the compiled code. */ |
|
1384 |
|
1385 if (c == OP_XCLASS) code += GET(code, 1); |
|
1386 |
|
1387 /* Handle capturing bracket */ |
|
1388 |
|
1389 else if (c == OP_CBRA) |
|
1390 { |
|
1391 int n = GET2(code, 1+LINK_SIZE); |
|
1392 if (n == number) return (uschar *)code; |
|
1393 code += _pcre_OP_lengths[c]; |
|
1394 } |
|
1395 |
|
1396 /* Otherwise, we can get the item's length from the table, except that for |
|
1397 repeated character types, we have to test for \p and \P, which have an extra |
|
1398 two bytes of parameters. */ |
|
1399 |
|
1400 else |
|
1401 { |
|
1402 switch(c) |
|
1403 { |
|
1404 case OP_TYPESTAR: |
|
1405 case OP_TYPEMINSTAR: |
|
1406 case OP_TYPEPLUS: |
|
1407 case OP_TYPEMINPLUS: |
|
1408 case OP_TYPEQUERY: |
|
1409 case OP_TYPEMINQUERY: |
|
1410 case OP_TYPEPOSSTAR: |
|
1411 case OP_TYPEPOSPLUS: |
|
1412 case OP_TYPEPOSQUERY: |
|
1413 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; |
|
1414 break; |
|
1415 |
|
1416 case OP_TYPEUPTO: |
|
1417 case OP_TYPEMINUPTO: |
|
1418 case OP_TYPEEXACT: |
|
1419 case OP_TYPEPOSUPTO: |
|
1420 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; |
|
1421 break; |
|
1422 } |
|
1423 |
|
1424 /* Add in the fixed length from the table */ |
|
1425 |
|
1426 code += _pcre_OP_lengths[c]; |
|
1427 |
|
1428 /* In UTF-8 mode, opcodes that are followed by a character may be followed by |
|
1429 a multi-byte character. The length in the table is a minimum, so we have to |
|
1430 arrange to skip the extra bytes. */ |
|
1431 |
|
1432 #ifdef SUPPORT_UTF8 |
|
1433 if (utf8) switch(c) |
|
1434 { |
|
1435 case OP_CHAR: |
|
1436 case OP_CHARNC: |
|
1437 case OP_EXACT: |
|
1438 case OP_UPTO: |
|
1439 case OP_MINUPTO: |
|
1440 case OP_POSUPTO: |
|
1441 case OP_STAR: |
|
1442 case OP_MINSTAR: |
|
1443 case OP_POSSTAR: |
|
1444 case OP_PLUS: |
|
1445 case OP_MINPLUS: |
|
1446 case OP_POSPLUS: |
|
1447 case OP_QUERY: |
|
1448 case OP_MINQUERY: |
|
1449 case OP_POSQUERY: |
|
1450 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; |
|
1451 break; |
|
1452 } |
|
1453 #else |
|
1454 (void)(utf8); /* Keep compiler happy by referencing function argument */ |
|
1455 #endif |
|
1456 } |
|
1457 } |
|
1458 } |
|
1459 |
|
1460 |
|
1461 |
|
1462 /************************************************* |
|
1463 * Scan compiled regex for recursion reference * |
|
1464 *************************************************/ |
|
1465 |
|
1466 /* This little function scans through a compiled pattern until it finds an |
|
1467 instance of OP_RECURSE. |
|
1468 |
|
1469 Arguments: |
|
1470 code points to start of expression |
|
1471 utf8 TRUE in UTF-8 mode |
|
1472 |
|
1473 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found |
|
1474 */ |
|
1475 |
|
1476 static const uschar * |
|
1477 find_recurse(const uschar *code, BOOL utf8) |
|
1478 { |
|
1479 for (;;) |
|
1480 { |
|
1481 register int c = *code; |
|
1482 if (c == OP_END) return NULL; |
|
1483 if (c == OP_RECURSE) return code; |
|
1484 |
|
1485 /* XCLASS is used for classes that cannot be represented just by a bit |
|
1486 map. This includes negated single high-valued characters. The length in |
|
1487 the table is zero; the actual length is stored in the compiled code. */ |
|
1488 |
|
1489 if (c == OP_XCLASS) code += GET(code, 1); |
|
1490 |
|
1491 /* Otherwise, we can get the item's length from the table, except that for |
|
1492 repeated character types, we have to test for \p and \P, which have an extra |
|
1493 two bytes of parameters. */ |
|
1494 |
|
1495 else |
|
1496 { |
|
1497 switch(c) |
|
1498 { |
|
1499 case OP_TYPESTAR: |
|
1500 case OP_TYPEMINSTAR: |
|
1501 case OP_TYPEPLUS: |
|
1502 case OP_TYPEMINPLUS: |
|
1503 case OP_TYPEQUERY: |
|
1504 case OP_TYPEMINQUERY: |
|
1505 case OP_TYPEPOSSTAR: |
|
1506 case OP_TYPEPOSPLUS: |
|
1507 case OP_TYPEPOSQUERY: |
|
1508 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; |
|
1509 break; |
|
1510 |
|
1511 case OP_TYPEPOSUPTO: |
|
1512 case OP_TYPEUPTO: |
|
1513 case OP_TYPEMINUPTO: |
|
1514 case OP_TYPEEXACT: |
|
1515 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; |
|
1516 break; |
|
1517 } |
|
1518 |
|
1519 /* Add in the fixed length from the table */ |
|
1520 |
|
1521 code += _pcre_OP_lengths[c]; |
|
1522 |
|
1523 /* In UTF-8 mode, opcodes that are followed by a character may be followed |
|
1524 by a multi-byte character. The length in the table is a minimum, so we have |
|
1525 to arrange to skip the extra bytes. */ |
|
1526 |
|
1527 #ifdef SUPPORT_UTF8 |
|
1528 if (utf8) switch(c) |
|
1529 { |
|
1530 case OP_CHAR: |
|
1531 case OP_CHARNC: |
|
1532 case OP_EXACT: |
|
1533 case OP_UPTO: |
|
1534 case OP_MINUPTO: |
|
1535 case OP_POSUPTO: |
|
1536 case OP_STAR: |
|
1537 case OP_MINSTAR: |
|
1538 case OP_POSSTAR: |
|
1539 case OP_PLUS: |
|
1540 case OP_MINPLUS: |
|
1541 case OP_POSPLUS: |
|
1542 case OP_QUERY: |
|
1543 case OP_MINQUERY: |
|
1544 case OP_POSQUERY: |
|
1545 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; |
|
1546 break; |
|
1547 } |
|
1548 #else |
|
1549 (void)(utf8); /* Keep compiler happy by referencing function argument */ |
|
1550 #endif |
|
1551 } |
|
1552 } |
|
1553 } |
|
1554 |
|
1555 |
|
1556 |
|
1557 /************************************************* |
|
1558 * Scan compiled branch for non-emptiness * |
|
1559 *************************************************/ |
|
1560 |
|
1561 /* This function scans through a branch of a compiled pattern to see whether it |
|
1562 can match the empty string or not. It is called from could_be_empty() |
|
1563 below and from compile_branch() when checking for an unlimited repeat of a |
|
1564 group that can match nothing. Note that first_significant_code() skips over |
|
1565 backward and negative forward assertions when its final argument is TRUE. If we |
|
1566 hit an unclosed bracket, we return "empty" - this means we've struck an inner |
|
1567 bracket whose current branch will already have been scanned. |
|
1568 |
|
1569 Arguments: |
|
1570 code points to start of search |
|
1571 endcode points to where to stop |
|
1572 utf8 TRUE if in UTF8 mode |
|
1573 |
|
1574 Returns: TRUE if what is matched could be empty |
|
1575 */ |
|
1576 |
|
1577 static BOOL |
|
1578 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8) |
|
1579 { |
|
1580 register int c; |
|
1581 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE); |
|
1582 code < endcode; |
|
1583 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE)) |
|
1584 { |
|
1585 const uschar *ccode; |
|
1586 |
|
1587 c = *code; |
|
1588 |
|
1589 /* Skip over forward assertions; the other assertions are skipped by |
|
1590 first_significant_code() with a TRUE final argument. */ |
|
1591 |
|
1592 if (c == OP_ASSERT) |
|
1593 { |
|
1594 do code += GET(code, 1); while (*code == OP_ALT); |
|
1595 c = *code; |
|
1596 continue; |
|
1597 } |
|
1598 |
|
1599 /* Groups with zero repeats can of course be empty; skip them. */ |
|
1600 |
|
1601 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO) |
|
1602 { |
|
1603 code += _pcre_OP_lengths[c]; |
|
1604 do code += GET(code, 1); while (*code == OP_ALT); |
|
1605 c = *code; |
|
1606 continue; |
|
1607 } |
|
1608 |
|
1609 /* For other groups, scan the branches. */ |
|
1610 |
|
1611 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND) |
|
1612 { |
|
1613 BOOL empty_branch; |
|
1614 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */ |
|
1615 |
|
1616 /* Scan a closed bracket */ |
|
1617 |
|
1618 empty_branch = FALSE; |
|
1619 do |
|
1620 { |
|
1621 if (!empty_branch && could_be_empty_branch(code, endcode, utf8)) |
|
1622 empty_branch = TRUE; |
|
1623 code += GET(code, 1); |
|
1624 } |
|
1625 while (*code == OP_ALT); |
|
1626 if (!empty_branch) return FALSE; /* All branches are non-empty */ |
|
1627 c = *code; |
|
1628 continue; |
|
1629 } |
|
1630 |
|
1631 /* Handle the other opcodes */ |
|
1632 |
|
1633 switch (c) |
|
1634 { |
|
1635 /* Check for quantifiers after a class. XCLASS is used for classes that |
|
1636 cannot be represented just by a bit map. This includes negated single |
|
1637 high-valued characters. The length in _pcre_OP_lengths[] is zero; the |
|
1638 actual length is stored in the compiled code, so we must update "code" |
|
1639 here. */ |
|
1640 |
|
1641 #ifdef SUPPORT_UTF8 |
|
1642 case OP_XCLASS: |
|
1643 ccode = code += GET(code, 1); |
|
1644 goto CHECK_CLASS_REPEAT; |
|
1645 #endif |
|
1646 |
|
1647 case OP_CLASS: |
|
1648 case OP_NCLASS: |
|
1649 ccode = code + 33; |
|
1650 |
|
1651 #ifdef SUPPORT_UTF8 |
|
1652 CHECK_CLASS_REPEAT: |
|
1653 #endif |
|
1654 |
|
1655 switch (*ccode) |
|
1656 { |
|
1657 case OP_CRSTAR: /* These could be empty; continue */ |
|
1658 case OP_CRMINSTAR: |
|
1659 case OP_CRQUERY: |
|
1660 case OP_CRMINQUERY: |
|
1661 break; |
|
1662 |
|
1663 default: /* Non-repeat => class must match */ |
|
1664 case OP_CRPLUS: /* These repeats aren't empty */ |
|
1665 case OP_CRMINPLUS: |
|
1666 return FALSE; |
|
1667 |
|
1668 case OP_CRRANGE: |
|
1669 case OP_CRMINRANGE: |
|
1670 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */ |
|
1671 break; |
|
1672 } |
|
1673 break; |
|
1674 |
|
1675 /* Opcodes that must match a character */ |
|
1676 |
|
1677 case OP_PROP: |
|
1678 case OP_NOTPROP: |
|
1679 case OP_EXTUNI: |
|
1680 case OP_NOT_DIGIT: |
|
1681 case OP_DIGIT: |
|
1682 case OP_NOT_WHITESPACE: |
|
1683 case OP_WHITESPACE: |
|
1684 case OP_NOT_WORDCHAR: |
|
1685 case OP_WORDCHAR: |
|
1686 case OP_ANY: |
|
1687 case OP_ALLANY: |
|
1688 case OP_ANYBYTE: |
|
1689 case OP_CHAR: |
|
1690 case OP_CHARNC: |
|
1691 case OP_NOT: |
|
1692 case OP_PLUS: |
|
1693 case OP_MINPLUS: |
|
1694 case OP_POSPLUS: |
|
1695 case OP_EXACT: |
|
1696 case OP_NOTPLUS: |
|
1697 case OP_NOTMINPLUS: |
|
1698 case OP_NOTPOSPLUS: |
|
1699 case OP_NOTEXACT: |
|
1700 case OP_TYPEPLUS: |
|
1701 case OP_TYPEMINPLUS: |
|
1702 case OP_TYPEPOSPLUS: |
|
1703 case OP_TYPEEXACT: |
|
1704 return FALSE; |
|
1705 |
|
1706 /* These are going to continue, as they may be empty, but we have to |
|
1707 fudge the length for the \p and \P cases. */ |
|
1708 |
|
1709 case OP_TYPESTAR: |
|
1710 case OP_TYPEMINSTAR: |
|
1711 case OP_TYPEPOSSTAR: |
|
1712 case OP_TYPEQUERY: |
|
1713 case OP_TYPEMINQUERY: |
|
1714 case OP_TYPEPOSQUERY: |
|
1715 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; |
|
1716 break; |
|
1717 |
|
1718 /* Same for these */ |
|
1719 |
|
1720 case OP_TYPEUPTO: |
|
1721 case OP_TYPEMINUPTO: |
|
1722 case OP_TYPEPOSUPTO: |
|
1723 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; |
|
1724 break; |
|
1725 |
|
1726 /* End of branch */ |
|
1727 |
|
1728 case OP_KET: |
|
1729 case OP_KETRMAX: |
|
1730 case OP_KETRMIN: |
|
1731 case OP_ALT: |
|
1732 return TRUE; |
|
1733 |
|
1734 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO, |
|
1735 MINUPTO, and POSUPTO may be followed by a multibyte character */ |
|
1736 |
|
1737 #ifdef SUPPORT_UTF8 |
|
1738 case OP_STAR: |
|
1739 case OP_MINSTAR: |
|
1740 case OP_POSSTAR: |
|
1741 case OP_QUERY: |
|
1742 case OP_MINQUERY: |
|
1743 case OP_POSQUERY: |
|
1744 case OP_UPTO: |
|
1745 case OP_MINUPTO: |
|
1746 case OP_POSUPTO: |
|
1747 if (utf8) while ((code[2] & 0xc0) == 0x80) code++; |
|
1748 break; |
|
1749 #endif |
|
1750 } |
|
1751 } |
|
1752 |
|
1753 return TRUE; |
|
1754 } |
|
1755 |
|
1756 |
|
1757 |
|
1758 /************************************************* |
|
1759 * Scan compiled regex for non-emptiness * |
|
1760 *************************************************/ |
|
1761 |
|
1762 /* This function is called to check for left recursive calls. We want to check |
|
1763 the current branch of the current pattern to see if it could match the empty |
|
1764 string. If it could, we must look outwards for branches at other levels, |
|
1765 stopping when we pass beyond the bracket which is the subject of the recursion. |
|
1766 |
|
1767 Arguments: |
|
1768 code points to start of the recursion |
|
1769 endcode points to where to stop (current RECURSE item) |
|
1770 bcptr points to the chain of current (unclosed) branch starts |
|
1771 utf8 TRUE if in UTF-8 mode |
|
1772 |
|
1773 Returns: TRUE if what is matched could be empty |
|
1774 */ |
|
1775 |
|
1776 static BOOL |
|
1777 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr, |
|
1778 BOOL utf8) |
|
1779 { |
|
1780 while (bcptr != NULL && bcptr->current >= code) |
|
1781 { |
|
1782 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE; |
|
1783 bcptr = bcptr->outer; |
|
1784 } |
|
1785 return TRUE; |
|
1786 } |
|
1787 |
|
1788 |
|
1789 |
|
1790 /************************************************* |
|
1791 * Check for POSIX class syntax * |
|
1792 *************************************************/ |
|
1793 |
|
1794 /* This function is called when the sequence "[:" or "[." or "[=" is |
|
1795 encountered in a character class. It checks whether this is followed by a |
|
1796 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we |
|
1797 reach an unescaped ']' without the special preceding character, return FALSE. |
|
1798 |
|
1799 Originally, this function only recognized a sequence of letters between the |
|
1800 terminators, but it seems that Perl recognizes any sequence of characters, |
|
1801 though of course unknown POSIX names are subsequently rejected. Perl gives an |
|
1802 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE |
|
1803 didn't consider this to be a POSIX class. Likewise for [:1234:]. |
|
1804 |
|
1805 The problem in trying to be exactly like Perl is in the handling of escapes. We |
|
1806 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX |
|
1807 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code |
|
1808 below handles the special case of \], but does not try to do any other escape |
|
1809 processing. This makes it different from Perl for cases such as [:l\ower:] |
|
1810 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize |
|
1811 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does, |
|
1812 I think. |
|
1813 |
|
1814 Arguments: |
|
1815 ptr pointer to the initial [ |
|
1816 endptr where to return the end pointer |
|
1817 |
|
1818 Returns: TRUE or FALSE |
|
1819 */ |
|
1820 |
|
1821 static BOOL |
|
1822 check_posix_syntax(const uschar *ptr, const uschar **endptr) |
|
1823 { |
|
1824 int terminator; /* Don't combine these lines; the Solaris cc */ |
|
1825 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */ |
|
1826 for (++ptr; *ptr != 0; ptr++) |
|
1827 { |
|
1828 if (*ptr == '\\' && ptr[1] == ']') ptr++; else |
|
1829 { |
|
1830 if (*ptr == ']') return FALSE; |
|
1831 if (*ptr == terminator && ptr[1] == ']') |
|
1832 { |
|
1833 *endptr = ptr; |
|
1834 return TRUE; |
|
1835 } |
|
1836 } |
|
1837 } |
|
1838 return FALSE; |
|
1839 } |
|
1840 |
|
1841 |
|
1842 |
|
1843 |
|
1844 /************************************************* |
|
1845 * Check POSIX class name * |
|
1846 *************************************************/ |
|
1847 |
|
1848 /* This function is called to check the name given in a POSIX-style class entry |
|
1849 such as [:alnum:]. |
|
1850 |
|
1851 Arguments: |
|
1852 ptr points to the first letter |
|
1853 len the length of the name |
|
1854 |
|
1855 Returns: a value representing the name, or -1 if unknown |
|
1856 */ |
|
1857 |
|
1858 static int |
|
1859 check_posix_name(const uschar *ptr, int len) |
|
1860 { |
|
1861 const char *pn = posix_names; |
|
1862 register int yield = 0; |
|
1863 while (posix_name_lengths[yield] != 0) |
|
1864 { |
|
1865 if (len == posix_name_lengths[yield] && |
|
1866 strncmp((const char *)ptr, pn, len) == 0) return yield; |
|
1867 pn += posix_name_lengths[yield] + 1; |
|
1868 yield++; |
|
1869 } |
|
1870 return -1; |
|
1871 } |
|
1872 |
|
1873 |
|
1874 /************************************************* |
|
1875 * Adjust OP_RECURSE items in repeated group * |
|
1876 *************************************************/ |
|
1877 |
|
1878 /* OP_RECURSE items contain an offset from the start of the regex to the group |
|
1879 that is referenced. This means that groups can be replicated for fixed |
|
1880 repetition simply by copying (because the recursion is allowed to refer to |
|
1881 earlier groups that are outside the current group). However, when a group is |
|
1882 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is |
|
1883 inserted before it, after it has been compiled. This means that any OP_RECURSE |
|
1884 items within it that refer to the group itself or any contained groups have to |
|
1885 have their offsets adjusted. That one of the jobs of this function. Before it |
|
1886 is called, the partially compiled regex must be temporarily terminated with |
|
1887 OP_END. |
|
1888 |
|
1889 This function has been extended with the possibility of forward references for |
|
1890 recursions and subroutine calls. It must also check the list of such references |
|
1891 for the group we are dealing with. If it finds that one of the recursions in |
|
1892 the current group is on this list, it adjusts the offset in the list, not the |
|
1893 value in the reference (which is a group number). |
|
1894 |
|
1895 Arguments: |
|
1896 group points to the start of the group |
|
1897 adjust the amount by which the group is to be moved |
|
1898 utf8 TRUE in UTF-8 mode |
|
1899 cd contains pointers to tables etc. |
|
1900 save_hwm the hwm forward reference pointer at the start of the group |
|
1901 |
|
1902 Returns: nothing |
|
1903 */ |
|
1904 |
|
1905 static void |
|
1906 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd, |
|
1907 uschar *save_hwm) |
|
1908 { |
|
1909 uschar *ptr = group; |
|
1910 |
|
1911 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL) |
|
1912 { |
|
1913 int offset; |
|
1914 uschar *hc; |
|
1915 |
|
1916 /* See if this recursion is on the forward reference list. If so, adjust the |
|
1917 reference. */ |
|
1918 |
|
1919 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE) |
|
1920 { |
|
1921 offset = GET(hc, 0); |
|
1922 if (cd->start_code + offset == ptr + 1) |
|
1923 { |
|
1924 PUT(hc, 0, offset + adjust); |
|
1925 break; |
|
1926 } |
|
1927 } |
|
1928 |
|
1929 /* Otherwise, adjust the recursion offset if it's after the start of this |
|
1930 group. */ |
|
1931 |
|
1932 if (hc >= cd->hwm) |
|
1933 { |
|
1934 offset = GET(ptr, 1); |
|
1935 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust); |
|
1936 } |
|
1937 |
|
1938 ptr += 1 + LINK_SIZE; |
|
1939 } |
|
1940 } |
|
1941 |
|
1942 |
|
1943 |
|
1944 /************************************************* |
|
1945 * Insert an automatic callout point * |
|
1946 *************************************************/ |
|
1947 |
|
1948 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert |
|
1949 callout points before each pattern item. |
|
1950 |
|
1951 Arguments: |
|
1952 code current code pointer |
|
1953 ptr current pattern pointer |
|
1954 cd pointers to tables etc |
|
1955 |
|
1956 Returns: new code pointer |
|
1957 */ |
|
1958 |
|
1959 static uschar * |
|
1960 auto_callout(uschar *code, const uschar *ptr, compile_data *cd) |
|
1961 { |
|
1962 *code++ = OP_CALLOUT; |
|
1963 *code++ = 255; |
|
1964 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */ |
|
1965 PUT(code, LINK_SIZE, 0); /* Default length */ |
|
1966 return code + 2*LINK_SIZE; |
|
1967 } |
|
1968 |
|
1969 |
|
1970 |
|
1971 /************************************************* |
|
1972 * Complete a callout item * |
|
1973 *************************************************/ |
|
1974 |
|
1975 /* A callout item contains the length of the next item in the pattern, which |
|
1976 we can't fill in till after we have reached the relevant point. This is used |
|
1977 for both automatic and manual callouts. |
|
1978 |
|
1979 Arguments: |
|
1980 previous_callout points to previous callout item |
|
1981 ptr current pattern pointer |
|
1982 cd pointers to tables etc |
|
1983 |
|
1984 Returns: nothing |
|
1985 */ |
|
1986 |
|
1987 static void |
|
1988 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd) |
|
1989 { |
|
1990 int length = ptr - cd->start_pattern - GET(previous_callout, 2); |
|
1991 PUT(previous_callout, 2 + LINK_SIZE, length); |
|
1992 } |
|
1993 |
|
1994 |
|
1995 |
|
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range, in UTF-8 mode with UCP support,
this function scans upwards for a run of characters whose "other" case values
are consecutive. Each call returns the next such run and moves the start value
on, so that the caller can call again to continue from where this call stopped.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
  unsigned int ch = *cptr;
  unsigned int last_other = 0;

  /* Find the first character that has a different other case */

  while (ch <= d)
    {
    last_other = UCD_OTHERCASE(ch);
    if (last_other != ch) break;
    ch++;
    }

  if (ch > d) return FALSE;

  *ocptr = last_other;

  /* Extend the run for as long as the other-case values stay consecutive */

  ch++;
  while (ch <= d)
    {
    if (UCD_OTHERCASE(ch) != last_other + 1) break;
    last_other++;
    ch++;
    }

  *odptr = last_other;
  *cptr = ch;

  return TRUE;
}
#endif  /* SUPPORT_UCP */
|
2041 |
|
2042 |
|
2043 |
|
2044 /************************************************* |
|
2045 * Check if auto-possessifying is possible * |
|
2046 *************************************************/ |
|
2047 |
|
2048 /* This function is called for unlimited repeats of certain items, to see |
|
2049 whether the next thing could possibly match the repeated item. If not, it makes |
|
2050 sense to automatically possessify the repeated item. |
|
2051 |
|
2052 Arguments: |
|
2053 op_code the repeated op code |
|
2054 this data for this item, depends on the opcode |
|
2055 utf8 TRUE in UTF-8 mode |
|
2056 utf8_char used for utf8 character bytes, NULL if not relevant |
|
2057 ptr next character in pattern |
|
2058 options options bits |
|
2059 cd contains pointers to tables etc. |
|
2060 |
|
2061 Returns: TRUE if possessifying is wanted |
|
2062 */ |
|
2063 |
|
2064 static BOOL |
|
2065 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char, |
|
2066 const uschar *ptr, int options, compile_data *cd) |
|
2067 { |
|
2068 int next; |
|
2069 |
|
2070 /* Skip whitespace and comments in extended mode */ |
|
2071 |
|
2072 if ((options & PCRE_EXTENDED) != 0) |
|
2073 { |
|
2074 for (;;) |
|
2075 { |
|
2076 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; |
|
2077 if (*ptr == '#') |
|
2078 { |
|
2079 while (*(++ptr) != 0) |
|
2080 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } |
|
2081 } |
|
2082 else break; |
|
2083 } |
|
2084 } |
|
2085 |
|
2086 /* If the next item is one that we can handle, get its value. A non-negative |
|
2087 value is a character, a negative value is an escape value. */ |
|
2088 |
|
2089 if (*ptr == '\\') |
|
2090 { |
|
2091 int temperrorcode = 0; |
|
2092 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE); |
|
2093 if (temperrorcode != 0) return FALSE; |
|
2094 ptr++; /* Point after the escape sequence */ |
|
2095 } |
|
2096 |
|
2097 else if ((cd->ctypes[*ptr] & ctype_meta) == 0) |
|
2098 { |
|
2099 #ifdef SUPPORT_UTF8 |
|
2100 if (utf8) { GETCHARINC(next, ptr); } else |
|
2101 #endif |
|
2102 next = *ptr++; |
|
2103 } |
|
2104 |
|
2105 else return FALSE; |
|
2106 |
|
2107 /* Skip whitespace and comments in extended mode */ |
|
2108 |
|
2109 if ((options & PCRE_EXTENDED) != 0) |
|
2110 { |
|
2111 for (;;) |
|
2112 { |
|
2113 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; |
|
2114 if (*ptr == '#') |
|
2115 { |
|
2116 while (*(++ptr) != 0) |
|
2117 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } |
|
2118 } |
|
2119 else break; |
|
2120 } |
|
2121 } |
|
2122 |
|
2123 /* If the next thing is itself optional, we have to give up. */ |
|
2124 |
|
2125 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0) |
|
2126 return FALSE; |
|
2127 |
|
2128 /* Now compare the next item with the previous opcode. If the previous is a |
|
2129 positive single character match, "item" either contains the character or, if |
|
2130 "item" is greater than 127 in utf8 mode, the character's bytes are in |
|
2131 utf8_char. */ |
|
2132 |
|
2133 |
|
2134 /* Handle cases when the next item is a character. */ |
|
2135 |
|
2136 if (next >= 0) switch(op_code) |
|
2137 { |
|
2138 case OP_CHAR: |
|
2139 #ifdef SUPPORT_UTF8 |
|
2140 if (utf8 && item > 127) { GETCHAR(item, utf8_char); } |
|
2141 #else |
|
2142 (void)(utf8_char); /* Keep compiler happy by referencing function argument */ |
|
2143 #endif |
|
2144 return item != next; |
|
2145 |
|
2146 /* For CHARNC (caseless character) we must check the other case. If we have |
|
2147 Unicode property support, we can use it to test the other case of |
|
2148 high-valued characters. */ |
|
2149 |
|
2150 case OP_CHARNC: |
|
2151 #ifdef SUPPORT_UTF8 |
|
2152 if (utf8 && item > 127) { GETCHAR(item, utf8_char); } |
|
2153 #endif |
|
2154 if (item == next) return FALSE; |
|
2155 #ifdef SUPPORT_UTF8 |
|
2156 if (utf8) |
|
2157 { |
|
2158 unsigned int othercase; |
|
2159 if (next < 128) othercase = cd->fcc[next]; else |
|
2160 #ifdef SUPPORT_UCP |
|
2161 othercase = UCD_OTHERCASE((unsigned int)next); |
|
2162 #else |
|
2163 othercase = NOTACHAR; |
|
2164 #endif |
|
2165 return (unsigned int)item != othercase; |
|
2166 } |
|
2167 else |
|
2168 #endif /* SUPPORT_UTF8 */ |
|
2169 return (item != cd->fcc[next]); /* Non-UTF-8 mode */ |
|
2170 |
|
2171 /* For OP_NOT, "item" must be a single-byte character. */ |
|
2172 |
|
2173 case OP_NOT: |
|
2174 if (item == next) return TRUE; |
|
2175 if ((options & PCRE_CASELESS) == 0) return FALSE; |
|
2176 #ifdef SUPPORT_UTF8 |
|
2177 if (utf8) |
|
2178 { |
|
2179 unsigned int othercase; |
|
2180 if (next < 128) othercase = cd->fcc[next]; else |
|
2181 #ifdef SUPPORT_UCP |
|
2182 othercase = UCD_OTHERCASE(next); |
|
2183 #else |
|
2184 othercase = NOTACHAR; |
|
2185 #endif |
|
2186 return (unsigned int)item == othercase; |
|
2187 } |
|
2188 else |
|
2189 #endif /* SUPPORT_UTF8 */ |
|
2190 return (item == cd->fcc[next]); /* Non-UTF-8 mode */ |
|
2191 |
|
2192 case OP_DIGIT: |
|
2193 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0; |
|
2194 |
|
2195 case OP_NOT_DIGIT: |
|
2196 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0; |
|
2197 |
|
2198 case OP_WHITESPACE: |
|
2199 return next > 127 || (cd->ctypes[next] & ctype_space) == 0; |
|
2200 |
|
2201 case OP_NOT_WHITESPACE: |
|
2202 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0; |
|
2203 |
|
2204 case OP_WORDCHAR: |
|
2205 return next > 127 || (cd->ctypes[next] & ctype_word) == 0; |
|
2206 |
|
2207 case OP_NOT_WORDCHAR: |
|
2208 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0; |
|
2209 |
|
2210 case OP_HSPACE: |
|
2211 case OP_NOT_HSPACE: |
|
2212 switch(next) |
|
2213 { |
|
2214 case 0x09: |
|
2215 case 0x20: |
|
2216 case 0xa0: |
|
2217 case 0x1680: |
|
2218 case 0x180e: |
|
2219 case 0x2000: |
|
2220 case 0x2001: |
|
2221 case 0x2002: |
|
2222 case 0x2003: |
|
2223 case 0x2004: |
|
2224 case 0x2005: |
|
2225 case 0x2006: |
|
2226 case 0x2007: |
|
2227 case 0x2008: |
|
2228 case 0x2009: |
|
2229 case 0x200A: |
|
2230 case 0x202f: |
|
2231 case 0x205f: |
|
2232 case 0x3000: |
|
2233 return op_code != OP_HSPACE; |
|
2234 default: |
|
2235 return op_code == OP_HSPACE; |
|
2236 } |
|
2237 |
|
2238 case OP_VSPACE: |
|
2239 case OP_NOT_VSPACE: |
|
2240 switch(next) |
|
2241 { |
|
2242 case 0x0a: |
|
2243 case 0x0b: |
|
2244 case 0x0c: |
|
2245 case 0x0d: |
|
2246 case 0x85: |
|
2247 case 0x2028: |
|
2248 case 0x2029: |
|
2249 return op_code != OP_VSPACE; |
|
2250 default: |
|
2251 return op_code == OP_VSPACE; |
|
2252 } |
|
2253 |
|
2254 default: |
|
2255 return FALSE; |
|
2256 } |
|
2257 |
|
2258 |
|
2259 /* Handle the case when the next item is \d, \s, etc. */ |
|
2260 |
|
2261 switch(op_code) |
|
2262 { |
|
2263 case OP_CHAR: |
|
2264 case OP_CHARNC: |
|
2265 #ifdef SUPPORT_UTF8 |
|
2266 if (utf8 && item > 127) { GETCHAR(item, utf8_char); } |
|
2267 #endif |
|
2268 switch(-next) |
|
2269 { |
|
2270 case ESC_d: |
|
2271 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0; |
|
2272 |
|
2273 case ESC_D: |
|
2274 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0; |
|
2275 |
|
2276 case ESC_s: |
|
2277 return item > 127 || (cd->ctypes[item] & ctype_space) == 0; |
|
2278 |
|
2279 case ESC_S: |
|
2280 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0; |
|
2281 |
|
2282 case ESC_w: |
|
2283 return item > 127 || (cd->ctypes[item] & ctype_word) == 0; |
|
2284 |
|
2285 case ESC_W: |
|
2286 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0; |
|
2287 |
|
2288 case ESC_h: |
|
2289 case ESC_H: |
|
2290 switch(item) |
|
2291 { |
|
2292 case 0x09: |
|
2293 case 0x20: |
|
2294 case 0xa0: |
|
2295 case 0x1680: |
|
2296 case 0x180e: |
|
2297 case 0x2000: |
|
2298 case 0x2001: |
|
2299 case 0x2002: |
|
2300 case 0x2003: |
|
2301 case 0x2004: |
|
2302 case 0x2005: |
|
2303 case 0x2006: |
|
2304 case 0x2007: |
|
2305 case 0x2008: |
|
2306 case 0x2009: |
|
2307 case 0x200A: |
|
2308 case 0x202f: |
|
2309 case 0x205f: |
|
2310 case 0x3000: |
|
2311 return -next != ESC_h; |
|
2312 default: |
|
2313 return -next == ESC_h; |
|
2314 } |
|
2315 |
|
2316 case ESC_v: |
|
2317 case ESC_V: |
|
2318 switch(item) |
|
2319 { |
|
2320 case 0x0a: |
|
2321 case 0x0b: |
|
2322 case 0x0c: |
|
2323 case 0x0d: |
|
2324 case 0x85: |
|
2325 case 0x2028: |
|
2326 case 0x2029: |
|
2327 return -next != ESC_v; |
|
2328 default: |
|
2329 return -next == ESC_v; |
|
2330 } |
|
2331 |
|
2332 default: |
|
2333 return FALSE; |
|
2334 } |
|
2335 |
|
2336 case OP_DIGIT: |
|
2337 return next == -ESC_D || next == -ESC_s || next == -ESC_W || |
|
2338 next == -ESC_h || next == -ESC_v; |
|
2339 |
|
2340 case OP_NOT_DIGIT: |
|
2341 return next == -ESC_d; |
|
2342 |
|
2343 case OP_WHITESPACE: |
|
2344 return next == -ESC_S || next == -ESC_d || next == -ESC_w; |
|
2345 |
|
2346 case OP_NOT_WHITESPACE: |
|
2347 return next == -ESC_s || next == -ESC_h || next == -ESC_v; |
|
2348 |
|
2349 case OP_HSPACE: |
|
2350 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w; |
|
2351 |
|
2352 case OP_NOT_HSPACE: |
|
2353 return next == -ESC_h; |
|
2354 |
|
2355 /* Can't have \S in here because VT matches \S (Perl anomaly) */ |
|
2356 case OP_VSPACE: |
|
2357 return next == -ESC_V || next == -ESC_d || next == -ESC_w; |
|
2358 |
|
2359 case OP_NOT_VSPACE: |
|
2360 return next == -ESC_v; |
|
2361 |
|
2362 case OP_WORDCHAR: |
|
2363 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v; |
|
2364 |
|
2365 case OP_NOT_WORDCHAR: |
|
2366 return next == -ESC_w || next == -ESC_d; |
|
2367 |
|
2368 default: |
|
2369 return FALSE; |
|
2370 } |
|
2371 |
|
2372 /* Control does not reach here */ |
|
2373 } |
|
2374 |
|
2375 |
|
2376 |
|
2377 /************************************************* |
|
2378 * Compile one branch * |
|
2379 *************************************************/ |
|
2380 |
|
2381 /* Scan the pattern, compiling it into a vector. If the options are |
|
2382 changed during the branch, the pointer is used to change the external options |
|
2383 bits. This function is used during the pre-compile phase when we are trying |
|
2384 to find out the amount of memory needed, as well as during the real compile |
|
2385 phase. The value of lengthptr distinguishes the two phases. |
|
2386 |
|
2387 Arguments: |
|
2388 optionsptr pointer to the option bits |
|
2389 codeptr points to the pointer to the current code point |
|
2390 ptrptr points to the current pattern pointer |
|
2391 errorcodeptr points to error code variable |
|
2392 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE) |
|
2393 reqbyteptr set to the last literal character required, else < 0 |
|
2394 bcptr points to current branch chain |
|
2395 cd contains pointers to tables etc. |
|
2396 lengthptr NULL during the real compile phase |
|
2397 points to length accumulator during pre-compile phase |
|
2398 |
|
2399 Returns: TRUE on success |
|
2400 FALSE, with *errorcodeptr set non-zero on error |
|
2401 */ |
|
2402 |
|
2403 static BOOL |
|
2404 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr, |
|
2405 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, |
|
2406 compile_data *cd, int *lengthptr) |
|
2407 { |
|
2408 int repeat_type, op_type; |
|
2409 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */ |
|
2410 int bravalue = 0; |
|
2411 int greedy_default, greedy_non_default; |
|
2412 int firstbyte, reqbyte; |
|
2413 int zeroreqbyte, zerofirstbyte; |
|
2414 int req_caseopt, reqvary, tempreqvary; |
|
2415 int options = *optionsptr; |
|
2416 int after_manual_callout = 0; |
|
2417 int length_prevgroup = 0; |
|
2418 register int c; |
|
2419 register uschar *code = *codeptr; |
|
2420 uschar *last_code = code; |
|
2421 uschar *orig_code = code; |
|
2422 uschar *tempcode; |
|
2423 BOOL inescq = FALSE; |
|
2424 BOOL groupsetfirstbyte = FALSE; |
|
2425 const uschar *ptr = *ptrptr; |
|
2426 const uschar *tempptr; |
|
2427 uschar *previous = NULL; |
|
2428 uschar *previous_callout = NULL; |
|
2429 uschar *save_hwm = NULL; |
|
2430 uschar classbits[32]; |
|
2431 |
|
2432 #ifdef SUPPORT_UTF8 |
|
2433 BOOL class_utf8; |
|
2434 BOOL utf8 = (options & PCRE_UTF8) != 0; |
|
2435 uschar *class_utf8data; |
|
2436 uschar *class_utf8data_base; |
|
2437 uschar utf8_char[6]; |
|
2438 #else |
|
2439 BOOL utf8 = FALSE; |
|
2440 uschar *utf8_char = NULL; |
|
2441 #endif |
|
2442 |
|
2443 #ifdef DEBUG |
|
2444 if (lengthptr != NULL) DPRINTF((">> start branch\n")); |
|
2445 #endif |
|
2446 |
|
2447 /* Set up the default and non-default settings for greediness */ |
|
2448 |
|
2449 greedy_default = ((options & PCRE_UNGREEDY) != 0); |
|
2450 greedy_non_default = greedy_default ^ 1; |
|
2451 |
|
2452 /* Initialize no first byte, no required byte. REQ_UNSET means "no char |
|
2453 matching encountered yet". It gets changed to REQ_NONE if we hit something that |
|
2454 matches a non-fixed char first char; reqbyte just remains unset if we never |
|
2455 find one. |
|
2456 |
|
2457 When we hit a repeat whose minimum is zero, we may have to adjust these values |
|
2458 to take the zero repeat into account. This is implemented by setting them to |
|
2459 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual |
|
2460 item types that can be repeated set these backoff variables appropriately. */ |
|
2461 |
|
2462 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET; |
|
2463 |
|
2464 /* The variable req_caseopt contains either the REQ_CASELESS value or zero, |
|
2465 according to the current setting of the caseless flag. REQ_CASELESS is a bit |
|
2466 value > 255. It is added into the firstbyte or reqbyte variables to record the |
|
2467 case status of the value. This is used only for ASCII characters. */ |
|
2468 |
|
2469 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; |
|
2470 |
|
2471 /* Switch on next character until the end of the branch */ |
|
2472 |
|
2473 for (;; ptr++) |
|
2474 { |
|
2475 BOOL negate_class; |
|
2476 BOOL should_flip_negation; |
|
2477 BOOL possessive_quantifier; |
|
2478 BOOL is_quantifier; |
|
2479 BOOL is_recurse; |
|
2480 BOOL reset_bracount; |
|
2481 int class_charcount; |
|
2482 int class_lastchar; |
|
2483 int newoptions; |
|
2484 int recno; |
|
2485 int refsign; |
|
2486 int skipbytes; |
|
2487 int subreqbyte; |
|
2488 int subfirstbyte; |
|
2489 int terminator; |
|
2490 int mclength; |
|
2491 uschar mcbuffer[8]; |
|
2492 |
|
2493 /* Get next byte in the pattern */ |
|
2494 |
|
2495 c = *ptr; |
|
2496 |
|
2497 /* If we are in the pre-compile phase, accumulate the length used for the |
|
2498 previous cycle of this loop. */ |
|
2499 |
|
2500 if (lengthptr != NULL) |
|
2501 { |
|
2502 #ifdef DEBUG |
|
2503 if (code > cd->hwm) cd->hwm = code; /* High water info */ |
|
2504 #endif |
|
2505 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */ |
|
2506 { |
|
2507 *errorcodeptr = ERR52; |
|
2508 goto FAILED; |
|
2509 } |
|
2510 |
|
2511 /* There is at least one situation where code goes backwards: this is the |
|
2512 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time, |
|
2513 the class is simply eliminated. However, it is created first, so we have to |
|
2514 allow memory for it. Therefore, don't ever reduce the length at this point. |
|
2515 */ |
|
2516 |
|
2517 if (code < last_code) code = last_code; |
|
2518 |
|
2519 /* Paranoid check for integer overflow */ |
|
2520 |
|
2521 if (OFLOW_MAX - *lengthptr < code - last_code) |
|
2522 { |
|
2523 *errorcodeptr = ERR20; |
|
2524 goto FAILED; |
|
2525 } |
|
2526 |
|
2527 *lengthptr += code - last_code; |
|
2528 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c)); |
|
2529 |
|
2530 /* If "previous" is set and it is not at the start of the work space, move |
|
2531 it back to there, in order to avoid filling up the work space. Otherwise, |
|
2532 if "previous" is NULL, reset the current code pointer to the start. */ |
|
2533 |
|
2534 if (previous != NULL) |
|
2535 { |
|
2536 if (previous > orig_code) |
|
2537 { |
|
2538 memmove(orig_code, previous, code - previous); |
|
2539 code -= previous - orig_code; |
|
2540 previous = orig_code; |
|
2541 } |
|
2542 } |
|
2543 else code = orig_code; |
|
2544 |
|
2545 /* Remember where this code item starts so we can pick up the length |
|
2546 next time round. */ |
|
2547 |
|
2548 last_code = code; |
|
2549 } |
|
2550 |
|
2551 /* In the real compile phase, just check the workspace used by the forward |
|
2552 reference list. */ |
|
2553 |
|
2554 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE) |
|
2555 { |
|
2556 *errorcodeptr = ERR52; |
|
2557 goto FAILED; |
|
2558 } |
|
2559 |
|
2560 /* If in \Q...\E, check for the end; if not, we have a literal */ |
|
2561 |
|
2562 if (inescq && c != 0) |
|
2563 { |
|
2564 if (c == '\\' && ptr[1] == 'E') |
|
2565 { |
|
2566 inescq = FALSE; |
|
2567 ptr++; |
|
2568 continue; |
|
2569 } |
|
2570 else |
|
2571 { |
|
2572 if (previous_callout != NULL) |
|
2573 { |
|
2574 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */ |
|
2575 complete_callout(previous_callout, ptr, cd); |
|
2576 previous_callout = NULL; |
|
2577 } |
|
2578 if ((options & PCRE_AUTO_CALLOUT) != 0) |
|
2579 { |
|
2580 previous_callout = code; |
|
2581 code = auto_callout(code, ptr, cd); |
|
2582 } |
|
2583 goto NORMAL_CHAR; |
|
2584 } |
|
2585 } |
|
2586 |
|
2587 /* Fill in length of a previous callout, except when the next thing is |
|
2588 a quantifier. */ |
|
2589 |
|
2590 is_quantifier = c == '*' || c == '+' || c == '?' || |
|
2591 (c == '{' && is_counted_repeat(ptr+1)); |
|
2592 |
|
2593 if (!is_quantifier && previous_callout != NULL && |
|
2594 after_manual_callout-- <= 0) |
|
2595 { |
|
2596 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */ |
|
2597 complete_callout(previous_callout, ptr, cd); |
|
2598 previous_callout = NULL; |
|
2599 } |
|
2600 |
|
2601 /* In extended mode, skip white space and comments */ |
|
2602 |
|
2603 if ((options & PCRE_EXTENDED) != 0) |
|
2604 { |
|
2605 if ((cd->ctypes[c] & ctype_space) != 0) continue; |
|
2606 if (c == '#') |
|
2607 { |
|
2608 while (*(++ptr) != 0) |
|
2609 { |
|
2610 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } |
|
2611 } |
|
2612 if (*ptr != 0) continue; |
|
2613 |
|
2614 /* Else fall through to handle end of string */ |
|
2615 c = 0; |
|
2616 } |
|
2617 } |
|
2618 |
|
2619 /* No auto callout for quantifiers. */ |
|
2620 |
|
2621 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier) |
|
2622 { |
|
2623 previous_callout = code; |
|
2624 code = auto_callout(code, ptr, cd); |
|
2625 } |
|
2626 |
|
2627 switch(c) |
|
2628 { |
|
2629 /* ===================================================================*/ |
|
2630 case 0: /* The branch terminates at string end */ |
|
2631 case '|': /* or | or ) */ |
|
2632 case ')': |
|
2633 *firstbyteptr = firstbyte; |
|
2634 *reqbyteptr = reqbyte; |
|
2635 *codeptr = code; |
|
2636 *ptrptr = ptr; |
|
2637 if (lengthptr != NULL) |
|
2638 { |
|
2639 if (OFLOW_MAX - *lengthptr < code - last_code) |
|
2640 { |
|
2641 *errorcodeptr = ERR20; |
|
2642 goto FAILED; |
|
2643 } |
|
2644 *lengthptr += code - last_code; /* To include callout length */ |
|
2645 DPRINTF((">> end branch\n")); |
|
2646 } |
|
2647 return TRUE; |
|
2648 |
|
2649 |
|
2650 /* ===================================================================*/ |
|
2651 /* Handle single-character metacharacters. In multiline mode, ^ disables |
|
2652 the setting of any following char as a first character. */ |
|
2653 |
|
2654 case '^': |
|
2655 if ((options & PCRE_MULTILINE) != 0) |
|
2656 { |
|
2657 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; |
|
2658 } |
|
2659 previous = NULL; |
|
2660 *code++ = OP_CIRC; |
|
2661 break; |
|
2662 |
|
2663 case '$': |
|
2664 previous = NULL; |
|
2665 *code++ = OP_DOLL; |
|
2666 break; |
|
2667 |
|
2668 /* There can never be a first char if '.' is first, whatever happens about |
|
2669 repeats. The value of reqbyte doesn't change either. */ |
|
2670 |
|
2671 case '.': |
|
2672 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; |
|
2673 zerofirstbyte = firstbyte; |
|
2674 zeroreqbyte = reqbyte; |
|
2675 previous = code; |
|
2676 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY; |
|
2677 break; |
|
2678 |
|
2679 |
|
2680 /* ===================================================================*/ |
|
2681 /* Character classes. If the included characters are all < 256, we build a |
|
2682 32-byte bitmap of the permitted characters, except in the special case |
|
2683 where there is only one such character. For negated classes, we build the |
|
2684 map as usual, then invert it at the end. However, we use a different opcode |
|
2685 so that data characters > 255 can be handled correctly. |
|
2686 |
|
2687 If the class contains characters outside the 0-255 range, a different |
|
2688 opcode is compiled. It may optionally have a bit map for characters < 256, |
|
2689 but those above are explicitly listed afterwards. A flag byte tells |
|
2690 whether the bitmap is present, and whether this is a negated class or not. |
|
2691 |
|
2692 In JavaScript compatibility mode, an isolated ']' causes an error. In |
|
2693 default (Perl) mode, it is treated as a data character. */ |
|
2694 |
|
2695 case ']': |
|
2696 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) |
|
2697 { |
|
2698 *errorcodeptr = ERR64; |
|
2699 goto FAILED; |
|
2700 } |
|
2701 goto NORMAL_CHAR; |
|
2702 |
|
2703 case '[': |
|
2704 previous = code; |
|
2705 |
|
2706 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if |
|
2707 they are encountered at the top level, so we'll do that too. */ |
|
2708 |
|
2709 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && |
|
2710 check_posix_syntax(ptr, &tempptr)) |
|
2711 { |
|
2712 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31; |
|
2713 goto FAILED; |
|
2714 } |
|
2715 |
|
2716 /* If the first character is '^', set the negation flag and skip it. Also, |
|
2717 if the first few characters (either before or after ^) are \Q\E or \E we |
|
2718 skip them too. This makes for compatibility with Perl. */ |
|
2719 |
|
2720 negate_class = FALSE; |
|
2721 for (;;) |
|
2722 { |
|
2723 c = *(++ptr); |
|
2724 if (c == '\\') |
|
2725 { |
|
2726 if (ptr[1] == 'E') ptr++; |
|
2727 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3; |
|
2728 else break; |
|
2729 } |
|
2730 else if (!negate_class && c == '^') |
|
2731 negate_class = TRUE; |
|
2732 else break; |
|
2733 } |
|
2734 |
|
2735 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise, |
|
2736 an initial ']' is taken as a data character -- the code below handles |
|
2737 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas |
|
2738 [^] must match any character, so generate OP_ALLANY. */ |
|
2739 |
|
2740 if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) |
|
2741 { |
|
2742 *code++ = negate_class? OP_ALLANY : OP_FAIL; |
|
2743 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; |
|
2744 zerofirstbyte = firstbyte; |
|
2745 break; |
|
2746 } |
|
2747 |
|
2748 /* If a class contains a negative special such as \S, we need to flip the |
|
2749 negation flag at the end, so that support for characters > 255 works |
|
2750 correctly (they are all included in the class). */ |
|
2751 |
|
2752 should_flip_negation = FALSE; |
|
2753 |
|
2754 /* Keep a count of chars with values < 256 so that we can optimize the case |
|
2755 of just a single character (as long as it's < 256). However, for higher
|
2756 valued UTF-8 characters, we don't yet do any optimization. */ |
|
2757 |
|
2758 class_charcount = 0; |
|
2759 class_lastchar = -1; |
|
2760 |
|
2761 /* Initialize the 32-char bit map to all zeros. We build the map in a |
|
2762 temporary bit of memory, in case the class contains only 1 character (less |
|
2763 than 256), because in that case the compiled code doesn't use the bit map. |
|
2764 */ |
|
2765 |
|
2766 memset(classbits, 0, 32 * sizeof(uschar)); |
|
2767 |
|
2768 #ifdef SUPPORT_UTF8 |
|
2769 class_utf8 = FALSE; /* No chars >= 256 */ |
|
2770 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */ |
|
2771 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */ |
|
2772 #endif |
|
2773 |
|
2774 /* Process characters until ] is reached. By writing this as a "do" it |
|
2775 means that an initial ] is taken as a data character. At the start of the |
|
2776 loop, c contains the first byte of the character. */ |
|
2777 |
|
2778 if (c != 0) do |
|
2779 { |
|
2780 const uschar *oldptr; |
|
2781 |
|
2782 #ifdef SUPPORT_UTF8 |
|
2783 if (utf8 && c > 127) |
|
2784 { /* Braces are required because the */ |
|
2785 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ |
|
2786 } |
|
2787 |
|
2788 /* In the pre-compile phase, accumulate the length of any UTF-8 extra |
|
2789 data and reset the pointer. This is so that very large classes that |
|
2790 contain a zillion UTF-8 characters no longer overwrite the work space |
|
2791 (which is on the stack). */ |
|
2792 |
|
2793 if (lengthptr != NULL) |
|
2794 { |
|
2795 *lengthptr += class_utf8data - class_utf8data_base; |
|
2796 class_utf8data = class_utf8data_base; |
|
2797 } |
|
2798 |
|
2799 #endif |
|
2800 |
|
2801 /* Inside \Q...\E everything is literal except \E */ |
|
2802 |
|
2803 if (inescq) |
|
2804 { |
|
2805 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */ |
|
2806 { |
|
2807 inescq = FALSE; /* Reset literal state */ |
|
2808 ptr++; /* Skip the 'E' */ |
|
2809 continue; /* Carry on with next */ |
|
2810 } |
|
2811 goto CHECK_RANGE; /* Could be range if \E follows */ |
|
2812 } |
|
2813 |
|
2814 /* Handle POSIX class names. Perl allows a negation extension of the |
|
2815 form [:^name:]. A square bracket that doesn't match the syntax is |
|
2816 treated as a literal. We also recognize the POSIX constructions |
|
2817 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl |
|
2818 5.6 and 5.8 do. */ |
|
2819 |
|
2820 if (c == '[' && |
|
2821 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && |
|
2822 check_posix_syntax(ptr, &tempptr)) |
|
2823 { |
|
2824 BOOL local_negate = FALSE; |
|
2825 int posix_class, taboffset, tabopt; |
|
2826 register const uschar *cbits = cd->cbits; |
|
2827 uschar pbits[32]; |
|
2828 |
|
2829 if (ptr[1] != ':') |
|
2830 { |
|
2831 *errorcodeptr = ERR31; |
|
2832 goto FAILED; |
|
2833 } |
|
2834 |
|
2835 ptr += 2; |
|
2836 if (*ptr == '^') |
|
2837 { |
|
2838 local_negate = TRUE; |
|
2839 should_flip_negation = TRUE; /* Note negative special */ |
|
2840 ptr++; |
|
2841 } |
|
2842 |
|
2843 posix_class = check_posix_name(ptr, tempptr - ptr); |
|
2844 if (posix_class < 0) |
|
2845 { |
|
2846 *errorcodeptr = ERR30; |
|
2847 goto FAILED; |
|
2848 } |
|
2849 |
|
2850 /* If matching is caseless, upper and lower are converted to |
|
2851 alpha. This relies on the fact that the class table starts with |
|
2852 alpha, lower, upper as the first 3 entries. */ |
|
2853 |
|
2854 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2) |
|
2855 posix_class = 0; |
|
2856 |
|
2857 /* We build the bit map for the POSIX class in a chunk of local store |
|
2858 because we may be adding and subtracting from it, and we don't want to |
|
2859 subtract bits that may be in the main map already. At the end we or the |
|
2860 result into the bit map that is being built. */ |
|
2861 |
|
2862 posix_class *= 3; |
|
2863 |
|
2864 /* Copy in the first table (always present) */ |
|
2865 |
|
2866 memcpy(pbits, cbits + posix_class_maps[posix_class], |
|
2867 32 * sizeof(uschar)); |
|
2868 |
|
2869 /* If there is a second table, add or remove it as required. */ |
|
2870 |
|
2871 taboffset = posix_class_maps[posix_class + 1]; |
|
2872 tabopt = posix_class_maps[posix_class + 2]; |
|
2873 |
|
2874 if (taboffset >= 0) |
|
2875 { |
|
2876 if (tabopt >= 0) |
|
2877 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset]; |
|
2878 else |
|
2879 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset]; |
|
2880 } |
|
2881 |
|
2882 /* Now see if we need to remove any special characters. An option
|
2883 value of 1 removes vertical space and 2 removes underscore. */ |
|
2884 |
|
2885 if (tabopt < 0) tabopt = -tabopt; |
|
2886 if (tabopt == 1) pbits[1] &= ~0x3c; |
|
2887 else if (tabopt == 2) pbits[11] &= 0x7f; |
|
2888 |
|
2889 /* Add the POSIX table or its complement into the main table that is |
|
2890 being built and we are done. */ |
|
2891 |
|
2892 if (local_negate) |
|
2893 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c]; |
|
2894 else |
|
2895 for (c = 0; c < 32; c++) classbits[c] |= pbits[c]; |
|
2896 |
|
2897 ptr = tempptr + 1; |
|
2898 class_charcount = 10; /* Set > 1; assumes more than 1 per class */ |
|
2899 continue; /* End of POSIX syntax handling */ |
|
2900 } |
|
2901 |
|
2902 /* Backslash may introduce a single character, or it may introduce one |
|
2903 of the specials, which just set a flag. The sequence \b is a special |
|
2904 case. Inside a class (and only there) it is treated as backspace. |
|
2905 Elsewhere it marks a word boundary. Other escapes have preset maps ready |
|
2906 to 'or' into the one we are building. We assume they have more than one |
|
2907 character in them, so set class_charcount bigger than one. */ |
|
2908 |
|
2909 if (c == '\\') |
|
2910 { |
|
2911 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); |
|
2912 if (*errorcodeptr != 0) goto FAILED; |
|
2913 |
|
2914 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */ |
|
2915 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */ |
|
2916 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */ |
|
2917 else if (-c == ESC_Q) /* Handle start of quoted string */ |
|
2918 { |
|
2919 if (ptr[1] == '\\' && ptr[2] == 'E') |
|
2920 { |
|
2921 ptr += 2; /* avoid empty string */ |
|
2922 } |
|
2923 else inescq = TRUE; |
|
2924 continue; |
|
2925 } |
|
2926 else if (-c == ESC_E) continue; /* Ignore orphan \E */ |
|
2927 |
|
2928 if (c < 0) |
|
2929 { |
|
2930 register const uschar *cbits = cd->cbits; |
|
2931 class_charcount += 2; /* Greater than 1 is what matters */ |
|
2932 |
|
2933 /* Save time by not doing this in the pre-compile phase. */ |
|
2934 |
|
2935 if (lengthptr == NULL) switch (-c) |
|
2936 { |
|
2937 case ESC_d: |
|
2938 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit]; |
|
2939 continue; |
|
2940 |
|
2941 case ESC_D: |
|
2942 should_flip_negation = TRUE; |
|
2943 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit]; |
|
2944 continue; |
|
2945 |
|
2946 case ESC_w: |
|
2947 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word]; |
|
2948 continue; |
|
2949 |
|
2950 case ESC_W: |
|
2951 should_flip_negation = TRUE; |
|
2952 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word]; |
|
2953 continue; |
|
2954 |
|
2955 case ESC_s: |
|
2956 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space]; |
|
2957 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */ |
|
2958 continue; |
|
2959 |
|
2960 case ESC_S: |
|
2961 should_flip_negation = TRUE; |
|
2962 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space]; |
|
2963 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */ |
|
2964 continue; |
|
2965 |
|
2966 default: /* Not recognized; fall through */ |
|
2967 break; /* Need "default" setting to stop compiler warning. */ |
|
2968 } |
|
2969 |
|
2970 /* In the pre-compile phase, just do the recognition. */ |
|
2971 |
|
2972 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w || |
|
2973 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue; |
|
2974 |
|
2975 /* We need to deal with \H, \h, \V, and \v in both phases because |
|
2976 they use extra memory. */ |
|
2977 |
|
2978 if (-c == ESC_h) |
|
2979 { |
|
2980 SETBIT(classbits, 0x09); /* HT */
|
2981 SETBIT(classbits, 0x20); /* SPACE */ |
|
2982 SETBIT(classbits, 0xa0); /* NBSP */
|
2983 #ifdef SUPPORT_UTF8 |
|
2984 if (utf8) |
|
2985 { |
|
2986 class_utf8 = TRUE; |
|
2987 *class_utf8data++ = XCL_SINGLE; |
|
2988 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data); |
|
2989 *class_utf8data++ = XCL_SINGLE; |
|
2990 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data); |
|
2991 *class_utf8data++ = XCL_RANGE; |
|
2992 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data); |
|
2993 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data); |
|
2994 *class_utf8data++ = XCL_SINGLE; |
|
2995 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data); |
|
2996 *class_utf8data++ = XCL_SINGLE; |
|
2997 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data); |
|
2998 *class_utf8data++ = XCL_SINGLE; |
|
2999 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data); |
|
3000 } |
|
3001 #endif |
|
3002 continue; |
|
3003 } |
|
3004 |
|
3005 if (-c == ESC_H) |
|
3006 { |
|
3007 for (c = 0; c < 32; c++) |
|
3008 { |
|
3009 int x = 0xff; |
|
3010 switch (c) |
|
3011 { |
|
3012 case 0x09/8: x ^= 1 << (0x09%8); break; |
|
3013 case 0x20/8: x ^= 1 << (0x20%8); break; |
|
3014 case 0xa0/8: x ^= 1 << (0xa0%8); break; |
|
3015 default: break; |
|
3016 } |
|
3017 classbits[c] |= x; |
|
3018 } |
|
3019 |
|
3020 #ifdef SUPPORT_UTF8 |
|
3021 if (utf8) |
|
3022 { |
|
3023 class_utf8 = TRUE; |
|
3024 *class_utf8data++ = XCL_RANGE; |
|
3025 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data); |
|
3026 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data); |
|
3027 *class_utf8data++ = XCL_RANGE; |
|
3028 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data); |
|
3029 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data); |
|
3030 *class_utf8data++ = XCL_RANGE; |
|
3031 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data); |
|
3032 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data); |
|
3033 *class_utf8data++ = XCL_RANGE; |
|
3034 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data); |
|
3035 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data); |
|
3036 *class_utf8data++ = XCL_RANGE; |
|
3037 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data); |
|
3038 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data); |
|
3039 *class_utf8data++ = XCL_RANGE; |
|
3040 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data); |
|
3041 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data); |
|
3042 *class_utf8data++ = XCL_RANGE; |
|
3043 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data); |
|
3044 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data); |
|
3045 } |
|
3046 #endif |
|
3047 continue; |
|
3048 } |
|
3049 |
|
3050 if (-c == ESC_v) |
|
3051 { |
|
3052 SETBIT(classbits, 0x0a); /* LF */ |
|
3053 SETBIT(classbits, 0x0b); /* VT */ |
|
3054 SETBIT(classbits, 0x0c); /* FF */ |
|
3055 SETBIT(classbits, 0x0d); /* CR */ |
|
3056 SETBIT(classbits, 0x85); /* NEL */ |
|
3057 #ifdef SUPPORT_UTF8 |
|
3058 if (utf8) |
|
3059 { |
|
3060 class_utf8 = TRUE; |
|
3061 *class_utf8data++ = XCL_RANGE; |
|
3062 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data); |
|
3063 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data); |
|
3064 } |
|
3065 #endif |
|
3066 continue; |
|
3067 } |
|
3068 |
|
3069 if (-c == ESC_V) |
|
3070 { |
|
3071 for (c = 0; c < 32; c++) |
|
3072 { |
|
3073 int x = 0xff; |
|
3074 switch (c) |
|
3075 { |
|
3076 case 0x0a/8: x ^= 1 << (0x0a%8); |
|
3077 x ^= 1 << (0x0b%8); |
|
3078 x ^= 1 << (0x0c%8); |
|
3079 x ^= 1 << (0x0d%8); |
|
3080 break; |
|
3081 case 0x85/8: x ^= 1 << (0x85%8); break; |
|
3082 default: break; |
|
3083 } |
|
3084 classbits[c] |= x; |
|
3085 } |
|
3086 |
|
3087 #ifdef SUPPORT_UTF8 |
|
3088 if (utf8) |
|
3089 { |
|
3090 class_utf8 = TRUE; |
|
3091 *class_utf8data++ = XCL_RANGE; |
|
3092 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data); |
|
3093 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data); |
|
3094 *class_utf8data++ = XCL_RANGE; |
|
3095 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data); |
|
3096 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data); |
|
3097 } |
|
3098 #endif |
|
3099 continue; |
|
3100 } |
|
3101 |
|
3102 /* We need to deal with \P and \p in both phases. */ |
|
3103 |
|
3104 #ifdef SUPPORT_UCP |
|
3105 if (-c == ESC_p || -c == ESC_P) |
|
3106 { |
|
3107 BOOL negated; |
|
3108 int pdata; |
|
3109 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); |
|
3110 if (ptype < 0) goto FAILED; |
|
3111 class_utf8 = TRUE; |
|
3112 *class_utf8data++ = ((-c == ESC_p) != negated)? |
|
3113 XCL_PROP : XCL_NOTPROP; |
|
3114 *class_utf8data++ = ptype; |
|
3115 *class_utf8data++ = pdata; |
|
3116 class_charcount -= 2; /* Not a < 256 character */ |
|
3117 continue; |
|
3118 } |
|
3119 #endif |
|
3120 /* Unrecognized escapes are faulted if PCRE is running in its |
|
3121 strict mode. By default, for compatibility with Perl, they are |
|
3122 treated as literals. */ |
|
3123 |
|
3124 if ((options & PCRE_EXTRA) != 0) |
|
3125 { |
|
3126 *errorcodeptr = ERR7; |
|
3127 goto FAILED; |
|
3128 } |
|
3129 |
|
3130 class_charcount -= 2; /* Undo the default count from above */ |
|
3131 c = *ptr; /* Get the final character and fall through */ |
|
3132 } |
|
3133 |
|
3134 /* Fall through if we have a single character (c >= 0). This may be |
|
3135 greater than 256 in UTF-8 mode. */ |
|
3136 |
|
3137 } /* End of backslash handling */ |
|
3138 |
|
3139 /* A single character may be followed by '-' to form a range. However, |
|
3140 Perl does not permit ']' to be the end of the range. A '-' character |
|
3141 at the end is treated as a literal. Perl ignores orphaned \E sequences |
|
3142 entirely. The code for handling \Q and \E is messy. */ |
|
3143 |
|
3144 CHECK_RANGE: |
|
3145 while (ptr[1] == '\\' && ptr[2] == 'E') |
|
3146 { |
|
3147 inescq = FALSE; |
|
3148 ptr += 2; |
|
3149 } |
|
3150 |
|
3151 oldptr = ptr; |
|
3152 |
|
3153 /* Remember \r or \n */ |
|
3154 |
|
3155 if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF; |
|
3156 |
|
3157 /* Check for range */ |
|
3158 |
|
3159 if (!inescq && ptr[1] == '-') |
|
3160 { |
|
3161 int d; |
|
3162 ptr += 2; |
|
3163 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2; |
|
3164 |
|
3165 /* If we hit \Q (not followed by \E) at this point, go into escaped |
|
3166 mode. */ |
|
3167 |
|
3168 while (*ptr == '\\' && ptr[1] == 'Q') |
|
3169 { |
|
3170 ptr += 2; |
|
3171 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; } |
|
3172 inescq = TRUE; |
|
3173 break; |
|
3174 } |
|
3175 |
|
3176 if (*ptr == 0 || (!inescq && *ptr == ']')) |
|
3177 { |
|
3178 ptr = oldptr; |
|
3179 goto LONE_SINGLE_CHARACTER; |
|
3180 } |
|
3181 |
|
3182 #ifdef SUPPORT_UTF8 |
|
3183 if (utf8) |
|
3184 { /* Braces are required because the */ |
|
3185 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */ |
|
3186 } |
|
3187 else |
|
3188 #endif |
|
3189 d = *ptr; /* Not UTF-8 mode */ |
|
3190 |
|
3191 /* The second part of a range can be a single-character escape, but |
|
3192 not any of the other escapes. Perl 5.6 treats a hyphen as a literal |
|
3193 in such circumstances. */ |
|
3194 |
|
3195 if (!inescq && d == '\\') |
|
3196 { |
|
3197 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); |
|
3198 if (*errorcodeptr != 0) goto FAILED; |
|
3199 |
|
3200 /* \b is backspace; \X is literal X; \R is literal R; any other |
|
3201 special means the '-' was literal */ |
|
3202 |
|
3203 if (d < 0) |
|
3204 { |
|
3205 if (d == -ESC_b) d = '\b'; |
|
3206 else if (d == -ESC_X) d = 'X'; |
|
3207 else if (d == -ESC_R) d = 'R'; else |
|
3208 { |
|
3209 ptr = oldptr; |
|
3210 goto LONE_SINGLE_CHARACTER; /* A few lines below */ |
|
3211 } |
|
3212 } |
|
3213 } |
|
3214 |
|
3215 /* Check that the two values are in the correct order. Optimize |
|
3216 one-character ranges */ |
|
3217 |
|
3218 if (d < c) |
|
3219 { |
|
3220 *errorcodeptr = ERR8; |
|
3221 goto FAILED; |
|
3222 } |
|
3223 |
|
3224 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */ |
|
3225 |
|
3226 /* Remember \r or \n */ |
|
3227 |
|
3228 if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF; |
|
3229 |
|
3230 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless |
|
3231 matching, we have to use an XCLASS with extra data items. Caseless |
|
3232 matching for characters > 127 is available only if UCP support is |
|
3233 available. */ |
|
3234 |
|
3235 #ifdef SUPPORT_UTF8 |
|
3236 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127))) |
|
3237 { |
|
3238 class_utf8 = TRUE; |
|
3239 |
|
3240 /* With UCP support, we can find the other case equivalents of |
|
3241 the relevant characters. There may be several ranges. Optimize how |
|
3242 they fit with the basic range. */ |
|
3243 |
|
3244 #ifdef SUPPORT_UCP |
|
3245 if ((options & PCRE_CASELESS) != 0) |
|
3246 { |
|
3247 unsigned int occ, ocd; |
|
3248 unsigned int cc = c; |
|
3249 unsigned int origd = d; |
|
3250 while (get_othercase_range(&cc, origd, &occ, &ocd)) |
|
3251 { |
|
3252 if (occ >= (unsigned int)c && |
|
3253 ocd <= (unsigned int)d) |
|
3254 continue; /* Skip embedded ranges */ |
|
3255 |
|
3256 if (occ < (unsigned int)c && |
|
3257 ocd >= (unsigned int)c - 1) /* Extend the basic range */ |
|
3258 { /* if there is overlap, */ |
|
3259 c = occ; /* noting that if occ < c */ |
|
3260 continue; /* we can't have ocd > d */ |
|
3261 } /* because a subrange is */ |
|
3262 if (ocd > (unsigned int)d && |
|
3263 occ <= (unsigned int)d + 1) /* always shorter than */ |
|
3264 { /* the basic range. */ |
|
3265 d = ocd; |
|
3266 continue; |
|
3267 } |
|
3268 |
|
3269 if (occ == ocd) |
|
3270 { |
|
3271 *class_utf8data++ = XCL_SINGLE; |
|
3272 } |
|
3273 else |
|
3274 { |
|
3275 *class_utf8data++ = XCL_RANGE; |
|
3276 class_utf8data += _pcre_ord2utf8(occ, class_utf8data); |
|
3277 } |
|
3278 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data); |
|
3279 } |
|
3280 } |
|
3281 #endif /* SUPPORT_UCP */ |
|
3282 |
|
3283 /* Now record the original range, possibly modified for UCP caseless |
|
3284 overlapping ranges. */ |
|
3285 |
|
3286 *class_utf8data++ = XCL_RANGE; |
|
3287 class_utf8data += _pcre_ord2utf8(c, class_utf8data); |
|
3288 class_utf8data += _pcre_ord2utf8(d, class_utf8data); |
|
3289 |
|
3290 /* With UCP support, we are done. Without UCP support, there is no |
|
3291 caseless matching for UTF-8 characters > 127; we can use the bit map |
|
3292 for the smaller ones. */ |
|
3293 |
|
3294 #ifdef SUPPORT_UCP |
|
3295 continue; /* With next character in the class */ |
|
3296 #else |
|
3297 if ((options & PCRE_CASELESS) == 0 || c > 127) continue; |
|
3298 |
|
3299 /* Adjust upper limit and fall through to set up the map */ |
|
3300 |
|
3301 d = 127; |
|
3302 |
|
3303 #endif /* SUPPORT_UCP */ |
|
3304 } |
|
3305 #endif /* SUPPORT_UTF8 */ |
|
3306 |
|
3307 /* We use the bit map for all cases when not in UTF-8 mode; else |
|
3308 ranges that lie entirely within 0-127 when there is UCP support; else |
|
3309 for partial ranges without UCP support. */ |
|
3310 |
|
3311 class_charcount += d - c + 1; |
|
3312 class_lastchar = d; |
|
3313 |
|
3314 /* We can save a bit of time by skipping this in the pre-compile. */ |
|
3315 |
|
3316 if (lengthptr == NULL) for (; c <= d; c++) |
|
3317 { |
|
3318 classbits[c/8] |= (1 << (c&7)); |
|
3319 if ((options & PCRE_CASELESS) != 0) |
|
3320 { |
|
3321 int uc = cd->fcc[c]; /* flip case */ |
|
3322 classbits[uc/8] |= (1 << (uc&7)); |
|
3323 } |
|
3324 } |
|
3325 |
|
3326 continue; /* Go get the next char in the class */ |
|
3327 } |
|
3328 |
|
3329 /* Handle a lone single character - we can get here for a normal |
|
3330 non-escape char, or after \ that introduces a single character or for an |
|
3331 apparent range that isn't. */ |
|
3332 |
|
3333 LONE_SINGLE_CHARACTER: |
|
3334 |
|
3335 /* Handle a character that cannot go in the bit map */ |
|
3336 |
|
3337 #ifdef SUPPORT_UTF8 |
|
3338 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127))) |
|
3339 { |
|
3340 class_utf8 = TRUE; |
|
3341 *class_utf8data++ = XCL_SINGLE; |
|
3342 class_utf8data += _pcre_ord2utf8(c, class_utf8data); |
|
3343 |
|
3344 #ifdef SUPPORT_UCP |
|
3345 if ((options & PCRE_CASELESS) != 0) |
|
3346 { |
|
3347 unsigned int othercase; |
|
3348 if ((othercase = UCD_OTHERCASE(c)) != c) |
|
3349 { |
|
3350 *class_utf8data++ = XCL_SINGLE; |
|
3351 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data); |
|
3352 } |
|
3353 } |
|
3354 #endif /* SUPPORT_UCP */ |
|
3355 |
|
3356 } |
|
3357 else |
|
3358 #endif /* SUPPORT_UTF8 */ |
|
3359 |
|
3360 /* Handle a single-byte character */ |
|
3361 { |
|
3362 classbits[c/8] |= (1 << (c&7)); |
|
3363 if ((options & PCRE_CASELESS) != 0) |
|
3364 { |
|
3365 c = cd->fcc[c]; /* flip case */ |
|
3366 classbits[c/8] |= (1 << (c&7)); |
|
3367 } |
|
3368 class_charcount++; |
|
3369 class_lastchar = c; |
|
3370 } |
|
3371 } |
|
3372 |
|
3373 /* Loop until ']' reached. This "while" is the end of the "do" above. */ |
|
3374 |
|
3375 while ((c = *(++ptr)) != 0 && (c != ']' || inescq)); |
|
3376 |
|
3377 if (c == 0) /* Missing terminating ']' */ |
|
3378 { |
|
3379 *errorcodeptr = ERR6; |
|
3380 goto FAILED; |
|
3381 } |
|
3382 |
|
3383 |
|
3384 /* This code has been disabled because it would mean that \s counts as |
|
3385 an explicit \r or \n reference, and that's not really what is wanted. Now |
|
3386 we set the flag only if there is a literal "\r" or "\n" in the class. */ |
|
3387 |
|
3388 #if 0 |
|
3389 /* Remember whether \r or \n are in this class */ |
|
3390 |
|
3391 if (negate_class) |
|
3392 { |
|
3393 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF; |
|
3394 } |
|
3395 else |
|
3396 { |
|
3397 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF; |
|
3398 } |
|
3399 #endif |
|
3400 |
|
3401 |
|
3402 /* If class_charcount is 1, we saw precisely one character whose value is |
|
3403 less than 256. As long as there were no characters >= 128 and there was no |
|
3404 use of \p or \P, in other words, no use of any XCLASS features, we can |
|
3405 optimize. |
|
3406 |
|
3407 In UTF-8 mode, we can optimize the negative case only if there were no |
|
3408 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR |
|
3409 operate on single-bytes only. This is an historical hangover. Maybe one day |
|
3410 we can tidy these opcodes to handle multi-byte characters. |
|
3411 |
|
3412 The optimization throws away the bit map. We turn the item into a |
|
3413 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note |
|
3414 that OP_NOT does not support multibyte characters. In the positive case, it |
|
3415 can cause firstbyte to be set. Otherwise, there can be no first char if |
|
3416 this item is first, whatever repeat count may follow. In the case of |
|
3417 reqbyte, save the previous value for reinstating. */ |
|
3418 |
|
3419 #ifdef SUPPORT_UTF8 |
|
3420 if (class_charcount == 1 && !class_utf8 && |
|
3421 (!utf8 || !negate_class || class_lastchar < 128)) |
|
3422 #else |
|
3423 if (class_charcount == 1) |
|
3424 #endif |
|
3425 { |
|
3426 zeroreqbyte = reqbyte; |
|
3427 |
|
3428 /* The OP_NOT opcode works on one-byte characters only. */ |
|
3429 |
|
3430 if (negate_class) |
|
3431 { |
|
3432 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; |
|
3433 zerofirstbyte = firstbyte; |
|
3434 *code++ = OP_NOT; |
|
3435 *code++ = class_lastchar; |
|
3436 break; |
|
3437 } |
|
3438 |
|
3439 /* For a single, positive character, get the value into mcbuffer, and |
|
3440 then we can handle this with the normal one-character code. */ |
|
3441 |
|
3442 #ifdef SUPPORT_UTF8 |
|
3443 if (utf8 && class_lastchar > 127) |
|
3444 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer); |
|
3445 else |
|
3446 #endif |
|
3447 { |
|
3448 mcbuffer[0] = class_lastchar; |
|
3449 mclength = 1; |
|
3450 } |
|
3451 goto ONE_CHAR; |
|
3452 } /* End of 1-char optimization */ |
|
3453 |
|
3454 /* The general case - not the one-char optimization. If this is the first |
|
3455 thing in the branch, there can be no first char setting, whatever the |
|
3456 repeat count. Any reqbyte setting must remain unchanged after any kind of |
|
3457 repeat. */ |
|
3458 |
|
3459 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; |
|
3460 zerofirstbyte = firstbyte; |
|
3461 zeroreqbyte = reqbyte; |
|
3462 |
|
3463 /* If there are characters with values > 255, we have to compile an |
|
3464 extended class, with its own opcode, unless there was a negated special |
|
3465 such as \S in the class, because in that case all characters > 255 are in |
|
3466 the class, so any that were explicitly given as well can be ignored. If |
|
3467 (when there are explicit characters > 255 that must be listed) there are no |
|
3468 characters < 256, we can omit the bitmap in the actual compiled code. */ |
|
3469 |
|
3470 #ifdef SUPPORT_UTF8 |
|
3471 if (class_utf8 && !should_flip_negation) |
|
3472 { |
|
3473 *class_utf8data++ = XCL_END; /* Marks the end of extra data */ |
|
3474 *code++ = OP_XCLASS; |
|
3475 code += LINK_SIZE; |
|
3476 *code = negate_class? XCL_NOT : 0; |
|
3477 |
|
3478 /* If the map is required, move up the extra data to make room for it; |
|
3479 otherwise just move the code pointer to the end of the extra data. */ |
|
3480 |
|
3481 if (class_charcount > 0) |
|
3482 { |
|
3483 *code++ |= XCL_MAP; |
|
3484 memmove(code + 32, code, class_utf8data - code); |
|
3485 memcpy(code, classbits, 32); |
|
3486 code = class_utf8data + 32; |
|
3487 } |
|
3488 else code = class_utf8data; |
|
3489 |
|
3490 /* Now fill in the complete length of the item */ |
|
3491 |
|
3492 PUT(previous, 1, code - previous); |
|
3493 break; /* End of class handling */ |
|
3494 } |
|
3495 #endif |
|
3496 |
|
3497 /* If there are no characters > 255, set the opcode to OP_CLASS or |
|
3498 OP_NCLASS, depending on whether the whole class was negated and whether |
|
3499 there were negative specials such as \S in the class. Then copy the 32-byte |
|
3500 map into the code vector, negating it if necessary. */ |
|
3501 |
|
3502 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS; |
|
3503 if (negate_class) |
|
3504 { |
|
3505 if (lengthptr == NULL) /* Save time in the pre-compile phase */ |
|
3506 for (c = 0; c < 32; c++) code[c] = ~classbits[c]; |
|
3507 } |
|
3508 else |
|
3509 { |
|
3510 memcpy(code, classbits, 32); |
|
3511 } |
|
3512 code += 32; |
|
3513 break; |
|
3514 |
|
3515 |
|
3516 /* ===================================================================*/ |
|
3517 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this |
|
3518 has been tested above. */ |
|
3519 |
|
3520 case '{': |
|
3521 if (!is_quantifier) goto NORMAL_CHAR; |
|
3522 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr); |
|
3523 if (*errorcodeptr != 0) goto FAILED; |
|
3524 goto REPEAT; |
|
3525 |
|
3526 case '*': |
|
3527 repeat_min = 0; |
|
3528 repeat_max = -1; |
|
3529 goto REPEAT; |
|
3530 |
|
3531 case '+': |
|
3532 repeat_min = 1; |
|
3533 repeat_max = -1; |
|
3534 goto REPEAT; |
|
3535 |
|
3536 case '?': |
|
3537 repeat_min = 0; |
|
3538 repeat_max = 1; |
|
3539 |
|
3540 REPEAT: |
|
3541 if (previous == NULL) |
|
3542 { |
|
3543 *errorcodeptr = ERR9; |
|
3544 goto FAILED; |
|
3545 } |
|
3546 |
|
3547 if (repeat_min == 0) |
|
3548 { |
|
3549 firstbyte = zerofirstbyte; /* Adjust for zero repeat */ |
|
3550 reqbyte = zeroreqbyte; /* Ditto */ |
|
3551 } |
|
3552 |
|
3553 /* Remember whether this is a variable length repeat */ |
|
3554 |
|
3555 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY; |
|
3556 |
|
3557 op_type = 0; /* Default single-char op codes */ |
|
3558 possessive_quantifier = FALSE; /* Default not possessive quantifier */ |
|
3559 |
|
3560 /* Save start of previous item, in case we have to move it up to make space |
|
3561 for an inserted OP_ONCE for the additional '+' extension. */ |
|
3562 |
|
3563 tempcode = previous; |
|
3564 |
|
3565 /* If the next character is '+', we have a possessive quantifier. This |
|
3566 implies greediness, whatever the setting of the PCRE_UNGREEDY option. |
|
3567 If the next character is '?' this is a minimizing repeat, by default, |
|
3568 but if PCRE_UNGREEDY is set, it works the other way round. We change the |
|
3569 repeat type to the non-default. */ |
|
3570 |
|
3571 if (ptr[1] == '+') |
|
3572 { |
|
3573 repeat_type = 0; /* Force greedy */ |
|
3574 possessive_quantifier = TRUE; |
|
3575 ptr++; |
|
3576 } |
|
3577 else if (ptr[1] == '?') |
|
3578 { |
|
3579 repeat_type = greedy_non_default; |
|
3580 ptr++; |
|
3581 } |
|
3582 else repeat_type = greedy_default; |
|
3583 |
|
3584 /* If previous was a character match, abolish the item and generate a |
|
3585 repeat item instead. If a char item has a minimum of more than one, ensure
|
3586 that it is set in reqbyte - it might not be if a sequence such as x{3} is |
|
3587 the first thing in a branch because the x will have gone into firstbyte |
|
3588 instead. */ |
|
3589 |
|
3590 if (*previous == OP_CHAR || *previous == OP_CHARNC) |
|
3591 { |
|
3592 /* Deal with UTF-8 characters that take up more than one byte. It's |
|
3593 easier to write this out separately than try to macrify it. Use c to |
|
3594 hold the length of the character in bytes, plus 0x80 to flag that it's a |
|
3595 length rather than a small character. */ |
|
3596 |
|
3597 #ifdef SUPPORT_UTF8 |
|
3598 if (utf8 && (code[-1] & 0x80) != 0) |
|
3599 { |
|
3600 uschar *lastchar = code - 1; |
|
3601 while((*lastchar & 0xc0) == 0x80) lastchar--; |
|
3602 c = code - lastchar; /* Length of UTF-8 character */ |
|
3603 memcpy(utf8_char, lastchar, c); /* Save the char */ |
|
3604 c |= 0x80; /* Flag c as a length */ |
|
3605 } |
|
3606 else |
|
3607 #endif |
|
3608 |
|
3609 /* Handle the case of a single byte - either with no UTF8 support, or |
|
3610 with UTF-8 disabled, or for a UTF-8 character < 128. */ |
|
3611 |
|
3612 { |
|
3613 c = code[-1]; |
|
3614 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt; |
|
3615 } |
|
3616 |
|
3617 /* If the repetition is unlimited, it pays to see if the next thing on |
|
3618 the line is something that cannot possibly match this character. If so, |
|
3619 automatically possessifying this item gains some performance in the case |
|
3620 where the match fails. */ |
|
3621 |
|
3622 if (!possessive_quantifier && |
|
3623 repeat_max < 0 && |
|
3624 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1, |
|
3625 options, cd)) |
|
3626 { |
|
3627 repeat_type = 0; /* Force greedy */ |
|
3628 possessive_quantifier = TRUE; |
|
3629 } |
|
3630 |
|
3631 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ |
|
3632 } |
|
3633 |
|
3634 /* If previous was a single negated character ([^a] or similar), we use |
|
3635 one of the special opcodes, replacing it. The code is shared with single- |
|
3636 character repeats by setting op_type to add a suitable offset into
|
3637 repeat_type. We can also test for auto-possessification. OP_NOT is |
|
3638 currently used only for single-byte chars. */ |
|
3639 |
|
3640 else if (*previous == OP_NOT) |
|
3641 { |
|
3642 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */ |
|
3643 c = previous[1]; |
|
3644 if (!possessive_quantifier && |
|
3645 repeat_max < 0 && |
|
3646 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd)) |
|
3647 { |
|
3648 repeat_type = 0; /* Force greedy */ |
|
3649 possessive_quantifier = TRUE; |
|
3650 } |
|
3651 goto OUTPUT_SINGLE_REPEAT; |
|
3652 } |
|
3653 |
|
3654 /* If previous was a character type match (\d or similar), abolish it and |
|
3655 create a suitable repeat item. The code is shared with single-character |
|
3656 repeats by setting op_type to add a suitable offset into repeat_type. Note |
|
3657 that the Unicode property types will be present only when SUPPORT_UCP is |
|
3658 defined, but we don't wrap the little bits of code here because it just |
|
3659 makes it horribly messy. */ |
|
3660 |
|
3661 else if (*previous < OP_EODN) |
|
3662 { |
|
3663 uschar *oldcode; |
|
3664 int prop_type, prop_value; |
|
3665 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ |
|
3666 c = *previous; |
|
3667 |
|
3668 if (!possessive_quantifier && |
|
3669 repeat_max < 0 && |
|
3670 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd)) |
|
3671 { |
|
3672 repeat_type = 0; /* Force greedy */ |
|
3673 possessive_quantifier = TRUE; |
|
3674 } |
|
3675 |
|
3676 OUTPUT_SINGLE_REPEAT: |
|
3677 if (*previous == OP_PROP || *previous == OP_NOTPROP) |
|
3678 { |
|
3679 prop_type = previous[1]; |
|
3680 prop_value = previous[2]; |
|
3681 } |
|
3682 else prop_type = prop_value = -1; |
|
3683 |
|
3684 oldcode = code; |
|
3685 code = previous; /* Usually overwrite previous item */ |
|
3686 |
|
3687 /* If the maximum is zero then the minimum must also be zero; Perl allows |
|
3688 this case, so we do too - by simply omitting the item altogether. */ |
|
3689 |
|
3690 if (repeat_max == 0) goto END_REPEAT; |
|
3691 |
|
3692 /* All real repeats make it impossible to handle partial matching (maybe |
|
3693 one day we will be able to remove this restriction). */ |
|
3694 |
|
3695 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; |
|
3696 |
|
3697 /* Combine the op_type with the repeat_type */ |
|
3698 |
|
3699 repeat_type += op_type; |
|
3700 |
|
3701 /* A minimum of zero is handled either as the special case * or ?, or as |
|
3702 an UPTO, with the maximum given. */ |
|
3703 |
|
3704 if (repeat_min == 0) |
|
3705 { |
|
3706 if (repeat_max == -1) *code++ = OP_STAR + repeat_type; |
|
3707 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type; |
|
3708 else |
|
3709 { |
|
3710 *code++ = OP_UPTO + repeat_type; |
|
3711 PUT2INC(code, 0, repeat_max); |
|
3712 } |
|
3713 } |
|
3714 |
|
3715 /* A repeat minimum of 1 is optimized into some special cases. If the |
|
3716 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is |
|
3717 left in place and, if the maximum is greater than 1, we use OP_UPTO with |
|
3718 one less than the maximum. */ |
|
3719 |
|
3720 else if (repeat_min == 1) |
|
3721 { |
|
3722 if (repeat_max == -1) |
|
3723 *code++ = OP_PLUS + repeat_type; |
|
3724 else |
|
3725 { |
|
3726 code = oldcode; /* leave previous item in place */ |
|
3727 if (repeat_max == 1) goto END_REPEAT; |
|
3728 *code++ = OP_UPTO + repeat_type; |
|
3729 PUT2INC(code, 0, repeat_max - 1); |
|
3730 } |
|
3731 } |
|
3732 |
|
3733 /* The case {n,n} is just an EXACT, while the general case {n,m} is |
|
3734 handled as an EXACT followed by an UPTO. */ |
|
3735 |
|
3736 else |
|
3737 { |
|
3738 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */ |
|
3739 PUT2INC(code, 0, repeat_min); |
|
3740 |
|
3741 /* If the maximum is unlimited, insert an OP_STAR. Before doing so, |
|
3742 we have to insert the character for the previous code. For a repeated |
|
3743 Unicode property match, there are two extra bytes that define the |
|
3744 required property. In UTF-8 mode, long characters have their length in |
|
3745 c, with the 0x80 bit as a flag. */ |
|
3746 |
|
3747 if (repeat_max < 0) |
|
3748 { |
|
3749 #ifdef SUPPORT_UTF8 |
|
3750 if (utf8 && c >= 128) |
|
3751 { |
|
3752 memcpy(code, utf8_char, c & 7); |
|
3753 code += c & 7; |
|
3754 } |
|
3755 else |
|
3756 #endif |
|
3757 { |
|
3758 *code++ = c; |
|
3759 if (prop_type >= 0) |
|
3760 { |
|
3761 *code++ = prop_type; |
|
3762 *code++ = prop_value; |
|
3763 } |
|
3764 } |
|
3765 *code++ = OP_STAR + repeat_type; |
|
3766 } |
|
3767 |
|
3768 /* Else insert an UPTO if the max is greater than the min, again |
|
3769 preceded by the character, for the previously inserted code. If the |
|
3770 UPTO is just for 1 instance, we can use QUERY instead. */ |
|
3771 |
|
3772 else if (repeat_max != repeat_min) |
|
3773 { |
|
3774 #ifdef SUPPORT_UTF8 |
|
3775 if (utf8 && c >= 128) |
|
3776 { |
|
3777 memcpy(code, utf8_char, c & 7); |
|
3778 code += c & 7; |
|
3779 } |
|
3780 else |
|
3781 #endif |
|
3782 *code++ = c; |
|
3783 if (prop_type >= 0) |
|
3784 { |
|
3785 *code++ = prop_type; |
|
3786 *code++ = prop_value; |
|
3787 } |
|
3788 repeat_max -= repeat_min; |
|
3789 |
|
3790 if (repeat_max == 1) |
|
3791 { |
|
3792 *code++ = OP_QUERY + repeat_type; |
|
3793 } |
|
3794 else |
|
3795 { |
|
3796 *code++ = OP_UPTO + repeat_type; |
|
3797 PUT2INC(code, 0, repeat_max); |
|
3798 } |
|
3799 } |
|
3800 } |
|
3801 |
|
3802 /* The character or character type itself comes last in all cases. */ |
|
3803 |
|
3804 #ifdef SUPPORT_UTF8 |
|
3805 if (utf8 && c >= 128) |
|
3806 { |
|
3807 memcpy(code, utf8_char, c & 7); |
|
3808 code += c & 7; |
|
3809 } |
|
3810 else |
|
3811 #endif |
|
3812 *code++ = c; |
|
3813 |
|
3814 /* For a repeated Unicode property match, there are two extra bytes that |
|
3815 define the required property. */ |
|
3816 |
|
3817 #ifdef SUPPORT_UCP |
|
3818 if (prop_type >= 0) |
|
3819 { |
|
3820 *code++ = prop_type; |
|
3821 *code++ = prop_value; |
|
3822 } |
|
3823 #endif |
|
3824 } |
|
3825 |
|
3826 /* If previous was a character class or a back reference, we put the repeat |
|
3827 stuff after it, but just skip the item if the repeat was {0,0}. */ |
|
3828 |
|
3829 else if (*previous == OP_CLASS || |
|
3830 *previous == OP_NCLASS || |
|
3831 #ifdef SUPPORT_UTF8 |
|
3832 *previous == OP_XCLASS || |
|
3833 #endif |
|
3834 *previous == OP_REF) |
|
3835 { |
|
3836 if (repeat_max == 0) |
|
3837 { |
|
3838 code = previous; |
|
3839 goto END_REPEAT; |
|
3840 } |
|
3841 |
|
3842 /* All real repeats make it impossible to handle partial matching (maybe |
|
3843 one day we will be able to remove this restriction). */ |
|
3844 |
|
3845 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; |
|
3846 |
|
3847 if (repeat_min == 0 && repeat_max == -1) |
|
3848 *code++ = OP_CRSTAR + repeat_type; |
|
3849 else if (repeat_min == 1 && repeat_max == -1) |
|
3850 *code++ = OP_CRPLUS + repeat_type; |
|
3851 else if (repeat_min == 0 && repeat_max == 1) |
|
3852 *code++ = OP_CRQUERY + repeat_type; |
|
3853 else |
|
3854 { |
|
3855 *code++ = OP_CRRANGE + repeat_type; |
|
3856 PUT2INC(code, 0, repeat_min); |
|
3857 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */ |
|
3858 PUT2INC(code, 0, repeat_max); |
|
3859 } |
|
3860 } |
|
3861 |
|
3862 /* If previous was a bracket group, we may have to replicate it in certain |
|
3863 cases. */ |
|
3864 |
|
3865 else if (*previous == OP_BRA || *previous == OP_CBRA || |
|
3866 *previous == OP_ONCE || *previous == OP_COND) |
|
3867 { |
|
3868 register int i; |
|
3869 int ketoffset = 0; |
|
3870 int len = code - previous; |
|
3871 uschar *bralink = NULL; |
|
3872 |
|
3873 /* Repeating a DEFINE group is pointless */ |
|
3874 |
|
3875 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF) |
|
3876 { |
|
3877 *errorcodeptr = ERR55; |
|
3878 goto FAILED; |
|
3879 } |
|
3880 |
|
3881 /* If the maximum repeat count is unlimited, find the end of the bracket |
|
3882 by scanning through from the start, and compute the offset back to it |
|
3883 from the current code pointer. There may be an OP_OPT setting following |
|
3884 the final KET, so we can't find the end just by going back from the code |
|
3885 pointer. */ |
|
3886 |
|
3887 if (repeat_max == -1) |
|
3888 { |
|
3889 register uschar *ket = previous; |
|
3890 do ket += GET(ket, 1); while (*ket != OP_KET); |
|
3891 ketoffset = code - ket; |
|
3892 } |
|
3893 |
|
3894 /* The case of a zero minimum is special because of the need to stick |
|
3895 OP_BRAZERO in front of it, and because the group appears once in the |
|
3896 data, whereas in other cases it appears the minimum number of times. For |
|
3897 this reason, it is simplest to treat this case separately, as otherwise |
|
3898 the code gets far too messy. There are several special subcases when the |
|
3899 minimum is zero. */ |
|
3900 |
|
3901 if (repeat_min == 0) |
|
3902 { |
|
3903 /* If the maximum is also zero, we used to just omit the group from the |
|
3904 output altogether, like this: |
|
3905 |
|
3906 ** if (repeat_max == 0) |
|
3907 ** { |
|
3908 ** code = previous; |
|
3909 ** goto END_REPEAT; |
|
3910 ** } |
|
3911 |
|
3912 However, that fails when a group is referenced as a subroutine from |
|
3913 elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it |
|
3914 so that it is skipped on execution. As we don't have a list of which |
|
3915 groups are referenced, we cannot do this selectively. |
|
3916 |
|
3917 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO |
|
3918 and do no more at this point. However, we do need to adjust any |
|
3919 OP_RECURSE calls inside the group that refer to the group itself or any |
|
3920 internal or forward referenced group, because the offset is from the |
|
3921 start of the whole regex. Temporarily terminate the pattern while doing |
|
3922 this. */ |
|
3923 |
|
3924 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */ |
|
3925 { |
|
3926 *code = OP_END; |
|
3927 adjust_recurse(previous, 1, utf8, cd, save_hwm); |
|
3928 memmove(previous+1, previous, len); |
|
3929 code++; |
|
3930 if (repeat_max == 0) |
|
3931 { |
|
3932 *previous++ = OP_SKIPZERO; |
|
3933 goto END_REPEAT; |
|
3934 } |
|
3935 *previous++ = OP_BRAZERO + repeat_type; |
|
3936 } |
|
3937 |
|
3938 /* If the maximum is greater than 1 and limited, we have to replicate |
|
3939 in a nested fashion, sticking OP_BRAZERO before each set of brackets. |
|
3940 The first one has to be handled carefully because it's the original |
|
3941 copy, which has to be moved up. The remainder can be handled by code |
|
3942 that is common with the non-zero minimum case below. We have to |
|
3943 adjust the value of repeat_max, since one less copy is required. Once |
|
3944 again, we may have to adjust any OP_RECURSE calls inside the group. */ |
|
3945 |
|
3946 else |
|
3947 { |
|
3948 int offset; |
|
3949 *code = OP_END; |
|
3950 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm); |
|
3951 memmove(previous + 2 + LINK_SIZE, previous, len); |
|
3952 code += 2 + LINK_SIZE; |
|
3953 *previous++ = OP_BRAZERO + repeat_type; |
|
3954 *previous++ = OP_BRA; |
|
3955 |
|
3956 /* We chain together the bracket offset fields that have to be |
|
3957 filled in later when the ends of the brackets are reached. */ |
|
3958 |
|
3959 offset = (bralink == NULL)? 0 : previous - bralink; |
|
3960 bralink = previous; |
|
3961 PUTINC(previous, 0, offset); |
|
3962 } |
|
3963 |
|
3964 repeat_max--; |
|
3965 } |
|
3966 |
|
3967 /* If the minimum is greater than zero, replicate the group as many |
|
3968 times as necessary, and adjust the maximum to the number of subsequent |
|
3969 copies that we need. If we set a first char from the group, and didn't |
|
3970 set a required char, copy the latter from the former. If there are any |
|
3971 forward reference subroutine calls in the group, there will be entries on |
|
3972 the workspace list; replicate these with an appropriate increment. */ |
|
3973 |
|
3974 else |
|
3975 { |
|
3976 if (repeat_min > 1) |
|
3977 { |
|
3978 /* In the pre-compile phase, we don't actually do the replication. We |
|
3979 just adjust the length as if we had. Do some paranoid checks for |
|
3980 potential integer overflow. */ |
|
3981 |
|
3982 if (lengthptr != NULL) |
|
3983 { |
|
3984 int delta = (repeat_min - 1)*length_prevgroup; |
|
3985 if ((double)(repeat_min - 1)*(double)length_prevgroup > |
|
3986 (double)INT_MAX || |
|
3987 OFLOW_MAX - *lengthptr < delta) |
|
3988 { |
|
3989 *errorcodeptr = ERR20; |
|
3990 goto FAILED; |
|
3991 } |
|
3992 *lengthptr += delta; |
|
3993 } |
|
3994 |
|
3995 /* This is compiling for real */ |
|
3996 |
|
3997 else |
|
3998 { |
|
3999 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte; |
|
4000 for (i = 1; i < repeat_min; i++) |
|
4001 { |
|
4002 uschar *hc; |
|
4003 uschar *this_hwm = cd->hwm; |
|
4004 memcpy(code, previous, len); |
|
4005 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) |
|
4006 { |
|
4007 PUT(cd->hwm, 0, GET(hc, 0) + len); |
|
4008 cd->hwm += LINK_SIZE; |
|
4009 } |
|
4010 save_hwm = this_hwm; |
|
4011 code += len; |
|
4012 } |
|
4013 } |
|
4014 } |
|
4015 |
|
4016 if (repeat_max > 0) repeat_max -= repeat_min; |
|
4017 } |
|
4018 |
|
4019 /* This code is common to both the zero and non-zero minimum cases. If |
|
4020 the maximum is limited, it replicates the group in a nested fashion, |
|
4021 remembering the bracket starts on a stack. In the case of a zero minimum, |
|
4022 the first one was set up above. In all cases the repeat_max now specifies |
|
4023 the number of additional copies needed. Again, we must remember to |
|
4024 replicate entries on the forward reference list. */ |
|
4025 |
|
4026 if (repeat_max >= 0) |
|
4027 { |
|
4028 /* In the pre-compile phase, we don't actually do the replication. We |
|
4029 just adjust the length as if we had. For each repetition we must add 1 |
|
4030 to the length for BRAZERO and for all but the last repetition we must |
|
4031 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some |
|
4032 paranoid checks to avoid integer overflow. */ |
|
4033 |
|
4034 if (lengthptr != NULL && repeat_max > 0) |
|
4035 { |
|
4036 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) - |
|
4037 2 - 2*LINK_SIZE; /* Last one doesn't nest */ |
|
4038 if ((double)repeat_max * |
|
4039 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE) |
|
4040 > (double)INT_MAX || |
|
4041 OFLOW_MAX - *lengthptr < delta) |
|
4042 { |
|
4043 *errorcodeptr = ERR20; |
|
4044 goto FAILED; |
|
4045 } |
|
4046 *lengthptr += delta; |
|
4047 } |
|
4048 |
|
4049 /* This is compiling for real */ |
|
4050 |
|
4051 else for (i = repeat_max - 1; i >= 0; i--) |
|
4052 { |
|
4053 uschar *hc; |
|
4054 uschar *this_hwm = cd->hwm; |
|
4055 |
|
4056 *code++ = OP_BRAZERO + repeat_type; |
|
4057 |
|
4058 /* All but the final copy start a new nesting, maintaining the |
|
4059 chain of brackets outstanding. */ |
|
4060 |
|
4061 if (i != 0) |
|
4062 { |
|
4063 int offset; |
|
4064 *code++ = OP_BRA; |
|
4065 offset = (bralink == NULL)? 0 : code - bralink; |
|
4066 bralink = code; |
|
4067 PUTINC(code, 0, offset); |
|
4068 } |
|
4069 |
|
4070 memcpy(code, previous, len); |
|
4071 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) |
|
4072 { |
|
4073 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1)); |
|
4074 cd->hwm += LINK_SIZE; |
|
4075 } |
|
4076 save_hwm = this_hwm; |
|
4077 code += len; |
|
4078 } |
|
4079 |
|
4080 /* Now chain through the pending brackets, and fill in their length |
|
4081 fields (which are holding the chain links pro tem). */ |
|
4082 |
|
4083 while (bralink != NULL) |
|
4084 { |
|
4085 int oldlinkoffset; |
|
4086 int offset = code - bralink + 1; |
|
4087 uschar *bra = code - offset; |
|
4088 oldlinkoffset = GET(bra, 1); |
|
4089 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset; |
|
4090 *code++ = OP_KET; |
|
4091 PUTINC(code, 0, offset); |
|
4092 PUT(bra, 1, offset); |
|
4093 } |
|
4094 } |
|
4095 |
|
4096 /* If the maximum is unlimited, set a repeater in the final copy. We |
|
4097 can't just offset backwards from the current code point, because we |
|
4098 don't know if there's been an options resetting after the ket. The |
|
4099 correct offset was computed above. |
|
4100 |
|
4101 Then, when we are doing the actual compile phase, check to see whether |
|
4102 this group is a non-atomic one that could match an empty string. If so, |
|
4103 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so |
|
4104 that runtime checking can be done. [This check is also applied to |
|
4105 atomic groups at runtime, but in a different way.] */ |
|
4106 |
|
4107 else |
|
4108 { |
|
4109 uschar *ketcode = code - ketoffset; |
|
4110 uschar *bracode = ketcode - GET(ketcode, 1); |
|
4111 *ketcode = OP_KETRMAX + repeat_type; |
|
4112 if (lengthptr == NULL && *bracode != OP_ONCE) |
|
4113 { |
|
4114 uschar *scode = bracode; |
|
4115 do |
|
4116 { |
|
4117 if (could_be_empty_branch(scode, ketcode, utf8)) |
|
4118 { |
|
4119 *bracode += OP_SBRA - OP_BRA; |
|
4120 break; |
|
4121 } |
|
4122 scode += GET(scode, 1); |
|
4123 } |
|
4124 while (*scode == OP_ALT); |
|
4125 } |
|
4126 } |
|
4127 } |
|
4128 |
|
4129 /* If previous is OP_FAIL, it was generated by an empty class [] in |
|
4130 JavaScript mode. The other ways in which OP_FAIL can be generated, that is |
|
4131 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat" |
|
4132 error above. We can just ignore the repeat in JS case. */ |
|
4133 |
|
4134 else if (*previous == OP_FAIL) goto END_REPEAT; |
|
4135 |
|
4136 /* Else there's some kind of shambles */ |
|
4137 |
|
4138 else |
|
4139 { |
|
4140 *errorcodeptr = ERR11; |
|
4141 goto FAILED; |
|
4142 } |
|
4143 |
|
4144 /* If the character following a repeat is '+', or if certain optimization |
|
4145 tests above succeeded, possessive_quantifier is TRUE. For some of the |
|
4146 simpler opcodes, there is a special alternative opcode for this. For |
|
4147 anything else, we wrap the entire repeated item inside OP_ONCE brackets. |
|
4148 The '+' notation is just syntactic sugar, taken from Sun's Java package, |
|
4149 but the special opcodes can optimize it a bit. The repeated item starts at |
|
4150 tempcode, not at previous, which might be the first part of a string whose |
|
4151 (former) last char we repeated. |
|
4152 |
|
4153 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But |
|
4154 an 'upto' may follow. We skip over an 'exact' item, and then test the |
|
4155 length of what remains before proceeding. */ |
|
4156 |
|
4157 if (possessive_quantifier) |
|
4158 { |
|
4159 int len; |
|
4160 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT || |
|
4161 *tempcode == OP_NOTEXACT) |
|
4162 tempcode += _pcre_OP_lengths[*tempcode] + |
|
4163 ((*tempcode == OP_TYPEEXACT && |
|
4164 (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0); |
|
4165 len = code - tempcode; |
|
4166 if (len > 0) switch (*tempcode) |
|
4167 { |
|
4168 case OP_STAR: *tempcode = OP_POSSTAR; break; |
|
4169 case OP_PLUS: *tempcode = OP_POSPLUS; break; |
|
4170 case OP_QUERY: *tempcode = OP_POSQUERY; break; |
|
4171 case OP_UPTO: *tempcode = OP_POSUPTO; break; |
|
4172 |
|
4173 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break; |
|
4174 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break; |
|
4175 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break; |
|
4176 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break; |
|
4177 |
|
4178 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break; |
|
4179 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break; |
|
4180 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break; |
|
4181 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break; |
|
4182 |
|
4183 default: |
|
4184 memmove(tempcode + 1+LINK_SIZE, tempcode, len); |
|
4185 code += 1 + LINK_SIZE; |
|
4186 len += 1 + LINK_SIZE; |
|
4187 tempcode[0] = OP_ONCE; |
|
4188 *code++ = OP_KET; |
|
4189 PUTINC(code, 0, len); |
|
4190 PUT(tempcode, 1, len); |
|
4191 break; |
|
4192 } |
|
4193 } |
|
4194 |
|
4195 /* In all cases we no longer have a previous item. We also set the |
|
4196 "follows varying string" flag for subsequently encountered reqbytes if |
|
4197 it isn't already set and we have just passed a varying length item. */ |
|
4198 |
|
4199 END_REPEAT: |
|
4200 previous = NULL; |
|
4201 cd->req_varyopt |= reqvary; |
|
4202 break; |
|
4203 |
|
4204 |
|
4205 /* ===================================================================*/ |
|
4206 /* Start of nested parenthesized sub-expression, or comment or lookahead or |
|
4207 lookbehind or option setting or condition or all the other extended |
|
4208 parenthesis forms. */ |
|
4209 |
|
4210 case '(': |
|
4211 newoptions = options; |
|
4212 skipbytes = 0; |
|
4213 bravalue = OP_CBRA; |
|
4214 save_hwm = cd->hwm; |
|
4215 reset_bracount = FALSE; |
|
4216 |
|
4217 /* First deal with various "verbs" that can be introduced by '*'. */ |
|
4218 |
|
4219 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0) |
|
4220 { |
|
4221 int i, namelen; |
|
4222 const char *vn = verbnames; |
|
4223 const uschar *name = ++ptr; |
|
4224 previous = NULL; |
|
4225 while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {}; |
|
4226 if (*ptr == ':') |
|
4227 { |
|
4228 *errorcodeptr = ERR59; /* Not supported */ |
|
4229 goto FAILED; |
|
4230 } |
|
4231 if (*ptr != ')') |
|
4232 { |
|
4233 *errorcodeptr = ERR60; |
|
4234 goto FAILED; |
|
4235 } |
|
4236 namelen = ptr - name; |
|
4237 for (i = 0; i < verbcount; i++) |
|
4238 { |
|
4239 if (namelen == verbs[i].len && |
|
4240 strncmp((char *)name, vn, namelen) == 0) |
|
4241 { |
|
4242 *code = verbs[i].op; |
|
4243 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE; |
|
4244 break; |
|
4245 } |
|
4246 vn += verbs[i].len + 1; |
|
4247 } |
|
4248 if (i < verbcount) continue; |
|
4249 *errorcodeptr = ERR60; |
|
4250 goto FAILED; |
|
4251 } |
|
4252 |
|
4253 /* Deal with the extended parentheses; all are introduced by '?', and the |
|
4254 appearance of any of them means that this is not a capturing group. */ |
|
4255 |
|
4256 else if (*ptr == '?') |
|
4257 { |
|
4258 int i, set, unset, namelen; |
|
4259 int *optset; |
|
4260 const uschar *name; |
|
4261 uschar *slot; |
|
4262 |
|
4263 switch (*(++ptr)) |
|
4264 { |
|
4265 case '#': /* Comment; skip to ket */ |
|
4266 ptr++; |
|
4267 while (*ptr != 0 && *ptr != ')') ptr++; |
|
4268 if (*ptr == 0) |
|
4269 { |
|
4270 *errorcodeptr = ERR18; |
|
4271 goto FAILED; |
|
4272 } |
|
4273 continue; |
|
4274 |
|
4275 |
|
4276 /* ------------------------------------------------------------ */ |
|
4277 case '|': /* Reset capture count for each branch */ |
|
4278 reset_bracount = TRUE; |
|
4279 /* Fall through */ |
|
4280 |
|
4281 /* ------------------------------------------------------------ */ |
|
4282 case ':': /* Non-capturing bracket */ |
|
4283 bravalue = OP_BRA; |
|
4284 ptr++; |
|
4285 break; |
|
4286 |
|
4287 |
|
4288 /* ------------------------------------------------------------ */ |
|
4289 case '(': |
|
4290 bravalue = OP_COND; /* Conditional group */ |
|
4291 |
|
4292 /* A condition can be an assertion, a number (referring to a numbered |
|
4293 group), a name (referring to a named group), or 'R', referring to |
|
4294 recursion. R<digits> and R&name are also permitted for recursion tests. |
|
4295 |
|
4296 There are several syntaxes for testing a named group: (?(name)) is used |
|
4297 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')). |
|
4298 |
|
4299 There are two unfortunate ambiguities, caused by history. (a) 'R' can |
|
4300 be the recursive thing or the name 'R' (and similarly for 'R' followed |
|
4301 by digits), and (b) a number could be a name that consists of digits. |
|
4302 In both cases, we look for a name first; if not found, we try the other |
|
4303 cases. */ |
|
4304 |
|
4305 /* For conditions that are assertions, check the syntax, and then exit |
|
4306 the switch. This will take control down to where bracketed groups, |
|
4307 including assertions, are processed. */ |
|
4308 |
|
4309 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<')) |
|
4310 break; |
|
4311 |
|
4312 /* Most other conditions use OP_CREF (a couple change to OP_RREF |
|
4313 below), and all need to skip 3 bytes at the start of the group. */ |
|
4314 |
|
4315 code[1+LINK_SIZE] = OP_CREF; |
|
4316 skipbytes = 3; |
|
4317 refsign = -1; |
|
4318 |
|
4319 /* Check for a test for recursion in a named group. */ |
|
4320 |
|
4321 if (ptr[1] == 'R' && ptr[2] == '&') |
|
4322 { |
|
4323 terminator = -1; |
|
4324 ptr += 2; |
|
4325 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */ |
|
4326 } |
|
4327 |
|
4328 /* Check for a test for a named group's having been set, using the Perl |
|
4329 syntax (?(<name>) or (?('name') */ |
|
4330 |
|
4331 else if (ptr[1] == '<') |
|
4332 { |
|
4333 terminator = '>'; |
|
4334 ptr++; |
|
4335 } |
|
4336 else if (ptr[1] == '\'') |
|
4337 { |
|
4338 terminator = '\''; |
|
4339 ptr++; |
|
4340 } |
|
4341 else |
|
4342 { |
|
4343 terminator = 0; |
|
4344 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr); |
|
4345 } |
|
4346 |
|
4347 /* We now expect to read a name; anything else is an error */ |
|
4348 |
|
4349 if ((cd->ctypes[ptr[1]] & ctype_word) == 0) |
|
4350 { |
|
4351 ptr += 1; /* To get the right offset */ |
|
4352 *errorcodeptr = ERR28; |
|
4353 goto FAILED; |
|
4354 } |
|
4355 |
|
4356 /* Read the name, but also get it as a number if it's all digits */ |
|
4357 |
|
4358 recno = 0; |
|
4359 name = ++ptr; |
|
4360 while ((cd->ctypes[*ptr] & ctype_word) != 0) |
|
4361 { |
|
4362 if (recno >= 0) |
|
4363 recno = ((digitab[*ptr] & ctype_digit) != 0)? |
|
4364 recno * 10 + *ptr - '0' : -1; |
|
4365 ptr++; |
|
4366 } |
|
4367 namelen = ptr - name; |
|
4368 |
|
4369 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')') |
|
4370 { |
|
4371 ptr--; /* Error offset */ |
|
4372 *errorcodeptr = ERR26; |
|
4373 goto FAILED; |
|
4374 } |
|
4375 |
|
4376 /* Do no further checking in the pre-compile phase. */ |
|
4377 |
|
4378 if (lengthptr != NULL) break; |
|
4379 |
|
4380 /* In the real compile we do the work of looking for the actual |
|
4381 reference. If the string started with "+" or "-" we require the rest to |
|
4382 be digits, in which case recno will be set. */ |
|
4383 |
|
4384 if (refsign > 0) |
|
4385 { |
|
4386 if (recno <= 0) |
|
4387 { |
|
4388 *errorcodeptr = ERR58; |
|
4389 goto FAILED; |
|
4390 } |
|
4391 recno = (refsign == '-')? |
|
4392 cd->bracount - recno + 1 : recno +cd->bracount; |
|
4393 if (recno <= 0 || recno > cd->final_bracount) |
|
4394 { |
|
4395 *errorcodeptr = ERR15; |
|
4396 goto FAILED; |
|
4397 } |
|
4398 PUT2(code, 2+LINK_SIZE, recno); |
|
4399 break; |
|
4400 } |
|
4401 |
|
4402 /* Otherwise (did not start with "+" or "-"), start by looking for the |
|
4403 name. */ |
|
4404 |
|
4405 slot = cd->name_table; |
|
4406 for (i = 0; i < cd->names_found; i++) |
|
4407 { |
|
4408 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; |
|
4409 slot += cd->name_entry_size; |
|
4410 } |
|
4411 |
|
4412 /* Found a previous named subpattern */ |
|
4413 |
|
4414 if (i < cd->names_found) |
|
4415 { |
|
4416 recno = GET2(slot, 0); |
|
4417 PUT2(code, 2+LINK_SIZE, recno); |
|
4418 } |
|
4419 |
|
4420 /* Search the pattern for a forward reference */ |
|
4421 |
|
4422 else if ((i = find_parens(ptr, cd, name, namelen, |
|
4423 (options & PCRE_EXTENDED) != 0)) > 0) |
|
4424 { |
|
4425 PUT2(code, 2+LINK_SIZE, i); |
|
4426 } |
|
4427 |
|
4428 /* If terminator == 0 it means that the name followed directly after |
|
4429 the opening parenthesis [e.g. (?(abc)...] and in this case there are |
|
4430 some further alternatives to try. For the cases where terminator != 0 |
|
4431 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have |
|
4432 now checked all the possibilities, so give an error. */ |
|
4433 |
|
4434 else if (terminator != 0) |
|
4435 { |
|
4436 *errorcodeptr = ERR15; |
|
4437 goto FAILED; |
|
4438 } |
|
4439 |
|
4440 /* Check for (?(R) for recursion. Allow digits after R to specify a |
|
4441 specific group number. */ |
|
4442 |
|
4443 else if (*name == 'R') |
|
4444 { |
|
4445 recno = 0; |
|
4446 for (i = 1; i < namelen; i++) |
|
4447 { |
|
4448 if ((digitab[name[i]] & ctype_digit) == 0) |
|
4449 { |
|
4450 *errorcodeptr = ERR15; |
|
4451 goto FAILED; |
|
4452 } |
|
4453 recno = recno * 10 + name[i] - '0'; |
|
4454 } |
|
4455 if (recno == 0) recno = RREF_ANY; |
|
4456 code[1+LINK_SIZE] = OP_RREF; /* Change test type */ |
|
4457 PUT2(code, 2+LINK_SIZE, recno); |
|
4458 } |
|
4459 |
|
4460 /* Similarly, check for the (?(DEFINE) "condition", which is always |
|
4461 false. */ |
|
4462 |
|
4463 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0) |
|
4464 { |
|
4465 code[1+LINK_SIZE] = OP_DEF; |
|
4466 skipbytes = 1; |
|
4467 } |
|
4468 |
|
4469 /* Check for the "name" actually being a subpattern number. We are |
|
4470 in the second pass here, so final_bracount is set. */ |
|
4471 |
|
4472 else if (recno > 0 && recno <= cd->final_bracount) |
|
4473 { |
|
4474 PUT2(code, 2+LINK_SIZE, recno); |
|
4475 } |
|
4476 |
|
4477 /* Either an unidentified subpattern, or a reference to (?(0) */ |
|
4478 |
|
4479 else |
|
4480 { |
|
4481 *errorcodeptr = (recno == 0)? ERR35: ERR15; |
|
4482 goto FAILED; |
|
4483 } |
|
4484 break; |
|
4485 |
|
4486 |
|
4487 /* ------------------------------------------------------------ */ |
|
4488 case '=': /* Positive lookahead */ |
|
4489 bravalue = OP_ASSERT; |
|
4490 ptr++; |
|
4491 break; |
|
4492 |
|
4493 |
|
4494 /* ------------------------------------------------------------ */ |
|
4495 case '!': /* Negative lookahead */ |
|
4496 ptr++; |
|
4497 if (*ptr == ')') /* Optimize (?!) */ |
|
4498 { |
|
4499 *code++ = OP_FAIL; |
|
4500 previous = NULL; |
|
4501 continue; |
|
4502 } |
|
4503 bravalue = OP_ASSERT_NOT; |
|
4504 break; |
|
4505 |
|
4506 |
|
4507 /* ------------------------------------------------------------ */ |
|
4508 case '<': /* Lookbehind or named define */ |
|
4509 switch (ptr[1]) |
|
4510 { |
|
4511 case '=': /* Positive lookbehind */ |
|
4512 bravalue = OP_ASSERTBACK; |
|
4513 ptr += 2; |
|
4514 break; |
|
4515 |
|
4516 case '!': /* Negative lookbehind */ |
|
4517 bravalue = OP_ASSERTBACK_NOT; |
|
4518 ptr += 2; |
|
4519 break; |
|
4520 |
|
4521 default: /* Could be name define, else bad */ |
|
4522 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME; |
|
4523 ptr++; /* Correct offset for error */ |
|
4524 *errorcodeptr = ERR24; |
|
4525 goto FAILED; |
|
4526 } |
|
4527 break; |
|
4528 |
|
4529 |
|
4530 /* ------------------------------------------------------------ */ |
|
4531 case '>': /* One-time brackets */ |
|
4532 bravalue = OP_ONCE; |
|
4533 ptr++; |
|
4534 break; |
|
4535 |
|
4536 |
|
4537 /* ------------------------------------------------------------ */ |
|
4538 case 'C': /* Callout - may be followed by digits; */ |
|
4539 previous_callout = code; /* Save for later completion */ |
|
4540 after_manual_callout = 1; /* Skip one item before completing */ |
|
4541 *code++ = OP_CALLOUT; |
|
4542 { |
|
4543 int n = 0; |
|
4544 while ((digitab[*(++ptr)] & ctype_digit) != 0) |
|
4545 n = n * 10 + *ptr - '0'; |
|
4546 if (*ptr != ')') |
|
4547 { |
|
4548 *errorcodeptr = ERR39; |
|
4549 goto FAILED; |
|
4550 } |
|
4551 if (n > 255) |
|
4552 { |
|
4553 *errorcodeptr = ERR38; |
|
4554 goto FAILED; |
|
4555 } |
|
4556 *code++ = n; |
|
4557 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */ |
|
4558 PUT(code, LINK_SIZE, 0); /* Default length */ |
|
4559 code += 2 * LINK_SIZE; |
|
4560 } |
|
4561 previous = NULL; |
|
4562 continue; |
|
4563 |
|
4564 |
|
4565 /* ------------------------------------------------------------ */ |
|
4566 case 'P': /* Python-style named subpattern handling */ |
|
4567 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */ |
|
4568 { |
|
4569 is_recurse = *ptr == '>'; |
|
4570 terminator = ')'; |
|
4571 goto NAMED_REF_OR_RECURSE; |
|
4572 } |
|
4573 else if (*ptr != '<') /* Test for Python-style definition */ |
|
4574 { |
|
4575 *errorcodeptr = ERR41; |
|
4576 goto FAILED; |
|
4577 } |
|
4578 /* Fall through to handle (?P< as (?< is handled */ |
|
4579 |
|
4580 |
|
4581 /* ------------------------------------------------------------ */ |
|
4582 DEFINE_NAME: /* Come here from (?< handling */ |
|
4583 case '\'': |
|
4584 { |
|
4585 terminator = (*ptr == '<')? '>' : '\''; |
|
4586 name = ++ptr; |
|
4587 |
|
4588 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; |
|
4589 namelen = ptr - name; |
|
4590 |
|
4591 /* In the pre-compile phase, just do a syntax check. */ |
|
4592 |
|
4593 if (lengthptr != NULL) |
|
4594 { |
|
4595 if (*ptr != terminator) |
|
4596 { |
|
4597 *errorcodeptr = ERR42; |
|
4598 goto FAILED; |
|
4599 } |
|
4600 if (cd->names_found >= MAX_NAME_COUNT) |
|
4601 { |
|
4602 *errorcodeptr = ERR49; |
|
4603 goto FAILED; |
|
4604 } |
|
4605 if (namelen + 3 > cd->name_entry_size) |
|
4606 { |
|
4607 cd->name_entry_size = namelen + 3; |
|
4608 if (namelen > MAX_NAME_SIZE) |
|
4609 { |
|
4610 *errorcodeptr = ERR48; |
|
4611 goto FAILED; |
|
4612 } |
|
4613 } |
|
4614 } |
|
4615 |
|
4616 /* In the real compile, create the entry in the table */ |
|
4617 |
|
4618 else |
|
4619 { |
|
4620 slot = cd->name_table; |
|
4621 for (i = 0; i < cd->names_found; i++) |
|
4622 { |
|
4623 int crc = memcmp(name, slot+2, namelen); |
|
4624 if (crc == 0) |
|
4625 { |
|
4626 if (slot[2+namelen] == 0) |
|
4627 { |
|
4628 if ((options & PCRE_DUPNAMES) == 0) |
|
4629 { |
|
4630 *errorcodeptr = ERR43; |
|
4631 goto FAILED; |
|
4632 } |
|
4633 } |
|
4634 else crc = -1; /* Current name is substring */ |
|
4635 } |
|
4636 if (crc < 0) |
|
4637 { |
|
4638 memmove(slot + cd->name_entry_size, slot, |
|
4639 (cd->names_found - i) * cd->name_entry_size); |
|
4640 break; |
|
4641 } |
|
4642 slot += cd->name_entry_size; |
|
4643 } |
|
4644 |
|
4645 PUT2(slot, 0, cd->bracount + 1); |
|
4646 memcpy(slot + 2, name, namelen); |
|
4647 slot[2+namelen] = 0; |
|
4648 } |
|
4649 } |
|
4650 |
|
4651 /* In both cases, count the number of names we've encountered. */ |
|
4652 |
|
4653 ptr++; /* Move past > or ' */ |
|
4654 cd->names_found++; |
|
4655 goto NUMBERED_GROUP; |
|
4656 |
|
4657 |
|
4658 /* ------------------------------------------------------------ */ |
|
4659 case '&': /* Perl recursion/subroutine syntax */ |
|
4660 terminator = ')'; |
|
4661 is_recurse = TRUE; |
|
4662 /* Fall through */ |
|
4663 |
|
4664 /* We come here from the Python syntax above that handles both |
|
4665 references (?P=name) and recursion (?P>name), as well as falling |
|
4666 through from the Perl recursion syntax (?&name). We also come here from |
|
4667 the Perl \k<name> or \k'name' back reference syntax and the \k{name} |
|
4668 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */ |
|
4669 |
|
4670 NAMED_REF_OR_RECURSE: |
|
4671 name = ++ptr; |
|
4672 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; |
|
4673 namelen = ptr - name; |
|
4674 |
|
4675 /* In the pre-compile phase, do a syntax check and set a dummy |
|
4676 reference number. */ |
|
4677 |
|
4678 if (lengthptr != NULL) |
|
4679 { |
|
4680 if (namelen == 0) |
|
4681 { |
|
4682 *errorcodeptr = ERR62; |
|
4683 goto FAILED; |
|
4684 } |
|
4685 if (*ptr != terminator) |
|
4686 { |
|
4687 *errorcodeptr = ERR42; |
|
4688 goto FAILED; |
|
4689 } |
|
4690 if (namelen > MAX_NAME_SIZE) |
|
4691 { |
|
4692 *errorcodeptr = ERR48; |
|
4693 goto FAILED; |
|
4694 } |
|
4695 recno = 0; |
|
4696 } |
|
4697 |
|
4698 /* In the real compile, seek the name in the table. We check the name |
|
4699 first, and then check that we have reached the end of the name in the |
|
4700 table. That way, if the name that is longer than any in the table, |
|
4701 the comparison will fail without reading beyond the table entry. */ |
|
4702 |
|
4703 else |
|
4704 { |
|
4705 slot = cd->name_table; |
|
4706 for (i = 0; i < cd->names_found; i++) |
|
4707 { |
|
4708 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 && |
|
4709 slot[2+namelen] == 0) |
|
4710 break; |
|
4711 slot += cd->name_entry_size; |
|
4712 } |
|
4713 |
|
4714 if (i < cd->names_found) /* Back reference */ |
|
4715 { |
|
4716 recno = GET2(slot, 0); |
|
4717 } |
|
4718 else if ((recno = /* Forward back reference */ |
|
4719 find_parens(ptr, cd, name, namelen, |
|
4720 (options & PCRE_EXTENDED) != 0)) <= 0) |
|
4721 { |
|
4722 *errorcodeptr = ERR15; |
|
4723 goto FAILED; |
|
4724 } |
|
4725 } |
|
4726 |
|
4727 /* In both phases, we can now go to the code that handles numerical |
|
4728 recursion or backreferences. */ |
|
4729 |
|
4730 if (is_recurse) goto HANDLE_RECURSION; |
|
4731 else goto HANDLE_REFERENCE; |
|
4732 |
|
4733 |
|
4734 /* ------------------------------------------------------------ */ |
|
4735 case 'R': /* Recursion */ |
|
4736 ptr++; /* Same as (?0) */ |
|
4737 /* Fall through */ |
|
4738 |
|
4739 |
|
4740 /* ------------------------------------------------------------ */ |
|
4741 case '-': case '+': |
|
4742 case '0': case '1': case '2': case '3': case '4': /* Recursion or */ |
|
4743 case '5': case '6': case '7': case '8': case '9': /* subroutine */ |
|
4744 { |
|
4745 const uschar *called; |
|
4746 terminator = ')'; |
|
4747 |
|
4748 /* Come here from the \g<...> and \g'...' code (Oniguruma |
|
4749 compatibility). However, the syntax has been checked to ensure that |
|
4750 the ... are a (signed) number, so that neither ERR63 nor ERR29 will |
|
4751 be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY |
|
4752 ever be taken. */ |
|
4753 |
|
4754 HANDLE_NUMERICAL_RECURSION: |
|
4755 |
|
4756 if ((refsign = *ptr) == '+') |
|
4757 { |
|
4758 ptr++; |
|
4759 if ((digitab[*ptr] & ctype_digit) == 0) |
|
4760 { |
|
4761 *errorcodeptr = ERR63; |
|
4762 goto FAILED; |
|
4763 } |
|
4764 } |
|
4765 else if (refsign == '-') |
|
4766 { |
|
4767 if ((digitab[ptr[1]] & ctype_digit) == 0) |
|
4768 goto OTHER_CHAR_AFTER_QUERY; |
|
4769 ptr++; |
|
4770 } |
|
4771 |
|
4772 recno = 0; |
|
4773 while((digitab[*ptr] & ctype_digit) != 0) |
|
4774 recno = recno * 10 + *ptr++ - '0'; |
|
4775 |
|
4776 if (*ptr != terminator) |
|
4777 { |
|
4778 *errorcodeptr = ERR29; |
|
4779 goto FAILED; |
|
4780 } |
|
4781 |
|
4782 if (refsign == '-') |
|
4783 { |
|
4784 if (recno == 0) |
|
4785 { |
|
4786 *errorcodeptr = ERR58; |
|
4787 goto FAILED; |
|
4788 } |
|
4789 recno = cd->bracount - recno + 1; |
|
4790 if (recno <= 0) |
|
4791 { |
|
4792 *errorcodeptr = ERR15; |
|
4793 goto FAILED; |
|
4794 } |
|
4795 } |
|
4796 else if (refsign == '+') |
|
4797 { |
|
4798 if (recno == 0) |
|
4799 { |
|
4800 *errorcodeptr = ERR58; |
|
4801 goto FAILED; |
|
4802 } |
|
4803 recno += cd->bracount; |
|
4804 } |
|
4805 |
|
4806 /* Come here from code above that handles a named recursion */ |
|
4807 |
|
4808 HANDLE_RECURSION: |
|
4809 |
|
4810 previous = code; |
|
4811 called = cd->start_code; |
|
4812 |
|
4813 /* When we are actually compiling, find the bracket that is being |
|
4814 referenced. Temporarily end the regex in case it doesn't exist before |
|
4815 this point. If we end up with a forward reference, first check that |
|
4816 the bracket does occur later so we can give the error (and position) |
|
4817 now. Then remember this forward reference in the workspace so it can |
|
4818 be filled in at the end. */ |
|
4819 |
|
4820 if (lengthptr == NULL) |
|
4821 { |
|
4822 *code = OP_END; |
|
4823 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno); |
|
4824 |
|
4825 /* Forward reference */ |
|
4826 |
|
4827 if (called == NULL) |
|
4828 { |
|
4829 if (find_parens(ptr, cd, NULL, recno, |
|
4830 (options & PCRE_EXTENDED) != 0) < 0) |
|
4831 { |
|
4832 *errorcodeptr = ERR15; |
|
4833 goto FAILED; |
|
4834 } |
|
4835 called = cd->start_code + recno; |
|
4836 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code); |
|
4837 } |
|
4838 |
|
4839 /* If not a forward reference, and the subpattern is still open, |
|
4840 this is a recursive call. We check to see if this is a left |
|
4841 recursion that could loop for ever, and diagnose that case. */ |
|
4842 |
|
4843 else if (GET(called, 1) == 0 && |
|
4844 could_be_empty(called, code, bcptr, utf8)) |
|
4845 { |
|
4846 *errorcodeptr = ERR40; |
|
4847 goto FAILED; |
|
4848 } |
|
4849 } |
|
4850 |
|
4851 /* Insert the recursion/subroutine item, automatically wrapped inside |
|
4852 "once" brackets. Set up a "previous group" length so that a |
|
4853 subsequent quantifier will work. */ |
|
4854 |
|
4855 *code = OP_ONCE; |
|
4856 PUT(code, 1, 2 + 2*LINK_SIZE); |
|
4857 code += 1 + LINK_SIZE; |
|
4858 |
|
4859 *code = OP_RECURSE; |
|
4860 PUT(code, 1, called - cd->start_code); |
|
4861 code += 1 + LINK_SIZE; |
|
4862 |
|
4863 *code = OP_KET; |
|
4864 PUT(code, 1, 2 + 2*LINK_SIZE); |
|
4865 code += 1 + LINK_SIZE; |
|
4866 |
|
4867 length_prevgroup = 3 + 3*LINK_SIZE; |
|
4868 } |
|
4869 |
|
4870 /* Can't determine a first byte now */ |
|
4871 |
|
4872 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; |
|
4873 continue; |
|
4874 |
|
4875 |
|
4876 /* ------------------------------------------------------------ */ |
|
4877 default: /* Other characters: check option setting */ |
|
4878 OTHER_CHAR_AFTER_QUERY: |
|
4879 set = unset = 0; |
|
4880 optset = &set; |
|
4881 |
|
4882 while (*ptr != ')' && *ptr != ':') |
|
4883 { |
|
4884 switch (*ptr++) |
|
4885 { |
|
4886 case '-': optset = &unset; break; |
|
4887 |
|
4888 case 'J': /* Record that it changed in the external options */ |
|
4889 *optset |= PCRE_DUPNAMES; |
|
4890 cd->external_flags |= PCRE_JCHANGED; |
|
4891 break; |
|
4892 |
|
4893 case 'i': *optset |= PCRE_CASELESS; break; |
|
4894 case 'm': *optset |= PCRE_MULTILINE; break; |
|
4895 case 's': *optset |= PCRE_DOTALL; break; |
|
4896 case 'x': *optset |= PCRE_EXTENDED; break; |
|
4897 case 'U': *optset |= PCRE_UNGREEDY; break; |
|
4898 case 'X': *optset |= PCRE_EXTRA; break; |
|
4899 |
|
4900 default: *errorcodeptr = ERR12; |
|
4901 ptr--; /* Correct the offset */ |
|
4902 goto FAILED; |
|
4903 } |
|
4904 } |
|
4905 |
|
4906 /* Set up the changed option bits, but don't change anything yet. */ |
|
4907 |
|
4908 newoptions = (options | set) & (~unset); |
|
4909 |
|
4910 /* If the options ended with ')' this is not the start of a nested |
|
4911 group with option changes, so the options change at this level. If this |
|
4912 item is right at the start of the pattern, the options can be |
|
4913 abstracted and made external in the pre-compile phase, and ignored in |
|
4914 the compile phase. This can be helpful when matching -- for instance in |
|
4915 caseless checking of required bytes. |
|
4916 |
|
4917 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are |
|
4918 definitely *not* at the start of the pattern because something has been |
|
4919 compiled. In the pre-compile phase, however, the code pointer can have |
|
4920 that value after the start, because it gets reset as code is discarded |
|
4921 during the pre-compile. However, this can happen only at top level - if |
|
4922 we are within parentheses, the starting BRA will still be present. At |
|
4923 any parenthesis level, the length value can be used to test if anything |
|
4924 has been compiled at that level. Thus, a test for both these conditions |
|
4925 is necessary to ensure we correctly detect the start of the pattern in |
|
4926 both phases. |
|
4927 |
|
4928 If we are not at the pattern start, compile code to change the ims |
|
4929 options if this setting actually changes any of them, and reset the |
|
4930 greedy defaults and the case value for firstbyte and reqbyte. */ |
|
4931 |
|
4932 if (*ptr == ')') |
|
4933 { |
|
4934 if (code == cd->start_code + 1 + LINK_SIZE && |
|
4935 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE)) |
|
4936 { |
|
4937 cd->external_options = newoptions; |
|
4938 } |
|
4939 else |
|
4940 { |
|
4941 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS)) |
|
4942 { |
|
4943 *code++ = OP_OPT; |
|
4944 *code++ = newoptions & PCRE_IMS; |
|
4945 } |
|
4946 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0); |
|
4947 greedy_non_default = greedy_default ^ 1; |
|
4948 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; |
|
4949 } |
|
4950 |
|
4951 /* Change options at this level, and pass them back for use |
|
4952 in subsequent branches. When not at the start of the pattern, this |
|
4953 information is also necessary so that a resetting item can be |
|
4954 compiled at the end of a group (if we are in a group). */ |
|
4955 |
|
4956 *optionsptr = options = newoptions; |
|
4957 previous = NULL; /* This item can't be repeated */ |
|
4958 continue; /* It is complete */ |
|
4959 } |
|
4960 |
|
4961 /* If the options ended with ':' we are heading into a nested group |
|
4962 with possible change of options. Such groups are non-capturing and are |
|
4963 not assertions of any kind. All we need to do is skip over the ':'; |
|
4964 the newoptions value is handled below. */ |
|
4965 |
|
4966 bravalue = OP_BRA; |
|
4967 ptr++; |
|
4968 } /* End of switch for character following (? */ |
|
4969 } /* End of (? handling */ |
|
4970 |
|
4971 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set, |
|
4972 all unadorned brackets become non-capturing and behave like (?:...) |
|
4973 brackets. */ |
|
4974 |
|
4975 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0) |
|
4976 { |
|
4977 bravalue = OP_BRA; |
|
4978 } |
|
4979 |
|
4980 /* Else we have a capturing group. */ |
|
4981 |
|
4982 else |
|
4983 { |
|
4984 NUMBERED_GROUP: |
|
4985 cd->bracount += 1; |
|
4986 PUT2(code, 1+LINK_SIZE, cd->bracount); |
|
4987 skipbytes = 2; |
|
4988 } |
|
4989 |
|
4990 /* Process nested bracketed regex. Assertions may not be repeated, but |
|
4991 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a |
|
4992 non-register variable in order to be able to pass its address because some |
|
4993 compilers complain otherwise. Pass in a new setting for the ims options if |
|
4994 they have changed. */ |
|
4995 |
|
4996 previous = (bravalue >= OP_ONCE)? code : NULL; |
|
4997 *code = bravalue; |
|
4998 tempcode = code; |
|
4999 tempreqvary = cd->req_varyopt; /* Save value before bracket */ |
|
5000 length_prevgroup = 0; /* Initialize for pre-compile phase */ |
|
5001 |
|
5002 if (!compile_regex( |
|
5003 newoptions, /* The complete new option state */ |
|
5004 options & PCRE_IMS, /* The previous ims option state */ |
|
5005 &tempcode, /* Where to put code (updated) */ |
|
5006 &ptr, /* Input pointer (updated) */ |
|
5007 errorcodeptr, /* Where to put an error message */ |
|
5008 (bravalue == OP_ASSERTBACK || |
|
5009 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */ |
|
5010 reset_bracount, /* True if (?| group */ |
|
5011 skipbytes, /* Skip over bracket number */ |
|
5012 &subfirstbyte, /* For possible first char */ |
|
5013 &subreqbyte, /* For possible last char */ |
|
5014 bcptr, /* Current branch chain */ |
|
5015 cd, /* Tables block */ |
|
5016 (lengthptr == NULL)? NULL : /* Actual compile phase */ |
|
5017 &length_prevgroup /* Pre-compile phase */ |
|
5018 )) |
|
5019 goto FAILED; |
|
5020 |
|
5021 /* At the end of compiling, code is still pointing to the start of the |
|
5022 group, while tempcode has been updated to point past the end of the group |
|
5023 and any option resetting that may follow it. The pattern pointer (ptr) |
|
5024 is on the bracket. */ |
|
5025 |
|
5026 /* If this is a conditional bracket, check that there are no more than |
|
5027 two branches in the group, or just one if it's a DEFINE group. We do this |
|
5028 in the real compile phase, not in the pre-pass, where the whole group may |
|
5029 not be available. */ |
|
5030 |
|
5031 if (bravalue == OP_COND && lengthptr == NULL) |
|
5032 { |
|
5033 uschar *tc = code; |
|
5034 int condcount = 0; |
|
5035 |
|
5036 do { |
|
5037 condcount++; |
|
5038 tc += GET(tc,1); |
|
5039 } |
|
5040 while (*tc != OP_KET); |
|
5041 |
|
5042 /* A DEFINE group is never obeyed inline (the "condition" is always |
|
5043 false). It must have only one branch. */ |
|
5044 |
|
5045 if (code[LINK_SIZE+1] == OP_DEF) |
|
5046 { |
|
5047 if (condcount > 1) |
|
5048 { |
|
5049 *errorcodeptr = ERR54; |
|
5050 goto FAILED; |
|
5051 } |
|
5052 bravalue = OP_DEF; /* Just a flag to suppress char handling below */ |
|
5053 } |
|
5054 |
|
5055 /* A "normal" conditional group. If there is just one branch, we must not |
|
5056 make use of its firstbyte or reqbyte, because this is equivalent to an |
|
5057 empty second branch. */ |
|
5058 |
|
5059 else |
|
5060 { |
|
5061 if (condcount > 2) |
|
5062 { |
|
5063 *errorcodeptr = ERR27; |
|
5064 goto FAILED; |
|
5065 } |
|
5066 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE; |
|
5067 } |
|
5068 } |
|
5069 |
|
5070 /* Error if hit end of pattern */ |
|
5071 |
|
5072 if (*ptr != ')') |
|
5073 { |
|
5074 *errorcodeptr = ERR14; |
|
5075 goto FAILED; |
|
5076 } |
|
5077 |
|
5078 /* In the pre-compile phase, update the length by the length of the group, |
|
5079 less the brackets at either end. Then reduce the compiled code to just a |
|
5080 set of non-capturing brackets so that it doesn't use much memory if it is |
|
5081 duplicated by a quantifier.*/ |
|
5082 |
|
5083 if (lengthptr != NULL) |
|
5084 { |
|
5085 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE) |
|
5086 { |
|
5087 *errorcodeptr = ERR20; |
|
5088 goto FAILED; |
|
5089 } |
|
5090 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE; |
|
5091 *code++ = OP_BRA; |
|
5092 PUTINC(code, 0, 1 + LINK_SIZE); |
|
5093 *code++ = OP_KET; |
|
5094 PUTINC(code, 0, 1 + LINK_SIZE); |
|
5095 break; /* No need to waste time with special character handling */ |
|
5096 } |
|
5097 |
|
5098 /* Otherwise update the main code pointer to the end of the group. */ |
|
5099 |
|
5100 code = tempcode; |
|
5101 |
|
5102 /* For a DEFINE group, required and first character settings are not |
|
5103 relevant. */ |
|
5104 |
|
5105 if (bravalue == OP_DEF) break; |
|
5106 |
|
5107 /* Handle updating of the required and first characters for other types of |
|
5108 group. Update for normal brackets of all kinds, and conditions with two |
|
5109 branches (see code above). If the bracket is followed by a quantifier with |
|
5110 zero repeat, we have to back off. Hence the definition of zeroreqbyte and |
|
5111 zerofirstbyte outside the main loop so that they can be accessed for the |
|
5112 back off. */ |
|
5113 |
|
5114 zeroreqbyte = reqbyte; |
|
5115 zerofirstbyte = firstbyte; |
|
5116 groupsetfirstbyte = FALSE; |
|
5117 |
|
5118 if (bravalue >= OP_ONCE) |
|
5119 { |
|
5120 /* If we have not yet set a firstbyte in this branch, take it from the |
|
5121 subpattern, remembering that it was set here so that a repeat of more |
|
5122 than one can replicate it as reqbyte if necessary. If the subpattern has |
|
5123 no firstbyte, set "none" for the whole branch. In both cases, a zero |
|
5124 repeat forces firstbyte to "none". */ |
|
5125 |
|
5126 if (firstbyte == REQ_UNSET) |
|
5127 { |
|
5128 if (subfirstbyte >= 0) |
|
5129 { |
|
5130 firstbyte = subfirstbyte; |
|
5131 groupsetfirstbyte = TRUE; |
|
5132 } |
|
5133 else firstbyte = REQ_NONE; |
|
5134 zerofirstbyte = REQ_NONE; |
|
5135 } |
|
5136 |
|
5137 /* If firstbyte was previously set, convert the subpattern's firstbyte |
|
5138 into reqbyte if there wasn't one, using the vary flag that was in |
|
5139 existence beforehand. */ |
|
5140 |
|
5141 else if (subfirstbyte >= 0 && subreqbyte < 0) |
|
5142 subreqbyte = subfirstbyte | tempreqvary; |
|
5143 |
|
5144 /* If the subpattern set a required byte (or set a first byte that isn't |
|
5145 really the first byte - see above), set it. */ |
|
5146 |
|
5147 if (subreqbyte >= 0) reqbyte = subreqbyte; |
|
5148 } |
|
5149 |
|
5150 /* For a forward assertion, we take the reqbyte, if set. This can be |
|
5151 helpful if the pattern that follows the assertion doesn't set a different |
|
5152 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte |
|
5153 for an assertion, however because it leads to incorrect effect for patterns |
|
5154 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead |
|
5155 of a firstbyte. This is overcome by a scan at the end if there's no |
|
5156 firstbyte, looking for an asserted first char. */ |
|
5157 |
|
5158 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte; |
|
5159 break; /* End of processing '(' */ |
|
5160 |
|
5161 |
|
5162 /* ===================================================================*/ |
|
5163 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values |
|
5164 are arranged to be the negation of the corresponding OP_values. For the |
|
5165 back references, the values are ESC_REF plus the reference number. Only |
|
5166 back references and those types that consume a character may be repeated. |
|
5167 We can test for values between ESC_b and ESC_Z for the latter; this may |
|
5168 have to change if any new ones are ever created. */ |
|
5169 |
|
5170 case '\\': |
|
5171 tempptr = ptr; |
|
5172 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE); |
|
5173 if (*errorcodeptr != 0) goto FAILED; |
|
5174 |
|
5175 if (c < 0) |
|
5176 { |
|
5177 if (-c == ESC_Q) /* Handle start of quoted string */ |
|
5178 { |
|
5179 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */ |
|
5180 else inescq = TRUE; |
|
5181 continue; |
|
5182 } |
|
5183 |
|
5184 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */ |
|
5185 |
|
5186 /* For metasequences that actually match a character, we disable the |
|
5187 setting of a first character if it hasn't already been set. */ |
|
5188 |
|
5189 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z) |
|
5190 firstbyte = REQ_NONE; |
|
5191 |
|
5192 /* Set values to reset to if this is followed by a zero repeat. */ |
|
5193 |
|
5194 zerofirstbyte = firstbyte; |
|
5195 zeroreqbyte = reqbyte; |
|
5196 |
|
5197 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n' |
|
5198 is a subroutine call by number (Oniguruma syntax). In fact, the value |
|
5199 -ESC_g is returned only for these cases. So we don't need to check for < |
|
5200 or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is |
|
5201 -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as |
|
5202 that is a synonym for a named back reference). */ |
|
5203 |
|
5204 if (-c == ESC_g) |
|
5205 { |
|
5206 const uschar *p; |
|
5207 save_hwm = cd->hwm; /* Normally this is set when '(' is read */ |
|
5208 terminator = (*(++ptr) == '<')? '>' : '\''; |
|
5209 |
|
5210 /* These two statements stop the compiler from warning about possibly |
|
5211 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In |
|
5212 fact, because we actually check for a number below, the paths that |
|
5213 would actually be in error are never taken. */ |
|
5214 |
|
5215 skipbytes = 0; |
|
5216 reset_bracount = FALSE; |
|
5217 |
|
5218 /* Test for a name */ |
|
5219 |
|
5220 if (ptr[1] != '+' && ptr[1] != '-') |
|
5221 { |
|
5222 BOOL isnumber = TRUE; |
|
5223 for (p = ptr + 1; *p != 0 && *p != terminator; p++) |
|
5224 { |
|
5225 if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE; |
|
5226 if ((cd->ctypes[*p] & ctype_word) == 0) break; |
|
5227 } |
|
5228 if (*p != terminator) |
|
5229 { |
|
5230 *errorcodeptr = ERR57; |
|
5231 break; |
|
5232 } |
|
5233 if (isnumber) |
|
5234 { |
|
5235 ptr++; |
|
5236 goto HANDLE_NUMERICAL_RECURSION; |
|
5237 } |
|
5238 is_recurse = TRUE; |
|
5239 goto NAMED_REF_OR_RECURSE; |
|
5240 } |
|
5241 |
|
5242 /* Test a signed number in angle brackets or quotes. */ |
|
5243 |
|
5244 p = ptr + 2; |
|
5245 while ((digitab[*p] & ctype_digit) != 0) p++; |
|
5246 if (*p != terminator) |
|
5247 { |
|
5248 *errorcodeptr = ERR57; |
|
5249 break; |
|
5250 } |
|
5251 ptr++; |
|
5252 goto HANDLE_NUMERICAL_RECURSION; |
|
5253 } |
|
5254 |
|
5255 /* \k<name> or \k'name' is a back reference by name (Perl syntax). |
|
5256 We also support \k{name} (.NET syntax) */ |
|
5257 |
|
5258 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{')) |
|
5259 { |
|
5260 is_recurse = FALSE; |
|
5261 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}'; |
|
5262 goto NAMED_REF_OR_RECURSE; |
|
5263 } |
|
5264 |
|
5265 /* Back references are handled specially; must disable firstbyte if |
|
5266 not set to cope with cases like (?=(\w+))\1: which would otherwise set |
|
5267 ':' later. */ |
|
5268 |
|
5269 if (-c >= ESC_REF) |
|
5270 { |
|
5271 recno = -c - ESC_REF; |
|
5272 |
|
5273 HANDLE_REFERENCE: /* Come here from named backref handling */ |
|
5274 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; |
|
5275 previous = code; |
|
5276 *code++ = OP_REF; |
|
5277 PUT2INC(code, 0, recno); |
|
5278 cd->backref_map |= (recno < 32)? (1 << recno) : 1; |
|
5279 if (recno > cd->top_backref) cd->top_backref = recno; |
|
5280 } |
|
5281 |
|
5282 /* So are Unicode property matches, if supported. */ |
|
5283 |
|
5284 #ifdef SUPPORT_UCP |
|
5285 else if (-c == ESC_P || -c == ESC_p) |
|
5286 { |
|
5287 BOOL negated; |
|
5288 int pdata; |
|
5289 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); |
|
5290 if (ptype < 0) goto FAILED; |
|
5291 previous = code; |
|
5292 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP; |
|
5293 *code++ = ptype; |
|
5294 *code++ = pdata; |
|
5295 } |
|
5296 #else |
|
5297 |
|
5298 /* If Unicode properties are not supported, \X, \P, and \p are not |
|
5299 allowed. */ |
|
5300 |
|
5301 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p) |
|
5302 { |
|
5303 *errorcodeptr = ERR45; |
|
5304 goto FAILED; |
|
5305 } |
|
5306 #endif |
|
5307 |
|
5308 /* For the rest (including \X when Unicode properties are supported), we |
|
5309 can obtain the OP value by negating the escape value. */ |
|
5310 |
|
5311 else |
|
5312 { |
|
5313 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL; |
|
5314 *code++ = -c; |
|
5315 } |
|
5316 continue; |
|
5317 } |
|
5318 |
|
5319 /* We have a data character whose value is in c. In UTF-8 mode it may have |
|
5320 a value > 127. We set its representation in the length/buffer, and then |
|
5321 handle it as a data character. */ |
|
5322 |
|
5323 #ifdef SUPPORT_UTF8 |
|
5324 if (utf8 && c > 127) |
|
5325 mclength = _pcre_ord2utf8(c, mcbuffer); |
|
5326 else |
|
5327 #endif |
|
5328 |
|
5329 { |
|
5330 mcbuffer[0] = c; |
|
5331 mclength = 1; |
|
5332 } |
|
5333 goto ONE_CHAR; |
|
5334 |
|
5335 |
|
5336 /* ===================================================================*/ |
|
5337 /* Handle a literal character. It is guaranteed not to be whitespace or # |
|
5338 when the extended flag is set. If we are in UTF-8 mode, it may be a |
|
5339 multi-byte literal character. */ |
|
5340 |
|
5341 default: |
|
5342 NORMAL_CHAR: |
|
5343 mclength = 1; |
|
5344 mcbuffer[0] = c; |
|
5345 |
|
5346 #ifdef SUPPORT_UTF8 |
|
5347 if (utf8 && c >= 0xc0) |
|
5348 { |
|
5349 while ((ptr[1] & 0xc0) == 0x80) |
|
5350 mcbuffer[mclength++] = *(++ptr); |
|
5351 } |
|
5352 #endif |
|
5353 |
|
5354 /* At this point we have the character's bytes in mcbuffer, and the length |
|
5355 in mclength. When not in UTF-8 mode, the length is always 1. */ |
|
5356 |
|
5357 ONE_CHAR: |
|
5358 previous = code; |
|
5359 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR; |
|
5360 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c]; |
|
5361 |
|
5362 /* Remember if \r or \n were seen */ |
|
5363 |
|
5364 if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n') |
|
5365 cd->external_flags |= PCRE_HASCRORLF; |
|
5366 |
|
5367 /* Set the first and required bytes appropriately. If no previous first |
|
5368 byte, set it from this character, but revert to none on a zero repeat. |
|
5369 Otherwise, leave the firstbyte value alone, and don't change it on a zero |
|
5370 repeat. */ |
|
5371 |
|
5372 if (firstbyte == REQ_UNSET) |
|
5373 { |
|
5374 zerofirstbyte = REQ_NONE; |
|
5375 zeroreqbyte = reqbyte; |
|
5376 |
|
5377 /* If the character is more than one byte long, we can set firstbyte |
|
5378 only if it is not to be matched caselessly. */ |
|
5379 |
|
5380 if (mclength == 1 || req_caseopt == 0) |
|
5381 { |
|
5382 firstbyte = mcbuffer[0] | req_caseopt; |
|
5383 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt; |
|
5384 } |
|
5385 else firstbyte = reqbyte = REQ_NONE; |
|
5386 } |
|
5387 |
|
5388 /* firstbyte was previously set; we can set reqbyte only if the length is |
|
5389 1 or the matching is caseful. */ |
|
5390 |
|
5391 else |
|
5392 { |
|
5393 zerofirstbyte = firstbyte; |
|
5394 zeroreqbyte = reqbyte; |
|
5395 if (mclength == 1 || req_caseopt == 0) |
|
5396 reqbyte = code[-1] | req_caseopt | cd->req_varyopt; |
|
5397 } |
|
5398 |
|
5399 break; /* End of literal character handling */ |
|
5400 } |
|
5401 } /* end of big loop */ |
|
5402 |
|
5403 |
|
5404 /* Control never reaches here by falling through, only by a goto for all the |
|
5405 error states. Pass back the position in the pattern so that it can be displayed |
|
5406 to the user for diagnosing the error. */ |
|
5407 |
|
5408 FAILED: |
|
5409 *ptrptr = ptr; |
|
5410 return FALSE; |
|
5411 } |
|
5412 |
|
5413 |
|
5414 |
|
5415 |
|
5416 /************************************************* |
|
5417 * Compile sequence of alternatives * |
|
5418 *************************************************/ |
|
5419 |
|
5420 /* On entry, ptr is pointing past the bracket character, but on return it |
|
5421 points to the closing bracket, or vertical bar, or end of string. The code |
|
5422 variable is pointing at the byte into which the BRA operator has been stored. |
|
5423 If the ims options are changed at the start (for a (?ims: group) or during any |
|
5424 branch, we need to insert an OP_OPT item at the start of every following branch |
|
5425 to ensure they get set correctly at run time, and also pass the new options |
|
5426 into every subsequent branch compile. |
|
5427 |
|
5428 This function is used during the pre-compile phase when we are trying to find |
|
5429 out the amount of memory needed, as well as during the real compile phase. The |
|
5430 value of lengthptr distinguishes the two phases. |
|
5431 |
|
5432 Arguments: |
|
5433 options option bits, including any changes for this subpattern |
|
5434 oldims previous settings of ims option bits |
|
5435 codeptr -> the address of the current code pointer |
|
5436 ptrptr -> the address of the current pattern pointer |
|
5437 errorcodeptr -> pointer to error code variable |
|
5438 lookbehind TRUE if this is a lookbehind assertion |
|
5439 reset_bracount TRUE to reset the count for each branch |
|
5440 skipbytes skip this many bytes at start (for brackets and OP_COND) |
|
5441 firstbyteptr place to put the first required character, or a negative number |
|
5442 reqbyteptr place to put the last required character, or a negative number |
|
5443 bcptr pointer to the chain of currently open branches |
|
5444 cd points to the data block with tables pointers etc. |
|
5445 lengthptr NULL during the real compile phase |
|
5446 points to length accumulator during pre-compile phase |
|
5447 |
|
5448 Returns: TRUE on success |
|
5449 */ |
|
5450 |
|
5451 static BOOL |
|
5452 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr, |
|
5453 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes, |
|
5454 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd, |
|
5455 int *lengthptr) |
|
5456 { |
|
5457 const uschar *ptr = *ptrptr; |
|
5458 uschar *code = *codeptr; |
|
5459 uschar *last_branch = code; |
|
5460 uschar *start_bracket = code; |
|
5461 uschar *reverse_count = NULL; |
|
5462 int firstbyte, reqbyte; |
|
5463 int branchfirstbyte, branchreqbyte; |
|
5464 int length; |
|
5465 int orig_bracount; |
|
5466 int max_bracount; |
|
5467 branch_chain bc; |
|
5468 |
|
5469 bc.outer = bcptr; |
|
5470 bc.current = code; |
|
5471 |
|
5472 firstbyte = reqbyte = REQ_UNSET; |
|
5473 |
|
5474 /* Accumulate the length for use in the pre-compile phase. Start with the |
|
5475 length of the BRA and KET and any extra bytes that are required at the |
|
5476 beginning. We accumulate in a local variable to save frequent testing of |
|
5477 lenthptr for NULL. We cannot do this by looking at the value of code at the |
|
5478 start and end of each alternative, because compiled items are discarded during |
|
5479 the pre-compile phase so that the work space is not exceeded. */ |
|
5480 |
|
5481 length = 2 + 2*LINK_SIZE + skipbytes; |
|
5482 |
|
5483 /* WARNING: If the above line is changed for any reason, you must also change |
|
5484 the code that abstracts option settings at the start of the pattern and makes |
|
5485 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the |
|
5486 pre-compile phase to find out whether anything has yet been compiled or not. */ |
|
5487 |
|
5488 /* Offset is set zero to mark that this bracket is still open */ |
|
5489 |
|
5490 PUT(code, 1, 0); |
|
5491 code += 1 + LINK_SIZE + skipbytes; |
|
5492 |
|
5493 /* Loop for each alternative branch */ |
|
5494 |
|
5495 orig_bracount = max_bracount = cd->bracount; |
|
5496 for (;;) |
|
5497 { |
|
5498 /* For a (?| group, reset the capturing bracket count so that each branch |
|
5499 uses the same numbers. */ |
|
5500 |
|
5501 if (reset_bracount) cd->bracount = orig_bracount; |
|
5502 |
|
5503 /* Handle a change of ims options at the start of the branch */ |
|
5504 |
|
5505 if ((options & PCRE_IMS) != oldims) |
|
5506 { |
|
5507 *code++ = OP_OPT; |
|
5508 *code++ = options & PCRE_IMS; |
|
5509 length += 2; |
|
5510 } |
|
5511 |
|
5512 /* Set up dummy OP_REVERSE if lookbehind assertion */ |
|
5513 |
|
5514 if (lookbehind) |
|
5515 { |
|
5516 *code++ = OP_REVERSE; |
|
5517 reverse_count = code; |
|
5518 PUTINC(code, 0, 0); |
|
5519 length += 1 + LINK_SIZE; |
|
5520 } |
|
5521 |
|
5522 /* Now compile the branch; in the pre-compile phase its length gets added |
|
5523 into the length. */ |
|
5524 |
|
5525 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte, |
|
5526 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length)) |
|
5527 { |
|
5528 *ptrptr = ptr; |
|
5529 return FALSE; |
|
5530 } |
|
5531 |
|
5532 /* Keep the highest bracket count in case (?| was used and some branch |
|
5533 has fewer than the rest. */ |
|
5534 |
|
5535 if (cd->bracount > max_bracount) max_bracount = cd->bracount; |
|
5536 |
|
5537 /* In the real compile phase, there is some post-processing to be done. */ |
|
5538 |
|
5539 if (lengthptr == NULL) |
|
5540 { |
|
5541 /* If this is the first branch, the firstbyte and reqbyte values for the |
|
5542 branch become the values for the regex. */ |
|
5543 |
|
5544 if (*last_branch != OP_ALT) |
|
5545 { |
|
5546 firstbyte = branchfirstbyte; |
|
5547 reqbyte = branchreqbyte; |
|
5548 } |
|
5549 |
|
5550 /* If this is not the first branch, the first char and reqbyte have to |
|
5551 match the values from all the previous branches, except that if the |
|
5552 previous value for reqbyte didn't have REQ_VARY set, it can still match, |
|
5553 and we set REQ_VARY for the regex. */ |
|
5554 |
|
5555 else |
|
5556 { |
|
5557 /* If we previously had a firstbyte, but it doesn't match the new branch, |
|
5558 we have to abandon the firstbyte for the regex, but if there was |
|
5559 previously no reqbyte, it takes on the value of the old firstbyte. */ |
|
5560 |
|
5561 if (firstbyte >= 0 && firstbyte != branchfirstbyte) |
|
5562 { |
|
5563 if (reqbyte < 0) reqbyte = firstbyte; |
|
5564 firstbyte = REQ_NONE; |
|
5565 } |
|
5566 |
|
5567 /* If we (now or from before) have no firstbyte, a firstbyte from the |
|
5568 branch becomes a reqbyte if there isn't a branch reqbyte. */ |
|
5569 |
|
5570 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0) |
|
5571 branchreqbyte = branchfirstbyte; |
|
5572 |
|
5573 /* Now ensure that the reqbytes match */ |
|
5574 |
|
5575 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY)) |
|
5576 reqbyte = REQ_NONE; |
|
5577 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */ |
|
5578 } |
|
5579 |
|
5580 /* If lookbehind, check that this branch matches a fixed-length string, and |
|
5581 put the length into the OP_REVERSE item. Temporarily mark the end of the |
|
5582 branch with OP_END. */ |
|
5583 |
|
5584 if (lookbehind) |
|
5585 { |
|
5586 int fixed_length; |
|
5587 *code = OP_END; |
|
5588 fixed_length = find_fixedlength(last_branch, options); |
|
5589 DPRINTF(("fixed length = %d\n", fixed_length)); |
|
5590 if (fixed_length < 0) |
|
5591 { |
|
5592 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25; |
|
5593 *ptrptr = ptr; |
|
5594 return FALSE; |
|
5595 } |
|
5596 PUT(reverse_count, 0, fixed_length); |
|
5597 } |
|
5598 } |
|
5599 |
|
5600 /* Reached end of expression, either ')' or end of pattern. In the real |
|
5601 compile phase, go back through the alternative branches and reverse the chain |
|
5602 of offsets, with the field in the BRA item now becoming an offset to the |
|
5603 first alternative. If there are no alternatives, it points to the end of the |
|
5604 group. The length in the terminating ket is always the length of the whole |
|
5605 bracketed item. If any of the ims options were changed inside the group, |
|
5606 compile a resetting op-code following, except at the very end of the pattern. |
|
5607 Return leaving the pointer at the terminating char. */ |
|
5608 |
|
5609 if (*ptr != '|') |
|
5610 { |
|
5611 if (lengthptr == NULL) |
|
5612 { |
|
5613 int branch_length = code - last_branch; |
|
5614 do |
|
5615 { |
|
5616 int prev_length = GET(last_branch, 1); |
|
5617 PUT(last_branch, 1, branch_length); |
|
5618 branch_length = prev_length; |
|
5619 last_branch -= branch_length; |
|
5620 } |
|
5621 while (branch_length > 0); |
|
5622 } |
|
5623 |
|
5624 /* Fill in the ket */ |
|
5625 |
|
5626 *code = OP_KET; |
|
5627 PUT(code, 1, code - start_bracket); |
|
5628 code += 1 + LINK_SIZE; |
|
5629 |
|
5630 /* Resetting option if needed */ |
|
5631 |
|
5632 if ((options & PCRE_IMS) != oldims && *ptr == ')') |
|
5633 { |
|
5634 *code++ = OP_OPT; |
|
5635 *code++ = oldims; |
|
5636 length += 2; |
|
5637 } |
|
5638 |
|
5639 /* Retain the highest bracket number, in case resetting was used. */ |
|
5640 |
|
5641 cd->bracount = max_bracount; |
|
5642 |
|
5643 /* Set values to pass back */ |
|
5644 |
|
5645 *codeptr = code; |
|
5646 *ptrptr = ptr; |
|
5647 *firstbyteptr = firstbyte; |
|
5648 *reqbyteptr = reqbyte; |
|
5649 if (lengthptr != NULL) |
|
5650 { |
|
5651 if (OFLOW_MAX - *lengthptr < length) |
|
5652 { |
|
5653 *errorcodeptr = ERR20; |
|
5654 return FALSE; |
|
5655 } |
|
5656 *lengthptr += length; |
|
5657 } |
|
5658 return TRUE; |
|
5659 } |
|
5660 |
|
5661 /* Another branch follows. In the pre-compile phase, we can move the code |
|
5662 pointer back to where it was for the start of the first branch. (That is, |
|
5663 pretend that each branch is the only one.) |
|
5664 |
|
5665 In the real compile phase, insert an ALT node. Its length field points back |
|
5666 to the previous branch while the bracket remains open. At the end the chain |
|
5667 is reversed. It's done like this so that the start of the bracket has a |
|
5668 zero offset until it is closed, making it possible to detect recursion. */ |
|
5669 |
|
5670 if (lengthptr != NULL) |
|
5671 { |
|
5672 code = *codeptr + 1 + LINK_SIZE + skipbytes; |
|
5673 length += 1 + LINK_SIZE; |
|
5674 } |
|
5675 else |
|
5676 { |
|
5677 *code = OP_ALT; |
|
5678 PUT(code, 1, code - last_branch); |
|
5679 bc.current = last_branch = code; |
|
5680 code += 1 + LINK_SIZE; |
|
5681 } |
|
5682 |
|
5683 ptr++; |
|
5684 } |
|
5685 /* Control never reaches here */ |
|
5686 } |
|
5687 |
|
5688 |
|
5689 |
|
5690 |
|
5691 /************************************************* |
|
5692 * Check for anchored expression * |
|
5693 *************************************************/ |
|
5694 |
|
5695 /* Try to find out if this is an anchored regular expression. Consider each |
|
5696 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket |
|
5697 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then |
|
5698 it's anchored. However, if this is a multiline pattern, then only OP_SOD |
|
5699 counts, since OP_CIRC can match in the middle. |
|
5700 |
|
5701 We can also consider a regex to be anchored if OP_SOM starts all its branches. |
|
5702 This is the code for \G, which means "match at start of match position, taking |
|
5703 into account the match offset". |
|
5704 |
|
5705 A branch is also implicitly anchored if it starts with .* and DOTALL is set, |
|
5706 because that will try the rest of the pattern at all possible matching points, |
|
5707 so there is no point trying again.... er .... |
|
5708 |
|
5709 .... except when the .* appears inside capturing parentheses, and there is a |
|
5710 subsequent back reference to those parentheses. We haven't enough information |
|
5711 to catch that case precisely. |
|
5712 |
|
5713 At first, the best we could do was to detect when .* was in capturing brackets |
|
5714 and the highest back reference was greater than or equal to that level. |
|
5715 However, by keeping a bitmap of the first 31 back references, we can catch some |
|
5716 of the more common cases more precisely. |
|
5717 |
|
5718 Arguments: |
|
5719 code points to start of expression (the bracket) |
|
5720 options points to the options setting |
|
5721 bracket_map a bitmap of which brackets we are inside while testing; this |
|
5722 handles up to substring 31; after that we just have to take |
|
5723 the less precise approach |
|
5724 backref_map the back reference bitmap |
|
5725 |
|
5726 Returns: TRUE or FALSE |
|
5727 */ |
|
5728 |
|
5729 static BOOL |
|
5730 is_anchored(register const uschar *code, int *options, unsigned int bracket_map, |
|
5731 unsigned int backref_map) |
|
5732 { |
|
5733 do { |
|
5734 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code], |
|
5735 options, PCRE_MULTILINE, FALSE); |
|
5736 register int op = *scode; |
|
5737 |
|
5738 /* Non-capturing brackets */ |
|
5739 |
|
5740 if (op == OP_BRA) |
|
5741 { |
|
5742 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE; |
|
5743 } |
|
5744 |
|
5745 /* Capturing brackets */ |
|
5746 |
|
5747 else if (op == OP_CBRA) |
|
5748 { |
|
5749 int n = GET2(scode, 1+LINK_SIZE); |
|
5750 int new_map = bracket_map | ((n < 32)? (1 << n) : 1); |
|
5751 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE; |
|
5752 } |
|
5753 |
|
5754 /* Other brackets */ |
|
5755 |
|
5756 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND) |
|
5757 { |
|
5758 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE; |
|
5759 } |
|
5760 |
|
5761 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and |
|
5762 it isn't in brackets that are or may be referenced. */ |
|
5763 |
|
5764 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || |
|
5765 op == OP_TYPEPOSSTAR)) |
|
5766 { |
|
5767 if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0) |
|
5768 return FALSE; |
|
5769 } |
|
5770 |
|
5771 /* Check for explicit anchoring */ |
|
5772 |
|
5773 else if (op != OP_SOD && op != OP_SOM && |
|
5774 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC)) |
|
5775 return FALSE; |
|
5776 code += GET(code, 1); |
|
5777 } |
|
5778 while (*code == OP_ALT); /* Loop for each alternative */ |
|
5779 return TRUE; |
|
5780 } |
|
5781 |
|
5782 |
|
5783 |
|
5784 /************************************************* |
|
5785 * Check for starting with ^ or .* * |
|
5786 *************************************************/ |
|
5787 |
|
5788 /* This is called to find out if every branch starts with ^ or .* so that |
|
5789 "first char" processing can be done to speed things up in multiline |
|
5790 matching and for non-DOTALL patterns that start with .* (which must start at |
|
5791 the beginning or after \n). As in the case of is_anchored() (see above), we |
|
5792 have to take account of back references to capturing brackets that contain .* |
|
5793 because in that case we can't make the assumption. |
|
5794 |
|
5795 Arguments: |
|
5796 code points to start of expression (the bracket) |
|
5797 bracket_map a bitmap of which brackets we are inside while testing; this |
|
5798 handles up to substring 31; after that we just have to take |
|
5799 the less precise approach |
|
5800 backref_map the back reference bitmap |
|
5801 |
|
5802 Returns: TRUE or FALSE |
|
5803 */ |
|
5804 |
|
5805 static BOOL |
|
5806 is_startline(const uschar *code, unsigned int bracket_map, |
|
5807 unsigned int backref_map) |
|
5808 { |
|
5809 do { |
|
5810 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code], |
|
5811 NULL, 0, FALSE); |
|
5812 register int op = *scode; |
|
5813 |
|
5814 /* Non-capturing brackets */ |
|
5815 |
|
5816 if (op == OP_BRA) |
|
5817 { |
|
5818 if (!is_startline(scode, bracket_map, backref_map)) return FALSE; |
|
5819 } |
|
5820 |
|
5821 /* Capturing brackets */ |
|
5822 |
|
5823 else if (op == OP_CBRA) |
|
5824 { |
|
5825 int n = GET2(scode, 1+LINK_SIZE); |
|
5826 int new_map = bracket_map | ((n < 32)? (1 << n) : 1); |
|
5827 if (!is_startline(scode, new_map, backref_map)) return FALSE; |
|
5828 } |
|
5829 |
|
5830 /* Other brackets */ |
|
5831 |
|
5832 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND) |
|
5833 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; } |
|
5834 |
|
5835 /* .* means "start at start or after \n" if it isn't in brackets that |
|
5836 may be referenced. */ |
|
5837 |
|
5838 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR) |
|
5839 { |
|
5840 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE; |
|
5841 } |
|
5842 |
|
5843 /* Check for explicit circumflex */ |
|
5844 |
|
5845 else if (op != OP_CIRC) return FALSE; |
|
5846 |
|
5847 /* Move on to the next alternative */ |
|
5848 |
|
5849 code += GET(code, 1); |
|
5850 } |
|
5851 while (*code == OP_ALT); /* Loop for each alternative */ |
|
5852 return TRUE; |
|
5853 } |
|
5854 |
|
5855 |
|
5856 |
|
5857 /************************************************* |
|
5858 * Check for asserted fixed first char * |
|
5859 *************************************************/ |
|
5860 |
|
5861 /* During compilation, the "first char" settings from forward assertions are |
|
5862 discarded, because they can cause conflicts with actual literals that follow. |
|
5863 However, if we end up without a first char setting for an unanchored pattern, |
|
5864 it is worth scanning the regex to see if there is an initial asserted first |
|
5865 char. If all branches start with the same asserted char, or with a bracket all |
|
5866 of whose alternatives start with the same asserted char (recurse ad lib), then |
|
5867 we return that char, otherwise -1. |
|
5868 |
|
5869 Arguments: |
|
5870 code points to start of expression (the bracket) |
|
5871 options pointer to the options (used to check casing changes) |
|
5872 inassert TRUE if in an assertion |
|
5873 |
|
5874 Returns: -1 or the fixed first char |
|
5875 */ |
|
5876 |
|
5877 static int |
|
5878 find_firstassertedchar(const uschar *code, int *options, BOOL inassert) |
|
5879 { |
|
5880 register int c = -1; |
|
5881 do { |
|
5882 int d; |
|
5883 const uschar *scode = |
|
5884 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE); |
|
5885 register int op = *scode; |
|
5886 |
|
5887 switch(op) |
|
5888 { |
|
5889 default: |
|
5890 return -1; |
|
5891 |
|
5892 case OP_BRA: |
|
5893 case OP_CBRA: |
|
5894 case OP_ASSERT: |
|
5895 case OP_ONCE: |
|
5896 case OP_COND: |
|
5897 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0) |
|
5898 return -1; |
|
5899 if (c < 0) c = d; else if (c != d) return -1; |
|
5900 break; |
|
5901 |
|
5902 case OP_EXACT: /* Fall through */ |
|
5903 scode += 2; |
|
5904 |
|
5905 case OP_CHAR: |
|
5906 case OP_CHARNC: |
|
5907 case OP_PLUS: |
|
5908 case OP_MINPLUS: |
|
5909 case OP_POSPLUS: |
|
5910 if (!inassert) return -1; |
|
5911 if (c < 0) |
|
5912 { |
|
5913 c = scode[1]; |
|
5914 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS; |
|
5915 } |
|
5916 else if (c != scode[1]) return -1; |
|
5917 break; |
|
5918 } |
|
5919 |
|
5920 code += GET(code, 1); |
|
5921 } |
|
5922 while (*code == OP_ALT); |
|
5923 return c; |
|
5924 } |
|
5925 |
|
5926 |
|
5927 |
|
5928 /************************************************* |
|
5929 * Compile a Regular Expression * |
|
5930 *************************************************/ |
|
5931 |
|
5932 /* This function takes a string and returns a pointer to a block of store |
|
5933 holding a compiled version of the expression. The original API for this |
|
5934 function had no error code return variable; it is retained for backwards |
|
5935 compatibility. The new function is given a new name. |
|
5936 |
|
5937 Arguments: |
|
5938 pattern the regular expression |
|
5939 options various option bits |
|
5940 errorcodeptr pointer to error code variable (pcre_compile2() only) |
|
5941 can be NULL if you don't want a code value |
|
5942 errorptr pointer to pointer to error text |
|
5943 erroroffset ptr offset in pattern where error was detected |
|
5944 tables pointer to character tables or NULL |
|
5945 |
|
5946 Returns: pointer to compiled data block, or NULL on error, |
|
5947 with errorptr and erroroffset set |
|
5948 */ |
|
5949 |
|
5950 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION |
|
5951 pcre_compile(const char *pattern, int options, const char **errorptr, |
|
5952 int *erroroffset, const unsigned char *tables) |
|
5953 { |
|
5954 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables); |
|
5955 } |
|
5956 |
|
5957 |
|
5958 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION |
|
5959 pcre_compile2(const char *pattern, int options, int *errorcodeptr, |
|
5960 const char **errorptr, int *erroroffset, const unsigned char *tables) |
|
5961 { |
|
5962 real_pcre *re; |
|
5963 int length = 1; /* For final END opcode */ |
|
5964 int firstbyte, reqbyte, newline; |
|
5965 int errorcode = 0; |
|
5966 int skipatstart = 0; |
|
5967 #ifdef SUPPORT_UTF8 |
|
5968 BOOL utf8; |
|
5969 #endif |
|
5970 size_t size; |
|
5971 uschar *code; |
|
5972 const uschar *codestart; |
|
5973 const uschar *ptr; |
|
5974 compile_data compile_block; |
|
5975 compile_data *cd = &compile_block; |
|
5976 |
|
5977 /* This space is used for "compiling" into during the first phase, when we are |
|
5978 computing the amount of memory that is needed. Compiled items are thrown away |
|
5979 as soon as possible, so that a fairly large buffer should be sufficient for |
|
5980 this purpose. The same space is used in the second phase for remembering where |
|
5981 to fill in forward references to subpatterns. */ |
|
5982 |
|
5983 uschar cworkspace[COMPILE_WORK_SIZE]; |
|
5984 |
|
5985 /* Set this early so that early errors get offset 0. */ |
|
5986 |
|
5987 ptr = (const uschar *)pattern; |
|
5988 |
|
5989 /* We can't pass back an error message if errorptr is NULL; I guess the best we |
|
5990 can do is just return NULL, but we can set a code value if there is a code |
|
5991 pointer. */ |
|
5992 |
|
5993 if (errorptr == NULL) |
|
5994 { |
|
5995 if (errorcodeptr != NULL) *errorcodeptr = 99; |
|
5996 return NULL; |
|
5997 } |
|
5998 |
|
5999 *errorptr = NULL; |
|
6000 if (errorcodeptr != NULL) *errorcodeptr = ERR0; |
|
6001 |
|
6002 /* However, we can give a message for this error */ |
|
6003 |
|
6004 if (erroroffset == NULL) |
|
6005 { |
|
6006 errorcode = ERR16; |
|
6007 goto PCRE_EARLY_ERROR_RETURN2; |
|
6008 } |
|
6009 |
|
6010 *erroroffset = 0; |
|
6011 |
|
6012 /* Can't support UTF8 unless PCRE has been compiled to include the code. */ |
|
6013 |
|
6014 #ifdef SUPPORT_UTF8 |
|
6015 utf8 = (options & PCRE_UTF8) != 0; |
|
6016 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 && |
|
6017 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0) |
|
6018 { |
|
6019 errorcode = ERR44; |
|
6020 goto PCRE_EARLY_ERROR_RETURN2; |
|
6021 } |
|
6022 #else |
|
6023 if ((options & PCRE_UTF8) != 0) |
|
6024 { |
|
6025 errorcode = ERR32; |
|
6026 goto PCRE_EARLY_ERROR_RETURN; |
|
6027 } |
|
6028 #endif |
|
6029 |
|
6030 if ((options & ~PUBLIC_OPTIONS) != 0) |
|
6031 { |
|
6032 errorcode = ERR17; |
|
6033 goto PCRE_EARLY_ERROR_RETURN; |
|
6034 } |
|
6035 |
|
6036 /* Set up pointers to the individual character tables */ |
|
6037 |
|
6038 if (tables == NULL) tables = _pcre_default_tables; |
|
6039 cd->lcc = tables + lcc_offset; |
|
6040 cd->fcc = tables + fcc_offset; |
|
6041 cd->cbits = tables + cbits_offset; |
|
6042 cd->ctypes = tables + ctypes_offset; |
|
6043 |
|
6044 /* Check for global one-time settings at the start of the pattern, and remember |
|
6045 the offset for later. */ |
|
6046 |
|
6047 while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*') |
|
6048 { |
|
6049 int newnl = 0; |
|
6050 int newbsr = 0; |
|
6051 |
|
6052 if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0) |
|
6053 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; } |
|
6054 else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0) |
|
6055 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; } |
|
6056 else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0) |
|
6057 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; } |
|
6058 else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0) |
|
6059 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; } |
|
6060 else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0) |
|
6061 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; } |
|
6062 |
|
6063 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0) |
|
6064 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; } |
|
6065 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0) |
|
6066 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; } |
|
6067 |
|
6068 if (newnl != 0) |
|
6069 options = (options & ~PCRE_NEWLINE_BITS) | newnl; |
|
6070 else if (newbsr != 0) |
|
6071 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr; |
|
6072 else break; |
|
6073 } |
|
6074 |
|
6075 /* Check validity of \R options. */ |
|
6076 |
|
6077 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) |
|
6078 { |
|
6079 case 0: |
|
6080 case PCRE_BSR_ANYCRLF: |
|
6081 case PCRE_BSR_UNICODE: |
|
6082 break; |
|
6083 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN; |
|
6084 } |
|
6085 |
|
6086 /* Handle different types of newline. The three bits give seven cases. The |
|
6087 current code allows for fixed one- or two-byte sequences, plus "any" and |
|
6088 "anycrlf". */ |
|
6089 |
|
6090 switch (options & PCRE_NEWLINE_BITS) |
|
6091 { |
|
6092 case 0: newline = NEWLINE; break; /* Build-time default */ |
|
6093 case PCRE_NEWLINE_CR: newline = '\r'; break; |
|
6094 case PCRE_NEWLINE_LF: newline = '\n'; break; |
|
6095 case PCRE_NEWLINE_CR+ |
|
6096 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; |
|
6097 case PCRE_NEWLINE_ANY: newline = -1; break; |
|
6098 case PCRE_NEWLINE_ANYCRLF: newline = -2; break; |
|
6099 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN; |
|
6100 } |
|
6101 |
|
6102 if (newline == -2) |
|
6103 { |
|
6104 cd->nltype = NLTYPE_ANYCRLF; |
|
6105 } |
|
6106 else if (newline < 0) |
|
6107 { |
|
6108 cd->nltype = NLTYPE_ANY; |
|
6109 } |
|
6110 else |
|
6111 { |
|
6112 cd->nltype = NLTYPE_FIXED; |
|
6113 if (newline > 255) |
|
6114 { |
|
6115 cd->nllen = 2; |
|
6116 cd->nl[0] = (newline >> 8) & 255; |
|
6117 cd->nl[1] = newline & 255; |
|
6118 } |
|
6119 else |
|
6120 { |
|
6121 cd->nllen = 1; |
|
6122 cd->nl[0] = newline; |
|
6123 } |
|
6124 } |
|
6125 |
|
6126 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back |
|
6127 references to help in deciding whether (.*) can be treated as anchored or not. |
|
6128 */ |
|
6129 |
|
6130 cd->top_backref = 0; |
|
6131 cd->backref_map = 0; |
|
6132 |
|
6133 /* Reflect pattern for debugging output */ |
|
6134 |
|
6135 DPRINTF(("------------------------------------------------------------------\n")); |
|
6136 DPRINTF(("%s\n", pattern)); |
|
6137 |
|
6138 /* Pretend to compile the pattern while actually just accumulating the length |
|
6139 of memory required. This behaviour is triggered by passing a non-NULL final |
|
6140 argument to compile_regex(). We pass a block of workspace (cworkspace) for it |
|
6141 to compile parts of the pattern into; the compiled code is discarded when it is |
|
6142 no longer needed, so hopefully this workspace will never overflow, though there |
|
6143 is a test for its doing so. */ |
|
6144 |
|
6145 cd->bracount = cd->final_bracount = 0; |
|
6146 cd->names_found = 0; |
|
6147 cd->name_entry_size = 0; |
|
6148 cd->name_table = NULL; |
|
6149 cd->start_workspace = cworkspace; |
|
6150 cd->start_code = cworkspace; |
|
6151 cd->hwm = cworkspace; |
|
6152 cd->start_pattern = (const uschar *)pattern; |
|
6153 cd->end_pattern = (const uschar *)(pattern + strlen(pattern)); |
|
6154 cd->req_varyopt = 0; |
|
6155 cd->external_options = options; |
|
6156 cd->external_flags = 0; |
|
6157 |
|
6158 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we |
|
6159 don't need to look at the result of the function here. The initial options have |
|
6160 been put into the cd block so that they can be changed if an option setting is |
|
6161 found within the regex right at the beginning. Bringing initial option settings |
|
6162 outside can help speed up starting point checks. */ |
|
6163 |
|
6164 ptr += skipatstart; |
|
6165 code = cworkspace; |
|
6166 *code = OP_BRA; |
|
6167 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS, |
|
6168 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, |
|
6169 &length); |
|
6170 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN; |
|
6171 |
|
6172 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length, |
|
6173 cd->hwm - cworkspace)); |
|
6174 |
|
6175 if (length > MAX_PATTERN_SIZE) |
|
6176 { |
|
6177 errorcode = ERR20; |
|
6178 goto PCRE_EARLY_ERROR_RETURN; |
|
6179 } |
|
6180 |
|
6181 /* Compute the size of data block needed and get it, either from malloc or |
|
6182 externally provided function. Integer overflow should no longer be possible |
|
6183 because nowadays we limit the maximum value of cd->names_found and |
|
6184 cd->name_entry_size. */ |
|
6185 |
|
6186 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3); |
|
6187 re = (real_pcre *)(pcre_malloc)(size); |
|
6188 |
|
6189 if (re == NULL) |
|
6190 { |
|
6191 errorcode = ERR21; |
|
6192 goto PCRE_EARLY_ERROR_RETURN; |
|
6193 } |
|
6194 |
|
6195 /* Put in the magic number, and save the sizes, initial options, internal |
|
6196 flags, and character table pointer. NULL is used for the default character |
|
6197 tables. The nullpad field is at the end; it's there to help in the case when a |
|
6198 regex compiled on a system with 4-byte pointers is run on another with 8-byte |
|
6199 pointers. */ |
|
6200 |
|
6201 re->magic_number = MAGIC_NUMBER; |
|
6202 re->size = size; |
|
6203 re->options = cd->external_options; |
|
6204 re->flags = cd->external_flags; |
|
6205 re->dummy1 = 0; |
|
6206 re->first_byte = 0; |
|
6207 re->req_byte = 0; |
|
6208 re->name_table_offset = sizeof(real_pcre); |
|
6209 re->name_entry_size = cd->name_entry_size; |
|
6210 re->name_count = cd->names_found; |
|
6211 re->ref_count = 0; |
|
6212 re->tables = (tables == _pcre_default_tables)? NULL : tables; |
|
6213 re->nullpad = NULL; |
|
6214 |
|
6215 /* The starting points of the name/number translation table and of the code are |
|
6216 passed around in the compile data block. The start/end pattern and initial |
|
6217 options are already set from the pre-compile phase, as is the name_entry_size |
|
6218 field. Reset the bracket count and the names_found field. Also reset the hwm |
|
6219 field; this time it's used for remembering forward references to subpatterns. |
|
6220 */ |
|
6221 |
|
6222 cd->final_bracount = cd->bracount; /* Save for checking forward references */ |
|
6223 cd->bracount = 0; |
|
6224 cd->names_found = 0; |
|
6225 cd->name_table = (uschar *)re + re->name_table_offset; |
|
6226 codestart = cd->name_table + re->name_entry_size * re->name_count; |
|
6227 cd->start_code = codestart; |
|
6228 cd->hwm = cworkspace; |
|
6229 cd->req_varyopt = 0; |
|
6230 cd->had_accept = FALSE; |
|
6231 |
|
6232 /* Set up a starting, non-extracting bracket, then compile the expression. On |
|
6233 error, errorcode will be set non-zero, so we don't need to look at the result |
|
6234 of the function here. */ |
|
6235 |
|
6236 ptr = (const uschar *)pattern + skipatstart; |
|
6237 code = (uschar *)codestart; |
|
6238 *code = OP_BRA; |
|
6239 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr, |
|
6240 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL); |
|
6241 re->top_bracket = cd->bracount; |
|
6242 re->top_backref = cd->top_backref; |
|
6243 re->flags = cd->external_flags; |
|
6244 |
|
6245 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */ |
|
6246 |
|
6247 /* If not reached end of pattern on success, there's an excess bracket. */ |
|
6248 |
|
6249 if (errorcode == 0 && *ptr != 0) errorcode = ERR22; |
|
6250 |
|
6251 /* Fill in the terminating state and check for disastrous overflow, but |
|
6252 if debugging, leave the test till after things are printed out. */ |
|
6253 |
|
6254 *code++ = OP_END; |
|
6255 |
|
6256 #ifndef DEBUG |
|
6257 if (code - codestart > length) errorcode = ERR23; |
|
6258 #endif |
|
6259 |
|
6260 /* Fill in any forward references that are required. */ |
|
6261 |
|
6262 while (errorcode == 0 && cd->hwm > cworkspace) |
|
6263 { |
|
6264 int offset, recno; |
|
6265 const uschar *groupptr; |
|
6266 cd->hwm -= LINK_SIZE; |
|
6267 offset = GET(cd->hwm, 0); |
|
6268 recno = GET(codestart, offset); |
|
6269 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno); |
|
6270 if (groupptr == NULL) errorcode = ERR53; |
|
6271 else PUT(((uschar *)codestart), offset, groupptr - codestart); |
|
6272 } |
|
6273 |
|
6274 /* Give an error if there's back reference to a non-existent capturing |
|
6275 subpattern. */ |
|
6276 |
|
6277 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15; |
|
6278 |
|
6279 /* Failed to compile, or error while post-processing */ |
|
6280 |
|
6281 if (errorcode != 0) |
|
6282 { |
|
6283 (pcre_free)(re); |
|
6284 PCRE_EARLY_ERROR_RETURN: |
|
6285 *erroroffset = ptr - (const uschar *)pattern; |
|
6286 PCRE_EARLY_ERROR_RETURN2: |
|
6287 *errorptr = find_error_text(errorcode); |
|
6288 if (errorcodeptr != NULL) *errorcodeptr = errorcode; |
|
6289 return NULL; |
|
6290 } |
|
6291 |
|
6292 /* If the anchored option was not passed, set the flag if we can determine that |
|
6293 the pattern is anchored by virtue of ^ characters or \A or anything else (such |
|
6294 as starting with .* when DOTALL is set). |
|
6295 |
|
6296 Otherwise, if we know what the first byte has to be, save it, because that |
|
6297 speeds up unanchored matches no end. If not, see if we can set the |
|
6298 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches |
|
6299 start with ^. and also when all branches start with .* for non-DOTALL matches. |
|
6300 */ |
|
6301 |
|
6302 if ((re->options & PCRE_ANCHORED) == 0) |
|
6303 { |
|
6304 int temp_options = re->options; /* May get changed during these scans */ |
|
6305 if (is_anchored(codestart, &temp_options, 0, cd->backref_map)) |
|
6306 re->options |= PCRE_ANCHORED; |
|
6307 else |
|
6308 { |
|
6309 if (firstbyte < 0) |
|
6310 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE); |
|
6311 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */ |
|
6312 { |
|
6313 int ch = firstbyte & 255; |
|
6314 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 && |
|
6315 cd->fcc[ch] == ch)? ch : firstbyte; |
|
6316 re->flags |= PCRE_FIRSTSET; |
|
6317 } |
|
6318 else if (is_startline(codestart, 0, cd->backref_map)) |
|
6319 re->flags |= PCRE_STARTLINE; |
|
6320 } |
|
6321 } |
|
6322 |
|
6323 /* For an anchored pattern, we use the "required byte" only if it follows a |
|
6324 variable length item in the regex. Remove the caseless flag for non-caseable |
|
6325 bytes. */ |
|
6326 |
|
6327 if (reqbyte >= 0 && |
|
6328 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0)) |
|
6329 { |
|
6330 int ch = reqbyte & 255; |
|
6331 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 && |
|
6332 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte; |
|
6333 re->flags |= PCRE_REQCHSET; |
|
6334 } |
|
6335 |
|
6336 /* Print out the compiled data if debugging is enabled. This is never the |
|
6337 case when building a production library. */ |
|
6338 |
|
6339 #ifdef DEBUG |
|
6340 |
|
6341 printf("Length = %d top_bracket = %d top_backref = %d\n", |
|
6342 length, re->top_bracket, re->top_backref); |
|
6343 |
|
6344 printf("Options=%08x\n", re->options); |
|
6345 |
|
6346 if ((re->flags & PCRE_FIRSTSET) != 0) |
|
6347 { |
|
6348 int ch = re->first_byte & 255; |
|
6349 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? |
|
6350 "" : " (caseless)"; |
|
6351 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless); |
|
6352 else printf("First char = \\x%02x%s\n", ch, caseless); |
|
6353 } |
|
6354 |
|
6355 if ((re->flags & PCRE_REQCHSET) != 0) |
|
6356 { |
|
6357 int ch = re->req_byte & 255; |
|
6358 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? |
|
6359 "" : " (caseless)"; |
|
6360 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless); |
|
6361 else printf("Req char = \\x%02x%s\n", ch, caseless); |
|
6362 } |
|
6363 |
|
6364 pcre_printint(re, stdout, TRUE); |
|
6365 |
|
6366 /* This check is done here in the debugging case so that the code that |
|
6367 was compiled can be seen. */ |
|
6368 |
|
6369 if (code - codestart > length) |
|
6370 { |
|
6371 (pcre_free)(re); |
|
6372 *errorptr = find_error_text(ERR23); |
|
6373 *erroroffset = ptr - (uschar *)pattern; |
|
6374 if (errorcodeptr != NULL) *errorcodeptr = ERR23; |
|
6375 return NULL; |
|
6376 } |
|
6377 #endif /* DEBUG */ |
|
6378 |
|
6379 return (pcre *)re; |
|
6380 } |
|
6381 |
|
6382 /* End of pcre_compile.c */ |