|
1 /* |
|
2 * Copyright (C) 2009 Apple Inc. All rights reserved. |
|
3 * |
|
4 * Redistribution and use in source and binary forms, with or without |
|
5 * modification, are permitted provided that the following conditions |
|
6 * are met: |
|
7 * 1. Redistributions of source code must retain the above copyright |
|
8 * notice, this list of conditions and the following disclaimer. |
|
9 * 2. Redistributions in binary form must reproduce the above copyright |
|
10 * notice, this list of conditions and the following disclaimer in the |
|
11 * documentation and/or other materials provided with the distribution. |
|
12 * |
|
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
|
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
|
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
|
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
|
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
|
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
|
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
|
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
24 */ |
|
25 |
|
26 #include "config.h" |
|
27 #include "RegexCompiler.h" |
|
28 |
|
29 #include "RegexInterpreter.h" |
|
30 #include "RegexPattern.h" |
|
31 #include <wtf/Vector.h> |
|
32 |
|
33 #if ENABLE(YARR) |
|
34 |
|
35 using namespace WTF; |
|
36 |
|
37 namespace JSC { namespace Yarr { |
|
38 |
|
39 #include "RegExpJitTables.h" |
|
40 |
|
41 class CharacterClassConstructor { |
|
42 public: |
|
43 CharacterClassConstructor(bool isCaseInsensitive = false) |
|
44 : m_isCaseInsensitive(isCaseInsensitive) |
|
45 { |
|
46 } |
|
47 |
|
48 void reset() |
|
49 { |
|
50 m_matches.clear(); |
|
51 m_ranges.clear(); |
|
52 m_matchesUnicode.clear(); |
|
53 m_rangesUnicode.clear(); |
|
54 } |
|
55 |
|
56 void append(const CharacterClass* other) |
|
57 { |
|
58 for (size_t i = 0; i < other->m_matches.size(); ++i) |
|
59 addSorted(m_matches, other->m_matches[i]); |
|
60 for (size_t i = 0; i < other->m_ranges.size(); ++i) |
|
61 addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end); |
|
62 for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i) |
|
63 addSorted(m_matchesUnicode, other->m_matchesUnicode[i]); |
|
64 for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i) |
|
65 addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end); |
|
66 } |
|
67 |
|
68 void putChar(UChar ch) |
|
69 { |
|
70 if (ch <= 0x7f) { |
|
71 if (m_isCaseInsensitive && isASCIIAlpha(ch)) { |
|
72 addSorted(m_matches, toASCIIUpper(ch)); |
|
73 addSorted(m_matches, toASCIILower(ch)); |
|
74 } else |
|
75 addSorted(m_matches, ch); |
|
76 } else { |
|
77 UChar upper, lower; |
|
78 if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) { |
|
79 addSorted(m_matchesUnicode, upper); |
|
80 addSorted(m_matchesUnicode, lower); |
|
81 } else |
|
82 addSorted(m_matchesUnicode, ch); |
|
83 } |
|
84 } |
|
85 |
|
86 // returns true if this character has another case, and 'ch' is the upper case form. |
|
87 static inline bool isUnicodeUpper(UChar ch) |
|
88 { |
|
89 return ch != Unicode::toLower(ch); |
|
90 } |
|
91 |
|
92 // returns true if this character has another case, and 'ch' is the lower case form. |
|
93 static inline bool isUnicodeLower(UChar ch) |
|
94 { |
|
95 return ch != Unicode::toUpper(ch); |
|
96 } |
|
97 |
|
98 void putRange(UChar lo, UChar hi) |
|
99 { |
|
100 if (lo <= 0x7f) { |
|
101 char asciiLo = lo; |
|
102 char asciiHi = std::min(hi, (UChar)0x7f); |
|
103 addSortedRange(m_ranges, lo, asciiHi); |
|
104 |
|
105 if (m_isCaseInsensitive) { |
|
106 if ((asciiLo <= 'Z') && (asciiHi >= 'A')) |
|
107 addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A')); |
|
108 if ((asciiLo <= 'z') && (asciiHi >= 'a')) |
|
109 addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); |
|
110 } |
|
111 } |
|
112 if (hi >= 0x80) { |
|
113 uint32_t unicodeCurr = std::max(lo, (UChar)0x80); |
|
114 addSortedRange(m_rangesUnicode, unicodeCurr, hi); |
|
115 |
|
116 if (m_isCaseInsensitive) { |
|
117 while (unicodeCurr <= hi) { |
|
118 // If the upper bound of the range (hi) is 0xffff, the increments to |
|
119 // unicodeCurr in this loop may take it to 0x10000. This is fine |
|
120 // (if so we won't re-enter the loop, since the loop condition above |
|
121 // will definitely fail) - but this does mean we cannot use a UChar |
|
122 // to represent unicodeCurr, we must use a 32-bit value instead. |
|
123 ASSERT(unicodeCurr <= 0xffff); |
|
124 |
|
125 if (isUnicodeUpper(unicodeCurr)) { |
|
126 UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr); |
|
127 UChar lowerCaseRangeEnd = lowerCaseRangeBegin; |
|
128 while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1))) |
|
129 lowerCaseRangeEnd++; |
|
130 addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd); |
|
131 } else if (isUnicodeLower(unicodeCurr)) { |
|
132 UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr); |
|
133 UChar upperCaseRangeEnd = upperCaseRangeBegin; |
|
134 while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1))) |
|
135 upperCaseRangeEnd++; |
|
136 addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd); |
|
137 } else |
|
138 ++unicodeCurr; |
|
139 } |
|
140 } |
|
141 } |
|
142 } |
|
143 |
|
144 CharacterClass* charClass() |
|
145 { |
|
146 CharacterClass* characterClass = new CharacterClass(0); |
|
147 |
|
148 characterClass->m_matches.append(m_matches); |
|
149 characterClass->m_ranges.append(m_ranges); |
|
150 characterClass->m_matchesUnicode.append(m_matchesUnicode); |
|
151 characterClass->m_rangesUnicode.append(m_rangesUnicode); |
|
152 |
|
153 reset(); |
|
154 |
|
155 return characterClass; |
|
156 } |
|
157 |
|
158 private: |
|
159 void addSorted(Vector<UChar>& matches, UChar ch) |
|
160 { |
|
161 unsigned pos = 0; |
|
162 unsigned range = matches.size(); |
|
163 |
|
164 // binary chop, find position to insert char. |
|
165 while (range) { |
|
166 unsigned index = range >> 1; |
|
167 |
|
168 int val = matches[pos+index] - ch; |
|
169 if (!val) |
|
170 return; |
|
171 else if (val > 0) |
|
172 range = index; |
|
173 else { |
|
174 pos += (index+1); |
|
175 range -= (index+1); |
|
176 } |
|
177 } |
|
178 |
|
179 if (pos == matches.size()) |
|
180 matches.append(ch); |
|
181 else |
|
182 matches.insert(pos, ch); |
|
183 } |
|
184 |
|
185 void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi) |
|
186 { |
|
187 unsigned end = ranges.size(); |
|
188 |
|
189 // Simple linear scan - I doubt there are that many ranges anyway... |
|
190 // feel free to fix this with something faster (eg binary chop). |
|
191 for (unsigned i = 0; i < end; ++i) { |
|
192 // does the new range fall before the current position in the array |
|
193 if (hi < ranges[i].begin) { |
|
194 // optional optimization: concatenate appending ranges? - may not be worthwhile. |
|
195 if (hi == (ranges[i].begin - 1)) { |
|
196 ranges[i].begin = lo; |
|
197 return; |
|
198 } |
|
199 ranges.insert(i, CharacterRange(lo, hi)); |
|
200 return; |
|
201 } |
|
202 // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining |
|
203 // If the new range start at or before the end of the last range, then the overlap (if it starts one after the |
|
204 // end of the last range they concatenate, which is just as good. |
|
205 if (lo <= (ranges[i].end + 1)) { |
|
206 // found an intersect! we'll replace this entry in the array. |
|
207 ranges[i].begin = std::min(ranges[i].begin, lo); |
|
208 ranges[i].end = std::max(ranges[i].end, hi); |
|
209 |
|
210 // now check if the new range can subsume any subsequent ranges. |
|
211 unsigned next = i+1; |
|
212 // each iteration of the loop we will either remove something from the list, or break the loop. |
|
213 while (next < ranges.size()) { |
|
214 if (ranges[next].begin <= (ranges[i].end + 1)) { |
|
215 // the next entry now overlaps / concatenates this one. |
|
216 ranges[i].end = std::max(ranges[i].end, ranges[next].end); |
|
217 ranges.remove(next); |
|
218 } else |
|
219 break; |
|
220 } |
|
221 |
|
222 return; |
|
223 } |
|
224 } |
|
225 |
|
226 // CharacterRange comes after all existing ranges. |
|
227 ranges.append(CharacterRange(lo, hi)); |
|
228 } |
|
229 |
|
230 bool m_isCaseInsensitive; |
|
231 |
|
232 Vector<UChar> m_matches; |
|
233 Vector<CharacterRange> m_ranges; |
|
234 Vector<UChar> m_matchesUnicode; |
|
235 Vector<CharacterRange> m_rangesUnicode; |
|
236 }; |
|
237 |
|
238 class RegexPatternConstructor { |
|
239 public: |
|
240 RegexPatternConstructor(RegexPattern& pattern) |
|
241 : m_pattern(pattern) |
|
242 , m_characterClassConstructor(pattern.m_ignoreCase) |
|
243 { |
|
244 } |
|
245 |
|
246 ~RegexPatternConstructor() |
|
247 { |
|
248 } |
|
249 |
|
250 void reset() |
|
251 { |
|
252 m_pattern.reset(); |
|
253 m_characterClassConstructor.reset(); |
|
254 } |
|
255 |
|
256 void assertionBOL() |
|
257 { |
|
258 m_alternative->m_terms.append(PatternTerm::BOL()); |
|
259 } |
|
260 void assertionEOL() |
|
261 { |
|
262 m_alternative->m_terms.append(PatternTerm::EOL()); |
|
263 } |
|
264 void assertionWordBoundary(bool invert) |
|
265 { |
|
266 m_alternative->m_terms.append(PatternTerm::WordBoundary(invert)); |
|
267 } |
|
268 |
|
269 void atomPatternCharacter(UChar ch) |
|
270 { |
|
271 // We handle case-insensitive checking of unicode characters which do have both |
|
272 // cases by handling them as if they were defined using a CharacterClass. |
|
273 if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) { |
|
274 atomCharacterClassBegin(); |
|
275 atomCharacterClassAtom(ch); |
|
276 atomCharacterClassEnd(); |
|
277 } else |
|
278 m_alternative->m_terms.append(PatternTerm(ch)); |
|
279 } |
|
280 |
|
281 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) |
|
282 { |
|
283 switch (classID) { |
|
284 case DigitClassID: |
|
285 m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert)); |
|
286 break; |
|
287 case SpaceClassID: |
|
288 m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert)); |
|
289 break; |
|
290 case WordClassID: |
|
291 m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert)); |
|
292 break; |
|
293 case NewlineClassID: |
|
294 m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert)); |
|
295 break; |
|
296 } |
|
297 } |
|
298 |
|
299 void atomCharacterClassBegin(bool invert = false) |
|
300 { |
|
301 m_invertCharacterClass = invert; |
|
302 } |
|
303 |
|
304 void atomCharacterClassAtom(UChar ch) |
|
305 { |
|
306 m_characterClassConstructor.putChar(ch); |
|
307 } |
|
308 |
|
309 void atomCharacterClassRange(UChar begin, UChar end) |
|
310 { |
|
311 m_characterClassConstructor.putRange(begin, end); |
|
312 } |
|
313 |
|
314 void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) |
|
315 { |
|
316 ASSERT(classID != NewlineClassID); |
|
317 |
|
318 switch (classID) { |
|
319 case DigitClassID: |
|
320 m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass()); |
|
321 break; |
|
322 |
|
323 case SpaceClassID: |
|
324 m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass()); |
|
325 break; |
|
326 |
|
327 case WordClassID: |
|
328 m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); |
|
329 break; |
|
330 |
|
331 default: |
|
332 ASSERT_NOT_REACHED(); |
|
333 } |
|
334 } |
|
335 |
|
336 void atomCharacterClassEnd() |
|
337 { |
|
338 CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); |
|
339 m_pattern.m_userCharacterClasses.append(newCharacterClass); |
|
340 m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass)); |
|
341 } |
|
342 |
|
343 void atomParenthesesSubpatternBegin(bool capture = true) |
|
344 { |
|
345 unsigned subpatternId = m_pattern.m_numSubpatterns + 1; |
|
346 if (capture) |
|
347 m_pattern.m_numSubpatterns++; |
|
348 |
|
349 PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); |
|
350 m_pattern.m_disjunctions.append(parenthesesDisjunction); |
|
351 m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture)); |
|
352 m_alternative = parenthesesDisjunction->addNewAlternative(); |
|
353 } |
|
354 |
|
355 void atomParentheticalAssertionBegin(bool invert = false) |
|
356 { |
|
357 PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); |
|
358 m_pattern.m_disjunctions.append(parenthesesDisjunction); |
|
359 m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, invert)); |
|
360 m_alternative = parenthesesDisjunction->addNewAlternative(); |
|
361 } |
|
362 |
|
363 void atomParenthesesEnd() |
|
364 { |
|
365 ASSERT(m_alternative->m_parent); |
|
366 ASSERT(m_alternative->m_parent->m_parent); |
|
367 m_alternative = m_alternative->m_parent->m_parent; |
|
368 |
|
369 m_alternative->lastTerm().parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; |
|
370 } |
|
371 |
|
372 void atomBackReference(unsigned subpatternId) |
|
373 { |
|
374 ASSERT(subpatternId); |
|
375 m_pattern.m_containsBackreferences = true; |
|
376 m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId); |
|
377 |
|
378 if (subpatternId > m_pattern.m_numSubpatterns) { |
|
379 m_alternative->m_terms.append(PatternTerm::ForwardReference()); |
|
380 return; |
|
381 } |
|
382 |
|
383 PatternAlternative* currentAlternative = m_alternative; |
|
384 ASSERT(currentAlternative); |
|
385 |
|
386 // Note to self: if we waited until the AST was baked, we could also remove forwards refs |
|
387 while ((currentAlternative = currentAlternative->m_parent->m_parent)) { |
|
388 PatternTerm& term = currentAlternative->lastTerm(); |
|
389 ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion)); |
|
390 |
|
391 if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.subpatternId)) { |
|
392 m_alternative->m_terms.append(PatternTerm::ForwardReference()); |
|
393 return; |
|
394 } |
|
395 } |
|
396 |
|
397 m_alternative->m_terms.append(PatternTerm(subpatternId)); |
|
398 } |
|
399 |
|
400 PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction) |
|
401 { |
|
402 PatternDisjunction* newDisjunction = new PatternDisjunction(); |
|
403 |
|
404 newDisjunction->m_parent = disjunction->m_parent; |
|
405 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { |
|
406 PatternAlternative* alternative = disjunction->m_alternatives[alt]; |
|
407 PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); |
|
408 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) |
|
409 newAlternative->m_terms.append(copyTerm(alternative->m_terms[i])); |
|
410 } |
|
411 |
|
412 m_pattern.m_disjunctions.append(newDisjunction); |
|
413 return newDisjunction; |
|
414 } |
|
415 |
|
416 PatternTerm copyTerm(PatternTerm& term) |
|
417 { |
|
418 if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion)) |
|
419 return PatternTerm(term); |
|
420 |
|
421 PatternTerm termCopy = term; |
|
422 termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction); |
|
423 return termCopy; |
|
424 } |
|
425 |
|
426 void quantifyAtom(unsigned min, unsigned max, bool greedy) |
|
427 { |
|
428 ASSERT(min <= max); |
|
429 ASSERT(m_alternative->m_terms.size()); |
|
430 |
|
431 if (!max) { |
|
432 m_alternative->removeLastTerm(); |
|
433 return; |
|
434 } |
|
435 |
|
436 PatternTerm& term = m_alternative->lastTerm(); |
|
437 ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); |
|
438 ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); |
|
439 |
|
440 // For any assertion with a zero minimum, not matching is valid and has no effect, |
|
441 // remove it. Otherwise, we need to match as least once, but there is no point |
|
442 // matching more than once, so remove the quantifier. It is not entirely clear |
|
443 // from the spec whether or not this behavior is correct, but I believe this |
|
444 // matches Firefox. :-/ |
|
445 if (term.type == PatternTerm::TypeParentheticalAssertion) { |
|
446 if (!min) |
|
447 m_alternative->removeLastTerm(); |
|
448 return; |
|
449 } |
|
450 |
|
451 if (min == 0) |
|
452 term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy); |
|
453 else if (min == max) |
|
454 term.quantify(min, QuantifierFixedCount); |
|
455 else { |
|
456 term.quantify(min, QuantifierFixedCount); |
|
457 m_alternative->m_terms.append(copyTerm(term)); |
|
458 // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... |
|
459 m_alternative->lastTerm().quantify((max == UINT_MAX) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); |
|
460 if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern) |
|
461 m_alternative->lastTerm().parentheses.isCopy = true; |
|
462 } |
|
463 } |
|
464 |
|
465 void disjunction() |
|
466 { |
|
467 m_alternative = m_alternative->m_parent->addNewAlternative(); |
|
468 } |
|
469 |
|
470 void regexBegin() |
|
471 { |
|
472 m_pattern.m_body = new PatternDisjunction(); |
|
473 m_alternative = m_pattern.m_body->addNewAlternative(); |
|
474 m_pattern.m_disjunctions.append(m_pattern.m_body); |
|
475 } |
|
476 void regexEnd() |
|
477 { |
|
478 } |
|
479 void regexError() |
|
480 { |
|
481 } |
|
482 |
|
483 unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition) |
|
484 { |
|
485 alternative->m_hasFixedSize = true; |
|
486 unsigned currentInputPosition = initialInputPosition; |
|
487 |
|
488 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { |
|
489 PatternTerm& term = alternative->m_terms[i]; |
|
490 |
|
491 switch (term.type) { |
|
492 case PatternTerm::TypeAssertionBOL: |
|
493 case PatternTerm::TypeAssertionEOL: |
|
494 case PatternTerm::TypeAssertionWordBoundary: |
|
495 term.inputPosition = currentInputPosition; |
|
496 break; |
|
497 |
|
498 case PatternTerm::TypeBackReference: |
|
499 term.inputPosition = currentInputPosition; |
|
500 term.frameLocation = currentCallFrameSize; |
|
501 currentCallFrameSize += RegexStackSpaceForBackTrackInfoBackReference; |
|
502 alternative->m_hasFixedSize = false; |
|
503 break; |
|
504 |
|
505 case PatternTerm::TypeForwardReference: |
|
506 break; |
|
507 |
|
508 case PatternTerm::TypePatternCharacter: |
|
509 term.inputPosition = currentInputPosition; |
|
510 if (term.quantityType != QuantifierFixedCount) { |
|
511 term.frameLocation = currentCallFrameSize; |
|
512 currentCallFrameSize += RegexStackSpaceForBackTrackInfoPatternCharacter; |
|
513 alternative->m_hasFixedSize = false; |
|
514 } else |
|
515 currentInputPosition += term.quantityCount; |
|
516 break; |
|
517 |
|
518 case PatternTerm::TypeCharacterClass: |
|
519 term.inputPosition = currentInputPosition; |
|
520 if (term.quantityType != QuantifierFixedCount) { |
|
521 term.frameLocation = currentCallFrameSize; |
|
522 currentCallFrameSize += RegexStackSpaceForBackTrackInfoCharacterClass; |
|
523 alternative->m_hasFixedSize = false; |
|
524 } else |
|
525 currentInputPosition += term.quantityCount; |
|
526 break; |
|
527 |
|
528 case PatternTerm::TypeParenthesesSubpattern: |
|
529 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. |
|
530 term.frameLocation = currentCallFrameSize; |
|
531 if ((term.quantityCount == 1) && !term.parentheses.isCopy) { |
|
532 if (term.quantityType == QuantifierFixedCount) { |
|
533 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); |
|
534 currentInputPosition += term.parentheses.disjunction->m_minimumSize; |
|
535 } else { |
|
536 currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesOnce; |
|
537 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition); |
|
538 } |
|
539 term.inputPosition = currentInputPosition; |
|
540 } else { |
|
541 term.inputPosition = currentInputPosition; |
|
542 setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition); |
|
543 currentCallFrameSize += RegexStackSpaceForBackTrackInfoParentheses; |
|
544 } |
|
545 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. |
|
546 alternative->m_hasFixedSize = false; |
|
547 break; |
|
548 |
|
549 case PatternTerm::TypeParentheticalAssertion: |
|
550 term.inputPosition = currentInputPosition; |
|
551 term.frameLocation = currentCallFrameSize; |
|
552 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + RegexStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition); |
|
553 break; |
|
554 } |
|
555 } |
|
556 |
|
557 alternative->m_minimumSize = currentInputPosition - initialInputPosition; |
|
558 return currentCallFrameSize; |
|
559 } |
|
560 |
|
561 unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition) |
|
562 { |
|
563 if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) |
|
564 initialCallFrameSize += RegexStackSpaceForBackTrackInfoAlternative; |
|
565 |
|
566 unsigned minimumInputSize = UINT_MAX; |
|
567 unsigned maximumCallFrameSize = 0; |
|
568 bool hasFixedSize = true; |
|
569 |
|
570 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { |
|
571 PatternAlternative* alternative = disjunction->m_alternatives[alt]; |
|
572 unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition); |
|
573 minimumInputSize = min(minimumInputSize, alternative->m_minimumSize); |
|
574 maximumCallFrameSize = max(maximumCallFrameSize, currentAlternativeCallFrameSize); |
|
575 hasFixedSize &= alternative->m_hasFixedSize; |
|
576 } |
|
577 |
|
578 ASSERT(minimumInputSize != UINT_MAX); |
|
579 ASSERT(maximumCallFrameSize >= initialCallFrameSize); |
|
580 |
|
581 disjunction->m_hasFixedSize = hasFixedSize; |
|
582 disjunction->m_minimumSize = minimumInputSize; |
|
583 disjunction->m_callFrameSize = maximumCallFrameSize; |
|
584 return maximumCallFrameSize; |
|
585 } |
|
586 |
|
587 void setupOffsets() |
|
588 { |
|
589 setupDisjunctionOffsets(m_pattern.m_body, 0, 0); |
|
590 } |
|
591 |
|
592 private: |
|
593 RegexPattern& m_pattern; |
|
594 PatternAlternative* m_alternative; |
|
595 CharacterClassConstructor m_characterClassConstructor; |
|
596 bool m_invertCharacterClass; |
|
597 }; |
|
598 |
|
599 |
|
600 const char* compileRegex(const UString& patternString, RegexPattern& pattern) |
|
601 { |
|
602 RegexPatternConstructor constructor(pattern); |
|
603 |
|
604 if (const char* error = parse(constructor, patternString)) |
|
605 return error; |
|
606 |
|
607 // If the pattern contains illegal backreferences reset & reparse. |
|
608 // Quoting Netscape's "What's new in JavaScript 1.2", |
|
609 // "Note: if the number of left parentheses is less than the number specified |
|
610 // in \#, the \# is taken as an octal escape as described in the next row." |
|
611 if (pattern.containsIllegalBackReference()) { |
|
612 unsigned numSubpatterns = pattern.m_numSubpatterns; |
|
613 |
|
614 constructor.reset(); |
|
615 #if !ASSERT_DISABLED |
|
616 const char* error = |
|
617 #endif |
|
618 parse(constructor, patternString, numSubpatterns); |
|
619 |
|
620 ASSERT(!error); |
|
621 ASSERT(numSubpatterns == pattern.m_numSubpatterns); |
|
622 } |
|
623 |
|
624 constructor.setupOffsets(); |
|
625 |
|
626 return 0; |
|
627 }; |
|
628 |
|
629 |
|
630 } } |
|
631 |
|
632 #endif |