--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/JavaScriptCore/yarr/RegexInterpreter.cpp Fri Sep 17 09:02:29 2010 +0300
@@ -0,0 +1,1606 @@
+/*
+ * Copyright (C) 2009 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "RegexInterpreter.h"
+
+#include "RegexCompiler.h"
+#include "RegexPattern.h"
+
+#ifndef NDEBUG
+#include <stdio.h>
+#endif
+
+#if ENABLE(YARR)
+
+using namespace WTF;
+
+namespace JSC { namespace Yarr {
+
+class Interpreter {
+public:
+ struct ParenthesesDisjunctionContext;
+
+ struct BackTrackInfoPatternCharacter {
+ uintptr_t matchAmount;
+ };
+ struct BackTrackInfoCharacterClass {
+ uintptr_t matchAmount;
+ };
+ struct BackTrackInfoBackReference {
+ uintptr_t begin; // Not really needed for greedy quantifiers.
+ uintptr_t matchAmount; // Not really needed for fixed quantifiers.
+ };
+ struct BackTrackInfoAlternative {
+ uintptr_t offset;
+ };
+ struct BackTrackInfoParentheticalAssertion {
+ uintptr_t begin;
+ };
+ struct BackTrackInfoParenthesesOnce {
+ uintptr_t inParentheses;
+ };
+ struct BackTrackInfoParentheses {
+ uintptr_t matchAmount;
+ ParenthesesDisjunctionContext* lastContext;
+ uintptr_t prevBegin;
+ uintptr_t prevEnd;
+ };
+
+ static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
+ {
+ context->next = backTrack->lastContext;
+ backTrack->lastContext = context;
+ ++backTrack->matchAmount;
+ }
+
+ static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
+ {
+ ASSERT(backTrack->matchAmount);
+ ASSERT(backTrack->lastContext);
+ backTrack->lastContext = backTrack->lastContext->next;
+ --backTrack->matchAmount;
+ }
+
+ struct DisjunctionContext
+ {
+ DisjunctionContext()
+ : term(0)
+ {
+ }
+
+ void* operator new(size_t, void* where)
+ {
+ return where;
+ }
+
+ int term;
+ unsigned matchBegin;
+ unsigned matchEnd;
+ uintptr_t frame[1];
+ };
+
+ DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
+ {
+ return new(malloc(sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) DisjunctionContext();
+ }
+
+ void freeDisjunctionContext(DisjunctionContext* context)
+ {
+ free(context);
+ }
+
+ struct ParenthesesDisjunctionContext
+ {
+ ParenthesesDisjunctionContext(int* output, ByteTerm& term)
+ : next(0)
+ {
+ unsigned firstSubpatternId = term.atom.subpatternId;
+ unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
+
+ for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
+ subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
+ output[(firstSubpatternId << 1) + i] = -1;
+ }
+
+ new(getDisjunctionContext(term)) DisjunctionContext();
+ }
+
+ void* operator new(size_t, void* where)
+ {
+ return where;
+ }
+
+ void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
+ {
+ for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
+ output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
+ }
+
+ DisjunctionContext* getDisjunctionContext(ByteTerm& term)
+ {
+ return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1]));
+ }
+
+ ParenthesesDisjunctionContext* next;
+ int subpatternBackup[1];
+ };
+
+ ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term)
+ {
+ return new(malloc(sizeof(ParenthesesDisjunctionContext) + (((term.atom.parenthesesDisjunction->m_numSubpatterns << 1) - 1) * sizeof(int)) + sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) ParenthesesDisjunctionContext(output, term);
+ }
+
+ void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
+ {
+ free(context);
+ }
+
+ class InputStream {
+ public:
+ InputStream(const UChar* input, unsigned start, unsigned length)
+ : input(input)
+ , pos(start)
+ , length(length)
+ {
+ }
+
+ void next()
+ {
+ ++pos;
+ }
+
+ void rewind(unsigned amount)
+ {
+ ASSERT(pos >= amount);
+ pos -= amount;
+ }
+
+ int read()
+ {
+ ASSERT(pos < length);
+ if (pos < length)
+ return input[pos];
+ return -1;
+ }
+
+ int readChecked(int position)
+ {
+ ASSERT(position < 0);
+ ASSERT((unsigned)-position <= pos);
+ unsigned p = pos + position;
+ ASSERT(p < length);
+ return input[p];
+ }
+
+ int reread(unsigned from)
+ {
+ ASSERT(from < length);
+ return input[from];
+ }
+
+ int prev()
+ {
+ ASSERT(!(pos > length));
+ if (pos && length)
+ return input[pos - 1];
+ return -1;
+ }
+
+ unsigned getPos()
+ {
+ return pos;
+ }
+
+ void setPos(unsigned p)
+ {
+ pos = p;
+ }
+
+ bool atStart()
+ {
+ return pos == 0;
+ }
+
+ bool atEnd()
+ {
+ return pos == length;
+ }
+
+ bool checkInput(int count)
+ {
+ if ((pos + count) <= length) {
+ pos += count;
+ return true;
+ } else
+ return false;
+ }
+
+ void uncheckInput(int count)
+ {
+ pos -= count;
+ }
+
+ bool atStart(int position)
+ {
+ return (pos + position) == 0;
+ }
+
+ bool atEnd(int position)
+ {
+ return (pos + position) == length;
+ }
+
+ private:
+ const UChar* input;
+ unsigned pos;
+ unsigned length;
+ };
+
+ bool testCharacterClass(CharacterClass* characterClass, int ch)
+ {
+ if (ch & 0xFF80) {
+ for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i)
+ if (ch == characterClass->m_matchesUnicode[i])
+ return true;
+ for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i)
+ if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end))
+ return true;
+ } else {
+ for (unsigned i = 0; i < characterClass->m_matches.size(); ++i)
+ if (ch == characterClass->m_matches[i])
+ return true;
+ for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i)
+ if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end))
+ return true;
+ }
+
+ return false;
+ }
+
+ bool checkCharacter(int testChar, int inputPosition)
+ {
+ return testChar == input.readChecked(inputPosition);
+ }
+
+ bool checkCasedCharacter(int loChar, int hiChar, int inputPosition)
+ {
+ int ch = input.readChecked(inputPosition);
+ return (loChar == ch) || (hiChar == ch);
+ }
+
+ bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition)
+ {
+ bool match = testCharacterClass(characterClass, input.readChecked(inputPosition));
+ return invert ? !match : match;
+ }
+
+ bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset)
+ {
+ int matchSize = matchEnd - matchBegin;
+
+ if (!input.checkInput(matchSize))
+ return false;
+
+ for (int i = 0; i < matchSize; ++i) {
+ if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) {
+ input.uncheckInput(matchSize);
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ bool matchAssertionBOL(ByteTerm& term)
+ {
+ return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1)));
+ }
+
+ bool matchAssertionEOL(ByteTerm& term)
+ {
+ if (term.inputPosition)
+ return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
+ else
+ return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read()));
+ }
+
+ bool matchAssertionWordBoundary(ByteTerm& term)
+ {
+ bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1));
+ bool readIsWordchar;
+ if (term.inputPosition)
+ readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
+ else
+ readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
+
+ bool wordBoundary = prevIsWordchar != readIsWordchar;
+ return term.invert() ? !wordBoundary : wordBoundary;
+ }
+
+ bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
+ {
+ BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
+
+ switch (term.atom.quantityType) {
+ case QuantifierFixedCount:
+ break;
+
+ case QuantifierGreedy:
+ if (backTrack->matchAmount) {
+ --backTrack->matchAmount;
+ input.uncheckInput(1);
+ return true;
+ }
+ break;
+
+ case QuantifierNonGreedy:
+ if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
+ ++backTrack->matchAmount;
+ if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1))
+ return true;
+ }
+ input.uncheckInput(backTrack->matchAmount);
+ break;
+ }
+
+ return false;
+ }
+
+ bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
+ {
+ BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
+
+ switch (term.atom.quantityType) {
+ case QuantifierFixedCount:
+ break;
+
+ case QuantifierGreedy:
+ if (backTrack->matchAmount) {
+ --backTrack->matchAmount;
+ input.uncheckInput(1);
+ return true;
+ }
+ break;
+
+ case QuantifierNonGreedy:
+ if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
+ ++backTrack->matchAmount;
+ if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1))
+ return true;
+ }
+ input.uncheckInput(backTrack->matchAmount);
+ break;
+ }
+
+ return false;
+ }
+
+ bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeCharacterClass);
+ BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
+
+ switch (term.atom.quantityType) {
+ case QuantifierFixedCount: {
+ for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
+ if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount))
+ return false;
+ }
+ return true;
+ }
+
+ case QuantifierGreedy: {
+ unsigned matchAmount = 0;
+ while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
+ if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) {
+ input.uncheckInput(1);
+ break;
+ }
+ ++matchAmount;
+ }
+ backTrack->matchAmount = matchAmount;
+
+ return true;
+ }
+
+ case QuantifierNonGreedy:
+ backTrack->matchAmount = 0;
+ return true;
+ }
+
+ ASSERT_NOT_REACHED();
+ return false;
+ }
+
+ bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeCharacterClass);
+ BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
+
+ switch (term.atom.quantityType) {
+ case QuantifierFixedCount:
+ break;
+
+ case QuantifierGreedy:
+ if (backTrack->matchAmount) {
+ --backTrack->matchAmount;
+ input.uncheckInput(1);
+ return true;
+ }
+ break;
+
+ case QuantifierNonGreedy:
+ if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
+ ++backTrack->matchAmount;
+ if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1))
+ return true;
+ }
+ input.uncheckInput(backTrack->matchAmount);
+ break;
+ }
+
+ return false;
+ }
+
+ bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeBackReference);
+ BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
+
+ int matchBegin = output[(term.atom.subpatternId << 1)];
+ int matchEnd = output[(term.atom.subpatternId << 1) + 1];
+ ASSERT((matchBegin == -1) == (matchEnd == -1));
+ ASSERT(matchBegin <= matchEnd);
+
+ if (matchBegin == matchEnd)
+ return true;
+
+ switch (term.atom.quantityType) {
+ case QuantifierFixedCount: {
+ backTrack->begin = input.getPos();
+ for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
+ if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
+ input.setPos(backTrack->begin);
+ return false;
+ }
+ }
+ return true;
+ }
+
+ case QuantifierGreedy: {
+ unsigned matchAmount = 0;
+ while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
+ ++matchAmount;
+ backTrack->matchAmount = matchAmount;
+ return true;
+ }
+
+ case QuantifierNonGreedy:
+ backTrack->begin = input.getPos();
+ backTrack->matchAmount = 0;
+ return true;
+ }
+
+ ASSERT_NOT_REACHED();
+ return false;
+ }
+
+ bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeBackReference);
+ BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
+
+ int matchBegin = output[(term.atom.subpatternId << 1)];
+ int matchEnd = output[(term.atom.subpatternId << 1) + 1];
+ ASSERT((matchBegin == -1) == (matchEnd == -1));
+ ASSERT(matchBegin <= matchEnd);
+
+ if (matchBegin == matchEnd)
+ return false;
+
+ switch (term.atom.quantityType) {
+ case QuantifierFixedCount:
+ // for quantityCount == 1, could rewind.
+ input.setPos(backTrack->begin);
+ break;
+
+ case QuantifierGreedy:
+ if (backTrack->matchAmount) {
+ --backTrack->matchAmount;
+ input.rewind(matchEnd - matchBegin);
+ return true;
+ }
+ break;
+
+ case QuantifierNonGreedy:
+ if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
+ ++backTrack->matchAmount;
+ return true;
+ } else
+ input.setPos(backTrack->begin);
+ break;
+ }
+
+ return false;
+ }
+
+ void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
+ {
+ if (term.capture()) {
+ unsigned subpatternId = term.atom.subpatternId;
+ output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition;
+ output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition;
+ }
+ }
+ void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
+ {
+ unsigned firstSubpatternId = term.atom.subpatternId;
+ unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
+ context->restoreOutput(output, firstSubpatternId, count);
+ }
+ void resetAssertionMatches(ByteTerm& term)
+ {
+ unsigned firstSubpatternId = term.atom.subpatternId;
+ unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
+ for (unsigned i = 0; i < (count << 1); ++i)
+ output[(firstSubpatternId << 1) + i] = -1;
+ }
+ bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
+ {
+ while (backTrack->matchAmount) {
+ ParenthesesDisjunctionContext* context = backTrack->lastContext;
+
+ if (matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true))
+ return true;
+
+ resetMatches(term, context);
+ popParenthesesDisjunctionContext(backTrack);
+ freeParenthesesDisjunctionContext(context);
+ }
+
+ return false;
+ }
+
+ bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+ ASSERT(term.atom.quantityCount == 1);
+
+ BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
+
+ switch (term.atom.quantityType) {
+ case QuantifierGreedy: {
+ // set this speculatively; if we get to the parens end this will be true.
+ backTrack->inParentheses = 1;
+ break;
+ }
+ case QuantifierNonGreedy: {
+ backTrack->inParentheses = 0;
+ context->term += term.atom.parenthesesWidth;
+ return true;
+ }
+ case QuantifierFixedCount:
+ break;
+ }
+
+ if (term.capture()) {
+ unsigned subpatternId = term.atom.subpatternId;
+ output[(subpatternId << 1)] = input.getPos() + term.inputPosition;
+ }
+
+ return true;
+ }
+
+ bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext*)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
+ ASSERT(term.atom.quantityCount == 1);
+
+ if (term.capture()) {
+ unsigned subpatternId = term.atom.subpatternId;
+ output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
+ }
+ return true;
+ }
+
+ bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+ ASSERT(term.atom.quantityCount == 1);
+
+ BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
+
+ if (term.capture()) {
+ unsigned subpatternId = term.atom.subpatternId;
+ output[(subpatternId << 1)] = -1;
+ output[(subpatternId << 1) + 1] = -1;
+ }
+
+ switch (term.atom.quantityType) {
+ case QuantifierGreedy:
+ // if we backtrack to this point, there is another chance - try matching nothing.
+ ASSERT(backTrack->inParentheses);
+ backTrack->inParentheses = 0;
+ context->term += term.atom.parenthesesWidth;
+ return true;
+ case QuantifierNonGreedy:
+ ASSERT(backTrack->inParentheses);
+ case QuantifierFixedCount:
+ break;
+ }
+
+ return false;
+ }
+
+ bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
+ ASSERT(term.atom.quantityCount == 1);
+
+ BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
+
+ switch (term.atom.quantityType) {
+ case QuantifierGreedy:
+ if (!backTrack->inParentheses) {
+ context->term -= term.atom.parenthesesWidth;
+ return false;
+ }
+ case QuantifierNonGreedy:
+ if (!backTrack->inParentheses) {
+ // now try to match the parens; set this speculatively.
+ backTrack->inParentheses = 1;
+ if (term.capture()) {
+ unsigned subpatternId = term.atom.subpatternId;
+ output[subpatternId << 1] = input.getPos() + term.inputPosition;
+ }
+ context->term -= term.atom.parenthesesWidth;
+ return true;
+ }
+ case QuantifierFixedCount:
+ break;
+ }
+
+ return false;
+ }
+
+ bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
+ ASSERT(term.atom.quantityCount == 1);
+
+ BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
+
+ backTrack->begin = input.getPos();
+ return true;
+ }
+
+ bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
+ ASSERT(term.atom.quantityCount == 1);
+
+ BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
+
+ input.setPos(backTrack->begin);
+
+ // We've reached the end of the parens; if they are inverted, this is failure.
+ if (term.invert()) {
+ context->term -= term.atom.parenthesesWidth;
+ return false;
+ }
+
+ return true;
+ }
+
+ bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
+ ASSERT(term.atom.quantityCount == 1);
+
+ // We've failed to match parens; if they are inverted, this is win!
+ if (term.invert()) {
+ context->term += term.atom.parenthesesWidth;
+ return true;
+ }
+
+ return false;
+ }
+
+ bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
+ ASSERT(term.atom.quantityCount == 1);
+
+ BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
+
+ input.setPos(backTrack->begin);
+
+ context->term -= term.atom.parenthesesWidth;
+ return false;
+ }
+
+ bool matchParentheses(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
+
+ BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
+
+ unsigned subpatternId = term.atom.subpatternId;
+ ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
+
+ backTrack->prevBegin = output[(subpatternId << 1)];
+ backTrack->prevEnd = output[(subpatternId << 1) + 1];
+
+ backTrack->matchAmount = 0;
+ backTrack->lastContext = 0;
+
+ switch (term.atom.quantityType) {
+ case QuantifierFixedCount: {
+ // While we haven't yet reached our fixed limit,
+ while (backTrack->matchAmount < term.atom.quantityCount) {
+ // Try to do a match, and it it succeeds, add it to the list.
+ ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
+ if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
+ appendParenthesesDisjunctionContext(backTrack, context);
+ else {
+ // The match failed; try to find an alternate point to carry on from.
+ resetMatches(term, context);
+ freeParenthesesDisjunctionContext(context);
+ if (!parenthesesDoBacktrack(term, backTrack))
+ return false;
+ }
+ }
+
+ ASSERT(backTrack->matchAmount == term.atom.quantityCount);
+ ParenthesesDisjunctionContext* context = backTrack->lastContext;
+ recordParenthesesMatch(term, context);
+ return true;
+ }
+
+ case QuantifierGreedy: {
+ while (backTrack->matchAmount < term.atom.quantityCount) {
+ ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
+ if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
+ appendParenthesesDisjunctionContext(backTrack, context);
+ else {
+ resetMatches(term, context);
+ freeParenthesesDisjunctionContext(context);
+ break;
+ }
+ }
+
+ if (backTrack->matchAmount) {
+ ParenthesesDisjunctionContext* context = backTrack->lastContext;
+ recordParenthesesMatch(term, context);
+ }
+ return true;
+ }
+
+ case QuantifierNonGreedy:
+ return true;
+ }
+
+ ASSERT_NOT_REACHED();
+ return false;
+ }
+
+ // Rules for backtracking differ depending on whether this is greedy or non-greedy.
+ //
+ // Greedy matches never should try just adding more - you should already have done
+ // the 'more' cases. Always backtrack, at least a leetle bit. However cases where
+ // you backtrack an item off the list needs checking, since we'll never have matched
+ // the one less case. Tracking forwards, still add as much as possible.
+ //
+ // Non-greedy, we've already done the one less case, so don't match on popping.
+ // We haven't done the one more case, so always try to add that.
+ //
+ bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
+ {
+ ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
+
+ BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
+
+ if (term.capture()) {
+ unsigned subpatternId = term.atom.subpatternId;
+ output[(subpatternId << 1)] = backTrack->prevBegin;
+ output[(subpatternId << 1) + 1] = backTrack->prevEnd;
+ }
+
+ ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
+
+ switch (term.atom.quantityType) {
+ case QuantifierFixedCount: {
+ ASSERT(backTrack->matchAmount == term.atom.quantityCount);
+
+ ParenthesesDisjunctionContext* context = 0;
+
+ if (!parenthesesDoBacktrack(term, backTrack))
+ return false;
+
+ // While we haven't yet reached our fixed limit,
+ while (backTrack->matchAmount < term.atom.quantityCount) {
+ // Try to do a match, and it it succeeds, add it to the list.
+ context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
+ if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
+ appendParenthesesDisjunctionContext(backTrack, context);
+ else {
+ // The match failed; try to find an alternate point to carry on from.
+ resetMatches(term, context);
+ freeParenthesesDisjunctionContext(context);
+ if (!parenthesesDoBacktrack(term, backTrack))
+ return false;
+ }
+ }
+
+ ASSERT(backTrack->matchAmount == term.atom.quantityCount);
+ context = backTrack->lastContext;
+ recordParenthesesMatch(term, context);
+ return true;
+ }
+
+ case QuantifierGreedy: {
+ if (!backTrack->matchAmount)
+ return false;
+
+ ParenthesesDisjunctionContext* context = backTrack->lastContext;
+ if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
+ while (backTrack->matchAmount < term.atom.quantityCount) {
+ ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
+ if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
+ appendParenthesesDisjunctionContext(backTrack, context);
+ else {
+ resetMatches(term, context);
+ freeParenthesesDisjunctionContext(context);
+ break;
+ }
+ }
+ } else {
+ resetMatches(term, context);
+ popParenthesesDisjunctionContext(backTrack);
+ freeParenthesesDisjunctionContext(context);
+ }
+
+ if (backTrack->matchAmount) {
+ ParenthesesDisjunctionContext* context = backTrack->lastContext;
+ recordParenthesesMatch(term, context);
+ }
+ return true;
+ }
+
+ case QuantifierNonGreedy: {
+ // If we've not reached the limit, try to add one more match.
+ if (backTrack->matchAmount < term.atom.quantityCount) {
+ ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
+ if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) {
+ appendParenthesesDisjunctionContext(backTrack, context);
+ recordParenthesesMatch(term, context);
+ return true;
+ } else {
+ resetMatches(term, context);
+ freeParenthesesDisjunctionContext(context);
+ }
+ }
+
+ // Nope - okay backtrack looking for an alternative.
+ while (backTrack->matchAmount) {
+ ParenthesesDisjunctionContext* context = backTrack->lastContext;
+ if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
+ // successful backtrack! we're back in the game!
+ if (backTrack->matchAmount) {
+ context = backTrack->lastContext;
+ recordParenthesesMatch(term, context);
+ }
+ return true;
+ }
+
+ // pop a match off the stack
+ resetMatches(term, context);
+ popParenthesesDisjunctionContext(backTrack);
+ freeParenthesesDisjunctionContext(context);
+ }
+
+ return false;
+ }
+ }
+
+ ASSERT_NOT_REACHED();
+ return false;
+ }
+
+#define MATCH_NEXT() { ++context->term; goto matchAgain; }
+#define BACKTRACK() { --context->term; goto backtrack; }
+#define currentTerm() (disjunction->terms[context->term])
+ bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
+ {
+ if (btrack)
+ BACKTRACK();
+
+ context->matchBegin = input.getPos();
+ context->term = 0;
+
+ matchAgain:
+ ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
+
+ switch (currentTerm().type) {
+ case ByteTerm::TypeSubpatternBegin:
+ MATCH_NEXT();
+ case ByteTerm::TypeSubpatternEnd:
+ context->matchEnd = input.getPos();
+ return true;
+
+ case ByteTerm::TypeBodyAlternativeBegin:
+ MATCH_NEXT();
+ case ByteTerm::TypeBodyAlternativeDisjunction:
+ case ByteTerm::TypeBodyAlternativeEnd:
+ context->matchEnd = input.getPos();
+ return true;
+
+ case ByteTerm::TypeAlternativeBegin:
+ MATCH_NEXT();
+ case ByteTerm::TypeAlternativeDisjunction:
+ case ByteTerm::TypeAlternativeEnd: {
+ int offset = currentTerm().alternative.end;
+ BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
+ backTrack->offset = offset;
+ context->term += offset;
+ MATCH_NEXT();
+ }
+
+ case ByteTerm::TypeAssertionBOL:
+ if (matchAssertionBOL(currentTerm()))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeAssertionEOL:
+ if (matchAssertionEOL(currentTerm()))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeAssertionWordBoundary:
+ if (matchAssertionWordBoundary(currentTerm()))
+ MATCH_NEXT();
+ BACKTRACK();
+
+ case ByteTerm::TypePatternCharacterOnce:
+ case ByteTerm::TypePatternCharacterFixed: {
+ for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
+ if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount))
+ BACKTRACK();
+ }
+ MATCH_NEXT();
+ }
+ case ByteTerm::TypePatternCharacterGreedy: {
+ BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
+ unsigned matchAmount = 0;
+ while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
+ if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) {
+ input.uncheckInput(1);
+ break;
+ }
+ ++matchAmount;
+ }
+ backTrack->matchAmount = matchAmount;
+
+ MATCH_NEXT();
+ }
+ case ByteTerm::TypePatternCharacterNonGreedy: {
+ BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
+ backTrack->matchAmount = 0;
+ MATCH_NEXT();
+ }
+
+ case ByteTerm::TypePatternCasedCharacterOnce:
+ case ByteTerm::TypePatternCasedCharacterFixed: {
+ for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
+ if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount))
+ BACKTRACK();
+ }
+ MATCH_NEXT();
+ }
+ case ByteTerm::TypePatternCasedCharacterGreedy: {
+ BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
+ unsigned matchAmount = 0;
+ while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
+ if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) {
+ input.uncheckInput(1);
+ break;
+ }
+ ++matchAmount;
+ }
+ backTrack->matchAmount = matchAmount;
+
+ MATCH_NEXT();
+ }
+ case ByteTerm::TypePatternCasedCharacterNonGreedy: {
+ BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
+ backTrack->matchAmount = 0;
+ MATCH_NEXT();
+ }
+
+ case ByteTerm::TypeCharacterClass:
+ if (matchCharacterClass(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeBackReference:
+ if (matchBackReference(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpattern:
+ if (matchParentheses(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternOnceBegin:
+ if (matchParenthesesOnceBegin(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternOnceEnd:
+ if (matchParenthesesOnceEnd(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParentheticalAssertionBegin:
+ if (matchParentheticalAssertionBegin(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParentheticalAssertionEnd:
+ if (matchParentheticalAssertionEnd(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+
+ case ByteTerm::TypeCheckInput:
+ if (input.checkInput(currentTerm().checkInputCount))
+ MATCH_NEXT();
+ BACKTRACK();
+ }
+
+ // We should never fall-through to here.
+ ASSERT_NOT_REACHED();
+
+ backtrack:
+ ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
+
+ switch (currentTerm().type) {
+ case ByteTerm::TypeSubpatternBegin:
+ return false;
+ case ByteTerm::TypeSubpatternEnd:
+ ASSERT_NOT_REACHED();
+
+ case ByteTerm::TypeBodyAlternativeBegin:
+ case ByteTerm::TypeBodyAlternativeDisjunction: {
+ int offset = currentTerm().alternative.next;
+ context->term += offset;
+ if (offset > 0)
+ MATCH_NEXT();
+
+ if (input.atEnd())
+ return false;
+
+ input.next();
+ context->matchBegin = input.getPos();
+ MATCH_NEXT();
+ }
+ case ByteTerm::TypeBodyAlternativeEnd:
+ ASSERT_NOT_REACHED();
+
+ case ByteTerm::TypeAlternativeBegin:
+ case ByteTerm::TypeAlternativeDisjunction: {
+ int offset = currentTerm().alternative.next;
+ context->term += offset;
+ if (offset > 0)
+ MATCH_NEXT();
+ BACKTRACK();
+ }
+ case ByteTerm::TypeAlternativeEnd: {
+ // We should never backtrack back into an alternative of the main body of the regex.
+ BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
+ unsigned offset = backTrack->offset;
+ context->term -= offset;
+ BACKTRACK();
+ }
+
+ case ByteTerm::TypeAssertionBOL:
+ case ByteTerm::TypeAssertionEOL:
+ case ByteTerm::TypeAssertionWordBoundary:
+ BACKTRACK();
+
+ case ByteTerm::TypePatternCharacterOnce:
+ case ByteTerm::TypePatternCharacterFixed:
+ case ByteTerm::TypePatternCharacterGreedy:
+ case ByteTerm::TypePatternCharacterNonGreedy:
+ if (backtrackPatternCharacter(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypePatternCasedCharacterOnce:
+ case ByteTerm::TypePatternCasedCharacterFixed:
+ case ByteTerm::TypePatternCasedCharacterGreedy:
+ case ByteTerm::TypePatternCasedCharacterNonGreedy:
+ if (backtrackPatternCasedCharacter(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeCharacterClass:
+ if (backtrackCharacterClass(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeBackReference:
+ if (backtrackBackReference(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpattern:
+ if (backtrackParentheses(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternOnceBegin:
+ if (backtrackParenthesesOnceBegin(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParenthesesSubpatternOnceEnd:
+ if (backtrackParenthesesOnceEnd(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParentheticalAssertionBegin:
+ if (backtrackParentheticalAssertionBegin(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+ case ByteTerm::TypeParentheticalAssertionEnd:
+ if (backtrackParentheticalAssertionEnd(currentTerm(), context))
+ MATCH_NEXT();
+ BACKTRACK();
+
+ case ByteTerm::TypeCheckInput:
+ input.uncheckInput(currentTerm().checkInputCount);
+ BACKTRACK();
+ }
+
+ ASSERT_NOT_REACHED();
+ return false;
+ }
+
+ bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
+ {
+ if (matchDisjunction(disjunction, context, btrack)) {
+ while (context->matchBegin == context->matchEnd) {
+ if (!matchDisjunction(disjunction, context, true))
+ return false;
+ }
+ return true;
+ }
+
+ return false;
+ }
+
+ int interpret()
+ {
+ for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i)
+ output[i] = -1;
+
+ DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
+
+ if (matchDisjunction(pattern->m_body.get(), context)) {
+ output[0] = context->matchBegin;
+ output[1] = context->matchEnd;
+ }
+
+ freeDisjunctionContext(context);
+
+ return output[0];
+ }
+
+ Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length)
+ : pattern(pattern)
+ , output(output)
+ , input(inputChar, start, length)
+ {
+ }
+
+private:
+ BytecodePattern *pattern;
+ int* output;
+ InputStream input;
+};
+
+
+
+class ByteCompiler {
+ struct ParenthesesStackEntry {
+ unsigned beginTerm;
+ unsigned savedAlternativeIndex;
+ ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
+ : beginTerm(beginTerm)
+ , savedAlternativeIndex(savedAlternativeIndex)
+ {
+ }
+ };
+
+public:
+ ByteCompiler(RegexPattern& pattern)
+ : m_pattern(pattern)
+ {
+ m_currentAlternativeIndex = 0;
+ }
+
+ PassOwnPtr<BytecodePattern> compile()
+ {
+ regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize);
+ emitDisjunction(m_pattern.m_body);
+ regexEnd();
+
+ return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern));
+ }
+
+ void checkInput(unsigned count)
+ {
+ m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
+ }
+
+ void assertionBOL(int inputPosition)
+ {
+ m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
+ }
+
+ void assertionEOL(int inputPosition)
+ {
+ m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
+ }
+
+ void assertionWordBoundary(bool invert, int inputPosition)
+ {
+ m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
+ }
+
+ void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
+ {
+ if (m_pattern.m_ignoreCase) {
+ UChar lo = Unicode::toLower(ch);
+ UChar hi = Unicode::toUpper(ch);
+
+ if (lo != hi) {
+ m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType));
+ return;
+ }
+ }
+
+ m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType));
+ }
+
+ void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
+ {
+ m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
+
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
+ }
+
+ void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
+ {
+ ASSERT(subpatternId);
+
+ m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
+
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
+ }
+
+ void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
+ {
+ int beginTerm = m_bodyDisjunction->terms.size();
+
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition));
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
+ m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
+
+ m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
+ m_currentAlternativeIndex = beginTerm + 1;
+ }
+
+ void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
+ {
+ int beginTerm = m_bodyDisjunction->terms.size();
+
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, invert, 0));
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
+ m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
+ m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
+
+ m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
+ m_currentAlternativeIndex = beginTerm + 1;
+ }
+
+ unsigned popParenthesesStack()
+ {
+ ASSERT(m_parenthesesStack.size());
+ int stackEnd = m_parenthesesStack.size() - 1;
+ unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm;
+ m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex;
+ m_parenthesesStack.shrink(stackEnd);
+
+ ASSERT(beginTerm < m_bodyDisjunction->terms.size());
+ ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
+
+ return beginTerm;
+ }
+
+#ifndef NDEBUG
+ void dumpDisjunction(ByteDisjunction* disjunction)
+ {
+ printf("ByteDisjunction(%p):\n\t", disjunction);
+ for (unsigned i = 0; i < disjunction->terms.size(); ++i)
+ printf("{ %d } ", disjunction->terms[i].type);
+ printf("\n");
+ }
+#endif
+
+ void closeAlternative(int beginTerm)
+ {
+ int origBeginTerm = beginTerm;
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
+ int endIndex = m_bodyDisjunction->terms.size();
+
+ unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
+
+ if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
+ m_bodyDisjunction->terms.remove(beginTerm);
+ else {
+ while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
+ beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
+ m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
+ m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
+ }
+
+ m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
+
+ m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
+ m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
+ }
+ }
+
+ void closeBodyAlternative()
+ {
+ int beginTerm = 0;
+ int origBeginTerm = 0;
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
+ int endIndex = m_bodyDisjunction->terms.size();
+
+ unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
+
+ while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
+ beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
+ ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
+ m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
+ m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
+ }
+
+ m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
+
+ m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
+ m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
+ }
+
+ void atomParenthesesEnd(bool doInline, unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
+ {
+ unsigned beginTerm = popParenthesesStack();
+ closeAlternative(beginTerm + 1);
+ unsigned endTerm = m_bodyDisjunction->terms.size();
+
+ bool isAssertion = m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin;
+ bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture;
+ unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
+
+ m_bodyDisjunction->terms.append(ByteTerm(isAssertion ? ByteTerm::TypeParentheticalAssertionEnd : ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition));
+ m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
+ m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
+ m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
+
+ if (doInline) {
+ m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
+ m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
+ m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
+ m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
+ } else {
+ ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
+ ASSERT(parenthesesBegin.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
+
+ bool invertOrCapture = parenthesesBegin.invertOrCapture;
+ unsigned subpatternId = parenthesesBegin.atom.subpatternId;
+
+ unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
+ ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
+
+ parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
+ for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses)
+ parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
+ parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
+
+ m_bodyDisjunction->terms.shrink(beginTerm);
+
+ m_allParenthesesInfo.append(parenthesesDisjunction);
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition));
+
+ m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
+ m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
+ m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
+ }
+ }
+
+ void regexBegin(unsigned numSubpatterns, unsigned callFrameSize)
+ {
+ m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize));
+ m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin());
+ m_bodyDisjunction->terms[0].frameLocation = 0;
+ m_currentAlternativeIndex = 0;
+ }
+
+ void regexEnd()
+ {
+ closeBodyAlternative();
+ }
+
+ void alternativeBodyDisjunction()
+ {
+ int newAlternativeIndex = m_bodyDisjunction->terms.size();
+ m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
+ m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction());
+
+ m_currentAlternativeIndex = newAlternativeIndex;
+ }
+
+ void alternativeDisjunction()
+ {
+ int newAlternativeIndex = m_bodyDisjunction->terms.size();
+ m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
+ m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
+
+ m_currentAlternativeIndex = newAlternativeIndex;
+ }
+
+ void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0)
+ {
+ for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
+ unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
+
+ if (alt) {
+ if (disjunction == m_pattern.m_body)
+ alternativeBodyDisjunction();
+ else
+ alternativeDisjunction();
+ }
+
+ PatternAlternative* alternative = disjunction->m_alternatives[alt];
+ unsigned minimumSize = alternative->m_minimumSize;
+
+ ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked);
+ unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
+ if (countToCheck)
+ checkInput(countToCheck);
+ currentCountAlreadyChecked += countToCheck;
+
+ for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
+ PatternTerm& term = alternative->m_terms[i];
+
+ switch (term.type) {
+ case PatternTerm::TypeAssertionBOL:
+ assertionBOL(term.inputPosition - currentCountAlreadyChecked);
+ break;
+
+ case PatternTerm::TypeAssertionEOL:
+ assertionEOL(term.inputPosition - currentCountAlreadyChecked);
+ break;
+
+ case PatternTerm::TypeAssertionWordBoundary:
+ assertionWordBoundary(term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked);
+ break;
+
+ case PatternTerm::TypePatternCharacter:
+ atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
+ break;
+
+ case PatternTerm::TypeCharacterClass:
+ atomCharacterClass(term.characterClass, term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
+ break;
+
+ case PatternTerm::TypeBackReference:
+ atomBackReference(term.subpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
+ break;
+
+ case PatternTerm::TypeForwardReference:
+ break;
+
+ case PatternTerm::TypeParenthesesSubpattern: {
+ unsigned disjunctionAlreadyCheckedCount = 0;
+ if ((term.quantityCount == 1) && !term.parentheses.isCopy) {
+ if (term.quantityType == QuantifierFixedCount) {
+ disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
+ unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
+ atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation);
+ emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, term.parentheses.disjunction->m_minimumSize);
+ atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
+ } else {
+ unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
+ atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce);
+ emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
+ atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
+ }
+ } else {
+ unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
+ atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0);
+ emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
+ atomParenthesesEnd(false, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
+ }
+ break;
+ }
+
+ case PatternTerm::TypeParentheticalAssertion: {
+ unsigned alternativeFrameLocation = term.frameLocation + RegexStackSpaceForBackTrackInfoParentheticalAssertion;
+
+ atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invertOrCapture, term.frameLocation, alternativeFrameLocation);
+ emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
+ atomParenthesesEnd(true, term.parentheses.lastSubpatternId, 0, term.frameLocation, term.quantityCount, term.quantityType);
+ break;
+ }
+ }
+ }
+ }
+ }
+
+private:
+ RegexPattern& m_pattern;
+ OwnPtr<ByteDisjunction> m_bodyDisjunction;
+ unsigned m_currentAlternativeIndex;
+ Vector<ParenthesesStackEntry> m_parenthesesStack;
+ Vector<ByteDisjunction*> m_allParenthesesInfo;
+};
+
+
+PassOwnPtr<BytecodePattern> byteCompileRegex(const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline)
+{
+ RegexPattern pattern(ignoreCase, multiline);
+
+ if ((error = compileRegex(patternString, pattern)))
+ return PassOwnPtr<BytecodePattern>();
+
+ numSubpatterns = pattern.m_numSubpatterns;
+
+ return ByteCompiler(pattern).compile();
+}
+
+int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output)
+{
+ return Interpreter(regex, output, input, start, length).interpret();
+}
+
+
+COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (RegexStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoPatternCharacter);
+COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (RegexStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoCharacterClass);
+COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (RegexStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoBackReference);
+COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (RegexStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoAlternative);
+COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (RegexStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheticalAssertion);
+COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (RegexStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParenthesesOnce);
+COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheses);
+
+
+} }
+
+#endif