diff -r 000000000000 -r 4f2f89ce4247 WebCore/html/LegacyHTMLDocumentParser.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/WebCore/html/LegacyHTMLDocumentParser.cpp Fri Sep 17 09:02:29 2010 +0300 @@ -0,0 +1,2126 @@ +/* + Copyright (C) 1997 Martin Jones (mjones@kde.org) + (C) 1997 Torben Weis (weis@kde.org) + (C) 1998 Waldo Bastian (bastian@kde.org) + (C) 1999 Lars Knoll (knoll@kde.org) + (C) 1999 Antti Koivisto (koivisto@kde.org) + (C) 2001 Dirk Mueller (mueller@kde.org) + Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. + Copyright (C) 2005, 2006 Alexey Proskuryakov (ap@nypop.com) + Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/) + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public License + along with this library; see the file COPYING.LIB. If not, write to + the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + Boston, MA 02110-1301, USA. 
+*/ + +#include "config.h" +#include "LegacyHTMLDocumentParser.h" + +#include "Attribute.h" +#include "CSSHelper.h" +#include "Cache.h" +#include "CachedScript.h" +#include "DocLoader.h" +#include "DocumentFragment.h" +#include "Event.h" +#include "EventNames.h" +#include "Frame.h" +#include "FrameLoader.h" +#include "FrameView.h" +#include "HTMLElement.h" +#include "HTMLNames.h" +#include "LegacyHTMLTreeBuilder.h" +#include "HTMLScriptElement.h" +#include "HTMLViewSourceDocument.h" +#include "ImageLoader.h" +#include "InspectorTimelineAgent.h" +#include "Page.h" +#include "LegacyPreloadScanner.h" +#include "ScriptSourceCode.h" +#include "ScriptValue.h" +#include "XSSAuditor.h" +#include +#include + +#include "HTMLEntityNames.cpp" + +#define PRELOAD_SCANNER_ENABLED 1 + +using namespace WTF; +using namespace std; + +namespace WebCore { + +using namespace HTMLNames; + +// This value is used to define how many loops (approximately tokens) +// the parser will make before checking if it should yield. +// To increase responsiveness reduce both ChunkSize and TimeDelay constants. +static const int defaultTokenizerChunkSize = 4096; + +// FIXME: We would like this constant to be 200ms. +// Yielding more aggressively results in increased responsiveness and better incremental rendering. +// It slows down overall page-load on slower machines, though, so for now we set a value of 500. +// TimeDelay controls the maximum time the parser will run before yielding. +// Inline script execution can cause the parser to exceed this limit. +static const double defaultTokenizerTimeDelay = 0.500; + +static const char commentStart [] = " as a close comment, even though it's + // not technically valid. 
+ endCharsCount = 4; + } + if (handleBrokenComments || endCharsCount > 1) { + src.advancePastNonNewline(); + if (!(state.inTitle() || state.inScript() || state.inXmp() || state.inTextArea() || state.inStyle() || state.inIFrame())) { + checkScriptBuffer(); + m_scriptCode[m_scriptCodeSize] = 0; + m_scriptCode[m_scriptCodeSize + 1] = 0; + m_currentToken.tagName = commentAtom; + m_currentToken.beginTag = true; + state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize - endCharsCount), state); + processToken(); + m_currentToken.tagName = commentAtom; + m_currentToken.beginTag = false; + processToken(); + m_scriptCodeSize = 0; + } + state.setInComment(false); + return state; // Finished parsing comment + } + } + src.advance(m_lineNumber); + } + + return state; +} + +LegacyHTMLDocumentParser::State LegacyHTMLDocumentParser::parseServer(SegmentedString& src, State state) +{ + checkScriptBuffer(src.length()); + while (!src.isEmpty()) { + UChar ch = *src; + m_scriptCode[m_scriptCodeSize++] = ch; + if (ch == '>' && m_scriptCodeSize > 1 && m_scriptCode[m_scriptCodeSize - 2] == '%') { + src.advancePastNonNewline(); + state.setInServer(false); + m_scriptCodeSize = 0; + return state; // Finished parsing server include + } + src.advance(m_lineNumber); + } + return state; +} + +LegacyHTMLDocumentParser::State LegacyHTMLDocumentParser::parseProcessingInstruction(SegmentedString& src, State state) +{ + UChar oldchar = 0; + while (!src.isEmpty()) { + UChar chbegin = *src; + if (chbegin == '\'') + tquote = tquote == SingleQuote ? NoQuote : SingleQuote; + else if (chbegin == '\"') + tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote; + // Look for '?>' + // Some crappy sites omit the "?" before it, so + // we look for an unquoted '>' instead. 
(IE compatible) + else if (chbegin == '>' && (!tquote || oldchar == '?')) { + // We got a '?>' sequence + state.setInProcessingInstruction(false); + src.advancePastNonNewline(); + state.setDiscardLF(true); + return state; // Finished parsing comment! + } + src.advance(m_lineNumber); + oldchar = chbegin; + } + + return state; +} + +LegacyHTMLDocumentParser::State LegacyHTMLDocumentParser::parseText(SegmentedString& src, State state) +{ + while (!src.isEmpty()) { + UChar cc = *src; + + if (state.skipLF()) { + state.setSkipLF(false); + if (cc == '\n') { + src.advancePastNewline(m_lineNumber); + continue; + } + } + + // do we need to enlarge the buffer? + checkBuffer(); + + if (cc == '\r') { + state.setSkipLF(true); + *m_dest++ = '\n'; + } else + *m_dest++ = cc; + src.advance(m_lineNumber); + } + + return state; +} + + +LegacyHTMLDocumentParser::State LegacyHTMLDocumentParser::parseEntity(SegmentedString& src, UChar*& dest, State state, unsigned& cBufferPos, bool start, bool parsingTag) +{ + if (start) { + cBufferPos = 0; + state.setEntityState(SearchEntity); + EntityUnicodeValue = 0; + } + + while (!src.isEmpty()) { + UChar cc = *src; + switch (state.entityState()) { + case NoEntity: + ASSERT(state.entityState() != NoEntity); + return state; + + case SearchEntity: + if (cc == '#') { + m_cBuffer[cBufferPos++] = cc; + src.advancePastNonNewline(); + state.setEntityState(NumericSearch); + } else + state.setEntityState(EntityName); + break; + + case NumericSearch: + if (cc == 'x' || cc == 'X') { + m_cBuffer[cBufferPos++] = cc; + src.advancePastNonNewline(); + state.setEntityState(Hexadecimal); + } else if (cc >= '0' && cc <= '9') + state.setEntityState(Decimal); + else + state.setEntityState(SearchSemicolon); + break; + + case Hexadecimal: { + int ll = min(src.length(), 10 - cBufferPos); + while (ll--) { + cc = *src; + if (!((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'))) { + state.setEntityState(SearchSemicolon); + break; + } + int 
digit; + if (cc < 'A') + digit = cc - '0'; + else + digit = (cc - 'A' + 10) & 0xF; // handle both upper and lower case without a branch + EntityUnicodeValue = EntityUnicodeValue * 16 + digit; + m_cBuffer[cBufferPos++] = cc; + src.advancePastNonNewline(); + } + if (cBufferPos == 10) + state.setEntityState(SearchSemicolon); + break; + } + case Decimal: + { + int ll = min(src.length(), 9-cBufferPos); + while (ll--) { + cc = *src; + + if (!(cc >= '0' && cc <= '9')) { + state.setEntityState(SearchSemicolon); + break; + } + + EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0'); + m_cBuffer[cBufferPos++] = cc; + src.advancePastNonNewline(); + } + if (cBufferPos == 9) + state.setEntityState(SearchSemicolon); + break; + } + case EntityName: + { + int ll = min(src.length(), 9-cBufferPos); + while (ll--) { + cc = *src; + + if (!((cc >= 'a' && cc <= 'z') || (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) { + state.setEntityState(SearchSemicolon); + break; + } + + m_cBuffer[cBufferPos++] = cc; + src.advancePastNonNewline(); + } + if (cBufferPos == 9) + state.setEntityState(SearchSemicolon); + if (state.entityState() == SearchSemicolon) { + if (cBufferPos > 1) { + // Since the maximum length of entity name is 9, + // so a single char array which is allocated on + // the stack, its length is 10, should be OK. + // Also if we have an illegal character, we treat it + // as illegal entity name. 
+ unsigned testedEntityNameLen = 0; + char tmpEntityNameBuffer[10]; + + ASSERT(cBufferPos < 10); + for (; testedEntityNameLen < cBufferPos; ++testedEntityNameLen) { + if (m_cBuffer[testedEntityNameLen] > 0x7e) + break; + tmpEntityNameBuffer[testedEntityNameLen] = m_cBuffer[testedEntityNameLen]; + } + + const Entity *e; + + if (testedEntityNameLen == cBufferPos) + e = findEntity(tmpEntityNameBuffer, cBufferPos); + else + e = 0; + + if (e) + EntityUnicodeValue = e->code; + + // be IE compatible + if (parsingTag && EntityUnicodeValue > 255 && *src != ';') + EntityUnicodeValue = 0; + } + } + else + break; + } + case SearchSemicolon: + // Don't allow values that are more than 21 bits. + if (EntityUnicodeValue > 0 && EntityUnicodeValue <= 0x10FFFF) { + if (!inViewSourceMode()) { + if (*src == ';') + src.advancePastNonNewline(); + if (EntityUnicodeValue <= 0xFFFF) { + checkBuffer(); + src.push(fixUpChar(EntityUnicodeValue)); + } else { + // Convert to UTF-16, using surrogate code points. + checkBuffer(2); + src.push(U16_LEAD(EntityUnicodeValue)); + src.push(U16_TRAIL(EntityUnicodeValue)); + } + } else { + // FIXME: We should eventually colorize entities by sending them as a special token. 
+ // 12 bytes required: up to 10 bytes in m_cBuffer plus the + // leading '&' and trailing ';' + checkBuffer(12); + *dest++ = '&'; + for (unsigned i = 0; i < cBufferPos; i++) + dest[i] = m_cBuffer[i]; + dest += cBufferPos; + if (*src == ';') { + *dest++ = ';'; + src.advancePastNonNewline(); + } + } + } else { + // 11 bytes required: up to 10 bytes in m_cBuffer plus the + // leading '&' + checkBuffer(11); + // ignore the sequence, add it to the buffer as plaintext + *dest++ = '&'; + for (unsigned i = 0; i < cBufferPos; i++) + dest[i] = m_cBuffer[i]; + dest += cBufferPos; + } + + state.setEntityState(NoEntity); + return state; + } + } + + return state; +} + +LegacyHTMLDocumentParser::State LegacyHTMLDocumentParser::parseDoctype(SegmentedString& src, State state) +{ + ASSERT(state.inDoctype()); + while (!src.isEmpty() && state.inDoctype()) { + UChar c = *src; + bool isWhitespace = c == '\r' || c == '\n' || c == '\t' || c == ' '; + switch (m_doctypeToken.state()) { + case DoctypeBegin: { + m_doctypeToken.setState(DoctypeBeforeName); + if (isWhitespace) { + src.advance(m_lineNumber); + if (inViewSourceMode()) + m_doctypeToken.m_source.append(c); + } + break; + } + case DoctypeBeforeName: { + if (c == '>') { + // Malformed. Just exit. + src.advancePastNonNewline(); + state.setInDoctype(false); + if (inViewSourceMode()) + processDoctypeToken(); + } else if (isWhitespace) { + src.advance(m_lineNumber); + if (inViewSourceMode()) + m_doctypeToken.m_source.append(c); + } else + m_doctypeToken.setState(DoctypeName); + break; + } + case DoctypeName: { + if (c == '>') { + // Valid doctype. Emit it. 
+ src.advancePastNonNewline(); + state.setInDoctype(false); + processDoctypeToken(); + } else if (isWhitespace) { + m_doctypeSearchCount = 0; // Used now to scan for PUBLIC + m_doctypeSecondarySearchCount = 0; // Used now to scan for SYSTEM + m_doctypeToken.setState(DoctypeAfterName); + src.advance(m_lineNumber); + if (inViewSourceMode()) + m_doctypeToken.m_source.append(c); + } else { + src.advancePastNonNewline(); + m_doctypeToken.m_name.append(c); + if (inViewSourceMode()) + m_doctypeToken.m_source.append(c); + } + break; + } + case DoctypeAfterName: { + if (c == '>') { + // Valid doctype. Emit it. + src.advancePastNonNewline(); + state.setInDoctype(false); + processDoctypeToken(); + } else if (!isWhitespace) { + src.advancePastNonNewline(); + if (toASCIILower(c) == publicStart[m_doctypeSearchCount]) { + m_doctypeSearchCount++; + if (m_doctypeSearchCount == 6) + // Found 'PUBLIC' sequence + m_doctypeToken.setState(DoctypeBeforePublicID); + } else if (m_doctypeSearchCount > 0) { + m_doctypeSearchCount = 0; + m_doctypeToken.setState(DoctypeBogus); + } else if (toASCIILower(c) == systemStart[m_doctypeSecondarySearchCount]) { + m_doctypeSecondarySearchCount++; + if (m_doctypeSecondarySearchCount == 6) + // Found 'SYSTEM' sequence + m_doctypeToken.setState(DoctypeBeforeSystemID); + } else { + m_doctypeSecondarySearchCount = 0; + m_doctypeToken.setState(DoctypeBogus); + } + if (inViewSourceMode()) + m_doctypeToken.m_source.append(c); + } else { + src.advance(m_lineNumber); // Whitespace keeps us in the after name state. + if (inViewSourceMode()) + m_doctypeToken.m_source.append(c); + } + break; + } + case DoctypeBeforePublicID: { + if (c == '\"' || c == '\'') { + tquote = c == '\"' ? DoubleQuote : SingleQuote; + m_doctypeToken.setState(DoctypePublicID); + src.advancePastNonNewline(); + if (inViewSourceMode()) + m_doctypeToken.m_source.append(c); + } else if (c == '>') { + // Considered bogus. Don't process the doctype. 
+ src.advancePastNonNewline(); + state.setInDoctype(false); + if (inViewSourceMode()) + processDoctypeToken(); + } else if (isWhitespace) { + src.advance(m_lineNumber); + if (inViewSourceMode()) + m_doctypeToken.m_source.append(c); + } else + m_doctypeToken.setState(DoctypeBogus); + break; + } + case DoctypePublicID: { + if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) { + src.advancePastNonNewline(); + m_doctypeToken.setState(DoctypeAfterPublicID); + if (inViewSourceMode()) + m_doctypeToken.m_source.append(c); + } else if (c == '>') { + // Considered bogus. Don't process the doctype. + src.advancePastNonNewline(); + state.setInDoctype(false); + if (inViewSourceMode()) + processDoctypeToken(); + } else { + m_doctypeToken.m_publicID.append(c); + src.advance(m_lineNumber); + if (inViewSourceMode()) + m_doctypeToken.m_source.append(c); + } + break; + } + case DoctypeAfterPublicID: + if (c == '\"' || c == '\'') { + tquote = c == '\"' ? DoubleQuote : SingleQuote; + m_doctypeToken.setState(DoctypeSystemID); + src.advancePastNonNewline(); + if (inViewSourceMode()) + m_doctypeToken.m_source.append(c); + } else if (c == '>') { + // Valid doctype. Emit it now. + src.advancePastNonNewline(); + state.setInDoctype(false); + processDoctypeToken(); + } else if (isWhitespace) { + src.advance(m_lineNumber); + if (inViewSourceMode()) + m_doctypeToken.m_source.append(c); + } else + m_doctypeToken.setState(DoctypeBogus); + break; + case DoctypeBeforeSystemID: + if (c == '\"' || c == '\'') { + tquote = c == '\"' ? DoubleQuote : SingleQuote; + m_doctypeToken.setState(DoctypeSystemID); + src.advancePastNonNewline(); + if (inViewSourceMode()) + m_doctypeToken.m_source.append(c); + } else if (c == '>') { + // Considered bogus. Don't process the doctype. 
+ src.advancePastNonNewline(); + state.setInDoctype(false); + } else if (isWhitespace) { + src.advance(m_lineNumber); + if (inViewSourceMode()) + m_doctypeToken.m_source.append(c); + } else + m_doctypeToken.setState(DoctypeBogus); + break; + case DoctypeSystemID: + if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) { + src.advancePastNonNewline(); + m_doctypeToken.setState(DoctypeAfterSystemID); + if (inViewSourceMode()) + m_doctypeToken.m_source.append(c); + } else if (c == '>') { + // Considered bogus. Don't process the doctype. + src.advancePastNonNewline(); + state.setInDoctype(false); + if (inViewSourceMode()) + processDoctypeToken(); + } else { + m_doctypeToken.m_systemID.append(c); + src.advance(m_lineNumber); + if (inViewSourceMode()) + m_doctypeToken.m_source.append(c); + } + break; + case DoctypeAfterSystemID: + if (c == '>') { + // Valid doctype. Emit it now. + src.advancePastNonNewline(); + state.setInDoctype(false); + processDoctypeToken(); + } else if (isWhitespace) { + src.advance(m_lineNumber); + if (inViewSourceMode()) + m_doctypeToken.m_source.append(c); + } else + m_doctypeToken.setState(DoctypeBogus); + break; + case DoctypeBogus: + if (c == '>') { + // Done with the bogus doctype. 
+ src.advancePastNonNewline(); + state.setInDoctype(false); + if (inViewSourceMode()) + processDoctypeToken(); + } else { + src.advance(m_lineNumber); // Just keep scanning for '>' + if (inViewSourceMode()) + m_doctypeToken.m_source.append(c); + } + break; + default: + break; + } + } + return state; +} + +LegacyHTMLDocumentParser::State LegacyHTMLDocumentParser::parseTag(SegmentedString& src, State state) +{ + ASSERT(!state.hasEntityState()); + + unsigned cBufferPos = m_cBufferPos; + + bool lastIsSlash = false; + + while (!src.isEmpty()) { + checkBuffer(); + switch (state.tagState()) { + case NoTag: + { + m_cBufferPos = cBufferPos; + return state; + } + case TagName: + { + if (searchCount > 0) { + if (*src == commentStart[searchCount]) { + searchCount++; + if (searchCount == 2) + m_doctypeSearchCount++; // A '!' is also part of a doctype, so we are moving through that still as well. + else + m_doctypeSearchCount = 0; + if (searchCount == 4) { + // Found ' as a valid comment, since both mozilla and IE on windows + // can handle this case. Only do this in quirks mode. -dwh + if (!src.isEmpty() && *src == '>' && document()->inCompatMode()) { + state.setInComment(false); + src.advancePastNonNewline(); + if (!src.isEmpty()) + m_cBuffer[cBufferPos++] = *src; + } else + state = parseComment(src, state); + + m_cBufferPos = cBufferPos; + return state; // Finished parsing tag! + } + m_cBuffer[cBufferPos++] = *src; + src.advancePastNonNewline(); + break; + } else + searchCount = 0; // Stop looking for ' or + searchCount = 1; // Look for '