diff -r 000000000000 -r 4f2f89ce4247 WebCore/html/LegacyHTMLDocumentParser.cpp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/WebCore/html/LegacyHTMLDocumentParser.cpp Fri Sep 17 09:02:29 2010 +0300
@@ -0,0 +1,2126 @@
+/*
+ Copyright (C) 1997 Martin Jones (mjones@kde.org)
+ (C) 1997 Torben Weis (weis@kde.org)
+ (C) 1998 Waldo Bastian (bastian@kde.org)
+ (C) 1999 Lars Knoll (knoll@kde.org)
+ (C) 1999 Antti Koivisto (koivisto@kde.org)
+ (C) 2001 Dirk Mueller (mueller@kde.org)
+ Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
+ Copyright (C) 2005, 2006 Alexey Proskuryakov (ap@nypop.com)
+ Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/)
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public License
+ along with this library; see the file COPYING.LIB. If not, write to
+ the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ Boston, MA 02110-1301, USA.
+*/
+
+#include "config.h"
+#include "LegacyHTMLDocumentParser.h"
+
+#include "Attribute.h"
+#include "CSSHelper.h"
+#include "Cache.h"
+#include "CachedScript.h"
+#include "DocLoader.h"
+#include "DocumentFragment.h"
+#include "Event.h"
+#include "EventNames.h"
+#include "Frame.h"
+#include "FrameLoader.h"
+#include "FrameView.h"
+#include "HTMLElement.h"
+#include "HTMLNames.h"
+#include "LegacyHTMLTreeBuilder.h"
+#include "HTMLScriptElement.h"
+#include "HTMLViewSourceDocument.h"
+#include "ImageLoader.h"
+#include "InspectorTimelineAgent.h"
+#include "Page.h"
+#include "LegacyPreloadScanner.h"
+#include "ScriptSourceCode.h"
+#include "ScriptValue.h"
+#include "XSSAuditor.h"
+#include
+#include
+
+#include "HTMLEntityNames.cpp"
+
+#define PRELOAD_SCANNER_ENABLED 1
+
+using namespace WTF;
+using namespace std;
+
+namespace WebCore {
+
+using namespace HTMLNames;
+
+// This value is used to define how many loops (approximately tokens)
+// the parser will make before checking if it should yield.
+// To increase responsiveness reduce both the ChunkSize and TimeDelay constants.
+static const int defaultTokenizerChunkSize = 4096;
+
+// FIXME: We would like this constant to be 200ms.
+// Yielding more aggressively results in increased responsiveness and better incremental rendering.
+// It slows down overall page-load on slower machines, though, so for now we set a value of 500ms
+// (the constant below is expressed in seconds).
+// TimeDelay controls the maximum time the parser will run before yielding.
+// Inline script execution can cause the parser to exceed this limit.
+static const double defaultTokenizerTimeDelay = 0.500;
+
+static const char commentStart [] = " as a close comment, even though it's
+ // not technically valid.
+ endCharsCount = 4;
+ }
+ if (handleBrokenComments || endCharsCount > 1) {
+ src.advancePastNonNewline();
+ if (!(state.inTitle() || state.inScript() || state.inXmp() || state.inTextArea() || state.inStyle() || state.inIFrame())) {
+ checkScriptBuffer();
+ m_scriptCode[m_scriptCodeSize] = 0;
+ m_scriptCode[m_scriptCodeSize + 1] = 0;
+ m_currentToken.tagName = commentAtom;
+ m_currentToken.beginTag = true;
+ state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize - endCharsCount), state);
+ processToken();
+ m_currentToken.tagName = commentAtom;
+ m_currentToken.beginTag = false;
+ processToken();
+ m_scriptCodeSize = 0;
+ }
+ state.setInComment(false);
+ return state; // Finished parsing comment
+ }
+ }
+ src.advance(m_lineNumber);
+ }
+
+ return state;
+}
+
+// Copies a server-side include from |src| into m_scriptCode until the closing
+// "%>" has been copied. On completion the '>' is consumed, the in-server flag
+// is cleared and the buffered text is discarded. If |src| is exhausted first,
+// the state is returned without clearing the in-server flag.
+LegacyHTMLDocumentParser::State LegacyHTMLDocumentParser::parseServer(SegmentedString& src, State state)
+{
+    // Presumably guarantees m_scriptCode has room for everything left in |src|.
+    checkScriptBuffer(src.length());
+
+    for (; !src.isEmpty(); src.advance(m_lineNumber)) {
+        const UChar current = *src;
+        m_scriptCode[m_scriptCodeSize++] = current;
+
+        const bool closesInclude = current == '>' && m_scriptCodeSize > 1 && m_scriptCode[m_scriptCodeSize - 2] == '%';
+        if (!closesInclude)
+            continue;
+
+        // Found "%>": finished parsing the server include.
+        src.advancePastNonNewline();
+        state.setInServer(false);
+        m_scriptCodeSize = 0;
+        return state;
+    }
+
+    return state;
+}
+
+// Skips over a processing instruction, tracking quote state in |tquote|.
+// The instruction ends at "?>" or, for IE compatibility with sites that omit
+// the '?', at any '>' seen while no quote is open. When the end is found the
+// '>' is consumed, the in-processing-instruction flag is cleared and the
+// discard-LF flag is set. If |src| runs out first the state is returned as is.
+LegacyHTMLDocumentParser::State LegacyHTMLDocumentParser::parseProcessingInstruction(SegmentedString& src, State state)
+{
+    UChar previous = 0;
+
+    while (!src.isEmpty()) {
+        const UChar current = *src;
+
+        switch (current) {
+        case '\'':
+            // Toggle single-quote tracking.
+            tquote = (tquote == SingleQuote) ? NoQuote : SingleQuote;
+            break;
+        case '\"':
+            // Toggle double-quote tracking.
+            tquote = (tquote == DoubleQuote) ? NoQuote : DoubleQuote;
+            break;
+        case '>':
+            // Either a real "?>" or an unquoted '>' terminates the instruction.
+            if (!tquote || previous == '?') {
+                state.setInProcessingInstruction(false);
+                src.advancePastNonNewline();
+                state.setDiscardLF(true);
+                return state; // Finished parsing the processing instruction.
+            }
+            break;
+        default:
+            break;
+        }
+
+        previous = current;
+        src.advance(m_lineNumber);
+    }
+
+    return state;
+}
+
+// Copies the remaining characters of |src| to m_dest while normalizing line
+// endings: a '\r' is written as '\n', and a '\n' that immediately follows a
+// '\r' is dropped (tracked through the skipLF flag in |state|, so the pairing
+// is carried in the returned state).
+LegacyHTMLDocumentParser::State LegacyHTMLDocumentParser::parseText(SegmentedString& src, State state)
+{
+    while (!src.isEmpty()) {
+        const UChar current = *src;
+
+        // The previous character was a '\r' that was already emitted as '\n';
+        // swallow the '\n' half of a "\r\n" pair.
+        if (state.skipLF()) {
+            state.setSkipLF(false);
+            if (current == '\n') {
+                src.advancePastNewline(m_lineNumber);
+                continue;
+            }
+        }
+
+        // Enlarge the output buffer if needed before writing.
+        checkBuffer();
+
+        UChar emitted = current;
+        if (current == '\r') {
+            state.setSkipLF(true);
+            emitted = '\n';
+        }
+        *m_dest++ = emitted;
+
+        src.advance(m_lineNumber);
+    }
+
+    return state;
+}
+
+
+// Resumable state machine that parses one character entity reference from |src|.
+// Presumably the caller has already consumed the leading '&' (this function
+// re-emits an '&' itself when it writes the entity text out literally).
+//
+// src         - input; consumed as the entity is recognized. A successfully decoded
+//               character is pushed back onto |src| rather than written to |dest|.
+// dest        - output cursor; advanced only when the entity text is copied literally
+//               (unrecognized entity, or view-source mode).
+// state       - parser state; its entity sub-state selects the case executed below.
+// cBufferPos  - number of entity characters collected so far in m_cBuffer.
+// start       - true on the first call for an entity: resets cBufferPos, the entity
+//               sub-state and EntityUnicodeValue.
+// parsingTag  - enables the IE-compatibility rule for named entities (see EntityName).
+//
+// Returns the updated state. The entity sub-state is NoEntity once the entity has been
+// handled; if |src| runs out first, the current sub-state is kept in the returned state.
+LegacyHTMLDocumentParser::State LegacyHTMLDocumentParser::parseEntity(SegmentedString& src, UChar*& dest, State state, unsigned& cBufferPos, bool start, bool parsingTag)
+{
+ if (start) {
+ cBufferPos = 0;
+ state.setEntityState(SearchEntity);
+ EntityUnicodeValue = 0;
+ }
+
+ while (!src.isEmpty()) {
+ UChar cc = *src;
+ switch (state.entityState()) {
+ case NoEntity:
+ // Not expected to be reached; the assertion below fails whenever this case runs.
+ ASSERT(state.entityState() != NoEntity);
+ return state;
+
+ case SearchEntity:
+ // '#' introduces a numeric reference; anything else is handled as a named entity.
+ if (cc == '#') {
+ m_cBuffer[cBufferPos++] = cc;
+ src.advancePastNonNewline();
+ state.setEntityState(NumericSearch);
+ } else
+ state.setEntityState(EntityName);
+ break;
+
+ case NumericSearch:
+ // 'x'/'X' selects hexadecimal, a digit selects decimal (the digit itself is
+ // consumed by the Decimal case); anything else ends the entity.
+ if (cc == 'x' || cc == 'X') {
+ m_cBuffer[cBufferPos++] = cc;
+ src.advancePastNonNewline();
+ state.setEntityState(Hexadecimal);
+ } else if (cc >= '0' && cc <= '9')
+ state.setEntityState(Decimal);
+ else
+ state.setEntityState(SearchSemicolon);
+ break;
+
+ case Hexadecimal: {
+ // Collect hex digits until m_cBuffer holds 10 characters in total
+ // (including the "#x" prefix stored by the earlier states).
+ int ll = min(src.length(), 10 - cBufferPos);
+ while (ll--) {
+ cc = *src;
+ if (!((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'))) {
+ state.setEntityState(SearchSemicolon);
+ break;
+ }
+ int digit;
+ if (cc < 'A')
+ digit = cc - '0';
+ else
+ digit = (cc - 'A' + 10) & 0xF; // handle both upper and lower case without a branch
+ EntityUnicodeValue = EntityUnicodeValue * 16 + digit;
+ m_cBuffer[cBufferPos++] = cc;
+ src.advancePastNonNewline();
+ }
+ if (cBufferPos == 10)
+ state.setEntityState(SearchSemicolon);
+ break;
+ }
+ case Decimal:
+ {
+ // Collect decimal digits until m_cBuffer holds 9 characters in total
+ // (including the '#' stored by SearchEntity).
+ int ll = min(src.length(), 9-cBufferPos);
+ while (ll--) {
+ cc = *src;
+
+ if (!(cc >= '0' && cc <= '9')) {
+ state.setEntityState(SearchSemicolon);
+ break;
+ }
+
+ EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0');
+ m_cBuffer[cBufferPos++] = cc;
+ src.advancePastNonNewline();
+ }
+ if (cBufferPos == 9)
+ state.setEntityState(SearchSemicolon);
+ break;
+ }
+ case EntityName:
+ {
+ // Collect up to 9 ASCII alphanumeric characters of the entity name.
+ int ll = min(src.length(), 9-cBufferPos);
+ while (ll--) {
+ cc = *src;
+
+ if (!((cc >= 'a' && cc <= 'z') || (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
+ state.setEntityState(SearchSemicolon);
+ break;
+ }
+
+ m_cBuffer[cBufferPos++] = cc;
+ src.advancePastNonNewline();
+ }
+ if (cBufferPos == 9)
+ state.setEntityState(SearchSemicolon);
+ if (state.entityState() == SearchSemicolon) {
+ // The name is complete. Names of a single character are not looked up.
+ if (cBufferPos > 1) {
+ // Since the maximum length of entity name is 9,
+ // so a single char array which is allocated on
+ // the stack, its length is 10, should be OK.
+ // Also if we have an illegal character, we treat it
+ // as illegal entity name.
+ unsigned testedEntityNameLen = 0;
+ char tmpEntityNameBuffer[10];
+
+ ASSERT(cBufferPos < 10);
+ for (; testedEntityNameLen < cBufferPos; ++testedEntityNameLen) {
+ if (m_cBuffer[testedEntityNameLen] > 0x7e)
+ break;
+ tmpEntityNameBuffer[testedEntityNameLen] = m_cBuffer[testedEntityNameLen];
+ }
+
+ const Entity *e;
+
+ if (testedEntityNameLen == cBufferPos)
+ e = findEntity(tmpEntityNameBuffer, cBufferPos);
+ else
+ e = 0;
+
+ if (e)
+ EntityUnicodeValue = e->code;
+
+ // be IE compatible
+ // While parsing a tag, a named entity above 255 is only honored when it is
+ // terminated by ';'.
+ // NOTE(review): |src| may already be empty here if the loop above consumed its
+ // last character; confirm that dereferencing an empty SegmentedString is safe.
+ if (parsingTag && EntityUnicodeValue > 255 && *src != ';')
+ EntityUnicodeValue = 0;
+ }
+ }
+ else
+ break;
+ // Intentional fall-through: a completed name is finished by SearchSemicolon below.
+ }
+ case SearchSemicolon:
+ // Don't allow values that are more than 21 bits.
+ if (EntityUnicodeValue > 0 && EntityUnicodeValue <= 0x10FFFF) {
+ if (!inViewSourceMode()) {
+ // Consume the optional ';' and push the decoded character(s) onto |src|.
+ if (*src == ';')
+ src.advancePastNonNewline();
+ if (EntityUnicodeValue <= 0xFFFF) {
+ checkBuffer();
+ src.push(fixUpChar(EntityUnicodeValue));
+ } else {
+ // Convert to UTF-16, using surrogate code points.
+ checkBuffer(2);
+ src.push(U16_LEAD(EntityUnicodeValue));
+ src.push(U16_TRAIL(EntityUnicodeValue));
+ }
+ } else {
+ // FIXME: We should eventually colorize entities by sending them as a special token.
+ // 12 bytes required: up to 10 bytes in m_cBuffer plus the
+ // leading '&' and trailing ';'
+ checkBuffer(12);
+ *dest++ = '&';
+ for (unsigned i = 0; i < cBufferPos; i++)
+ dest[i] = m_cBuffer[i];
+ dest += cBufferPos;
+ if (*src == ';') {
+ *dest++ = ';';
+ src.advancePastNonNewline();
+ }
+ }
+ } else {
+ // 11 bytes required: up to 10 bytes in m_cBuffer plus the
+ // leading '&'
+ checkBuffer(11);
+ // ignore the sequence, add it to the buffer as plaintext
+ *dest++ = '&';
+ for (unsigned i = 0; i < cBufferPos; i++)
+ dest[i] = m_cBuffer[i];
+ dest += cBufferPos;
+ }
+
+ // The entity has been fully handled, either decoded or emitted literally.
+ state.setEntityState(NoEntity);
+ return state;
+ }
+ }
+
+ // |src| ran out before the entity was complete; the sub-state is kept in |state|.
+ return state;
+}
+
+// Resumable state machine that parses a doctype declaration from |src|.
+//
+// The sub-state is kept in m_doctypeToken.state(); the loop runs while |src| has
+// input and |state| is still flagged as in-doctype. The name, public ID and system
+// ID are accumulated in m_doctypeToken. In view-source mode every consumed character
+// other than the terminating '>' is also appended to m_doctypeToken.m_source.
+//
+// On the terminating '>' the in-doctype flag is cleared. A well-formed doctype is
+// always handed to processDoctypeToken(); a malformed ("bogus") one is handed over
+// only in view-source mode.
+//
+// Returns the updated state. If |src| runs out first, the in-doctype flag stays set
+// and the sub-state remains in m_doctypeToken.
+LegacyHTMLDocumentParser::State LegacyHTMLDocumentParser::parseDoctype(SegmentedString& src, State state)
+{
+    ASSERT(state.inDoctype());
+    while (!src.isEmpty() && state.inDoctype()) {
+        UChar c = *src;
+        bool isWhitespace = c == '\r' || c == '\n' || c == '\t' || c == ' ';
+        switch (m_doctypeToken.state()) {
+        case DoctypeBegin: {
+            // Only whitespace is consumed here; any other character is left for
+            // DoctypeBeforeName to examine on the next iteration.
+            m_doctypeToken.setState(DoctypeBeforeName);
+            if (isWhitespace) {
+                src.advance(m_lineNumber);
+                if (inViewSourceMode())
+                    m_doctypeToken.m_source.append(c);
+            }
+            break;
+        }
+        case DoctypeBeforeName: {
+            if (c == '>') {
+                // Malformed. Just exit.
+                src.advancePastNonNewline();
+                state.setInDoctype(false);
+                if (inViewSourceMode())
+                    processDoctypeToken();
+            } else if (isWhitespace) {
+                src.advance(m_lineNumber);
+                if (inViewSourceMode())
+                    m_doctypeToken.m_source.append(c);
+            } else
+                m_doctypeToken.setState(DoctypeName); // Character is consumed by DoctypeName.
+            break;
+        }
+        case DoctypeName: {
+            if (c == '>') {
+                // Valid doctype. Emit it.
+                src.advancePastNonNewline();
+                state.setInDoctype(false);
+                processDoctypeToken();
+            } else if (isWhitespace) {
+                m_doctypeSearchCount = 0; // Used now to scan for PUBLIC
+                m_doctypeSecondarySearchCount = 0; // Used now to scan for SYSTEM
+                m_doctypeToken.setState(DoctypeAfterName);
+                src.advance(m_lineNumber);
+                if (inViewSourceMode())
+                    m_doctypeToken.m_source.append(c);
+            } else {
+                src.advancePastNonNewline();
+                m_doctypeToken.m_name.append(c);
+                if (inViewSourceMode())
+                    m_doctypeToken.m_source.append(c);
+            }
+            break;
+        }
+        case DoctypeAfterName: {
+            if (c == '>') {
+                // Valid doctype. Emit it.
+                src.advancePastNonNewline();
+                state.setInDoctype(false);
+                processDoctypeToken();
+            } else if (!isWhitespace) {
+                // Match the keyword case-insensitively, one character per iteration.
+                // Any mismatch turns the doctype bogus.
+                src.advancePastNonNewline();
+                if (toASCIILower(c) == publicStart[m_doctypeSearchCount]) {
+                    m_doctypeSearchCount++;
+                    if (m_doctypeSearchCount == 6)
+                        // Found 'PUBLIC' sequence
+                        m_doctypeToken.setState(DoctypeBeforePublicID);
+                } else if (m_doctypeSearchCount > 0) {
+                    m_doctypeSearchCount = 0;
+                    m_doctypeToken.setState(DoctypeBogus);
+                } else if (toASCIILower(c) == systemStart[m_doctypeSecondarySearchCount]) {
+                    m_doctypeSecondarySearchCount++;
+                    if (m_doctypeSecondarySearchCount == 6)
+                        // Found 'SYSTEM' sequence
+                        m_doctypeToken.setState(DoctypeBeforeSystemID);
+                } else {
+                    m_doctypeSecondarySearchCount = 0;
+                    m_doctypeToken.setState(DoctypeBogus);
+                }
+                if (inViewSourceMode())
+                    m_doctypeToken.m_source.append(c);
+            } else {
+                src.advance(m_lineNumber); // Whitespace keeps us in the after name state.
+                if (inViewSourceMode())
+                    m_doctypeToken.m_source.append(c);
+            }
+            break;
+        }
+        case DoctypeBeforePublicID: {
+            if (c == '\"' || c == '\'') {
+                // Remember which quote opened the ID so only the same one closes it.
+                tquote = c == '\"' ? DoubleQuote : SingleQuote;
+                m_doctypeToken.setState(DoctypePublicID);
+                src.advancePastNonNewline();
+                if (inViewSourceMode())
+                    m_doctypeToken.m_source.append(c);
+            } else if (c == '>') {
+                // Considered bogus. Don't process the doctype.
+                src.advancePastNonNewline();
+                state.setInDoctype(false);
+                if (inViewSourceMode())
+                    processDoctypeToken();
+            } else if (isWhitespace) {
+                src.advance(m_lineNumber);
+                if (inViewSourceMode())
+                    m_doctypeToken.m_source.append(c);
+            } else
+                m_doctypeToken.setState(DoctypeBogus);
+            break;
+        }
+        case DoctypePublicID: {
+            if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
+                src.advancePastNonNewline();
+                m_doctypeToken.setState(DoctypeAfterPublicID);
+                if (inViewSourceMode())
+                    m_doctypeToken.m_source.append(c);
+            } else if (c == '>') {
+                // Considered bogus. Don't process the doctype.
+                src.advancePastNonNewline();
+                state.setInDoctype(false);
+                if (inViewSourceMode())
+                    processDoctypeToken();
+            } else {
+                m_doctypeToken.m_publicID.append(c);
+                src.advance(m_lineNumber);
+                if (inViewSourceMode())
+                    m_doctypeToken.m_source.append(c);
+            }
+            break;
+        }
+        case DoctypeAfterPublicID:
+            if (c == '\"' || c == '\'') {
+                tquote = c == '\"' ? DoubleQuote : SingleQuote;
+                m_doctypeToken.setState(DoctypeSystemID);
+                src.advancePastNonNewline();
+                if (inViewSourceMode())
+                    m_doctypeToken.m_source.append(c);
+            } else if (c == '>') {
+                // Valid doctype. Emit it now.
+                src.advancePastNonNewline();
+                state.setInDoctype(false);
+                processDoctypeToken();
+            } else if (isWhitespace) {
+                src.advance(m_lineNumber);
+                if (inViewSourceMode())
+                    m_doctypeToken.m_source.append(c);
+            } else
+                m_doctypeToken.setState(DoctypeBogus);
+            break;
+        case DoctypeBeforeSystemID:
+            if (c == '\"' || c == '\'') {
+                tquote = c == '\"' ? DoubleQuote : SingleQuote;
+                m_doctypeToken.setState(DoctypeSystemID);
+                src.advancePastNonNewline();
+                if (inViewSourceMode())
+                    m_doctypeToken.m_source.append(c);
+            } else if (c == '>') {
+                // Considered bogus. Don't process the doctype. In view-source mode the
+                // token is still handed over, as in every other bogus-exit branch of
+                // this function, so the doctype text collected so far is not dropped.
+                src.advancePastNonNewline();
+                state.setInDoctype(false);
+                if (inViewSourceMode())
+                    processDoctypeToken();
+            } else if (isWhitespace) {
+                src.advance(m_lineNumber);
+                if (inViewSourceMode())
+                    m_doctypeToken.m_source.append(c);
+            } else
+                m_doctypeToken.setState(DoctypeBogus);
+            break;
+        case DoctypeSystemID:
+            if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
+                src.advancePastNonNewline();
+                m_doctypeToken.setState(DoctypeAfterSystemID);
+                if (inViewSourceMode())
+                    m_doctypeToken.m_source.append(c);
+            } else if (c == '>') {
+                // Considered bogus. Don't process the doctype.
+                src.advancePastNonNewline();
+                state.setInDoctype(false);
+                if (inViewSourceMode())
+                    processDoctypeToken();
+            } else {
+                m_doctypeToken.m_systemID.append(c);
+                src.advance(m_lineNumber);
+                if (inViewSourceMode())
+                    m_doctypeToken.m_source.append(c);
+            }
+            break;
+        case DoctypeAfterSystemID:
+            if (c == '>') {
+                // Valid doctype. Emit it now.
+                src.advancePastNonNewline();
+                state.setInDoctype(false);
+                processDoctypeToken();
+            } else if (isWhitespace) {
+                src.advance(m_lineNumber);
+                if (inViewSourceMode())
+                    m_doctypeToken.m_source.append(c);
+            } else
+                m_doctypeToken.setState(DoctypeBogus);
+            break;
+        case DoctypeBogus:
+            if (c == '>') {
+                // Done with the bogus doctype.
+                src.advancePastNonNewline();
+                state.setInDoctype(false);
+                if (inViewSourceMode())
+                    processDoctypeToken();
+            } else {
+                src.advance(m_lineNumber); // Just keep scanning for '>'
+                if (inViewSourceMode())
+                    m_doctypeToken.m_source.append(c);
+            }
+            break;
+        default:
+            break;
+        }
+    }
+    return state;
+}
+
+LegacyHTMLDocumentParser::State LegacyHTMLDocumentParser::parseTag(SegmentedString& src, State state)
+{
+ ASSERT(!state.hasEntityState());
+
+ unsigned cBufferPos = m_cBufferPos;
+
+ bool lastIsSlash = false;
+
+ while (!src.isEmpty()) {
+ checkBuffer();
+ switch (state.tagState()) {
+ case NoTag:
+ {
+ m_cBufferPos = cBufferPos;
+ return state;
+ }
+ case TagName:
+ {
+ if (searchCount > 0) {
+ if (*src == commentStart[searchCount]) {
+ searchCount++;
+ if (searchCount == 2)
+ m_doctypeSearchCount++; // A '!' is also part of a doctype, so we are moving through that still as well.
+ else
+ m_doctypeSearchCount = 0;
+ if (searchCount == 4) {
+ // Found ' as a valid comment, since both mozilla and IE on windows
+ // can handle this case. Only do this in quirks mode. -dwh
+ if (!src.isEmpty() && *src == '>' && document()->inCompatMode()) {
+ state.setInComment(false);
+ src.advancePastNonNewline();
+ if (!src.isEmpty())
+ m_cBuffer[cBufferPos++] = *src;
+ } else
+ state = parseComment(src, state);
+
+ m_cBufferPos = cBufferPos;
+ return state; // Finished parsing tag!
+ }
+ m_cBuffer[cBufferPos++] = *src;
+ src.advancePastNonNewline();
+ break;
+ } else
+ searchCount = 0; // Stop looking for ' or
+ searchCount = 1; // Look for '