diff -r 000000000000 -r 4f2f89ce4247 WebCore/html/HTMLTreeBuilder.cpp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/WebCore/html/HTMLTreeBuilder.cpp Fri Sep 17 09:02:29 2010 +0300
@@ -0,0 +1,2833 @@
+/*
+ * Copyright (C) 2010 Google, Inc. All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "HTMLTreeBuilder.h"
+
+#include "Comment.h"
+#include "DocumentFragment.h"
+#include "DocumentType.h"
+#include "Element.h"
+#include "Frame.h"
+#include "HTMLDocument.h"
+#include "HTMLElementFactory.h"
+#include "HTMLFormElement.h"
+#include "HTMLHtmlElement.h"
+#include "HTMLNames.h"
+#include "HTMLScriptElement.h"
+#include "HTMLToken.h"
+#include "HTMLTokenizer.h"
+#include "LegacyHTMLDocumentParser.h"
+#include "LegacyHTMLTreeBuilder.h"
+#include "LocalizedStrings.h"
+#include "MathMLNames.h"
+#include "NotImplemented.h"
+#include "SVGNames.h"
+#include "ScriptController.h"
+#include "Settings.h"
+#include "Text.h"
+#include "XLinkNames.h"
+#include "XMLNSNames.h"
+#include "XMLNames.h"
+#include
+
+namespace WebCore {
+
+using namespace HTMLNames;
+
+static const int uninitializedLineNumberValue = -1; // Sentinel held by the script start-line members while no line number has been recorded.
+
+namespace {
+
+// Returns true when cc is one of the five characters the tree builder treats
+// as whitespace: tab, line feed (0x0A), form feed (0x0C), carriage return
+// (0x0D) or space.
+inline bool isTreeBuilderWhitepace(UChar cc)
+{
+    switch (cc) {
+    case '\t':
+    case '\x0A':
+    case '\x0C':
+    case '\x0D':
+    case ' ':
+        return true;
+    default:
+        return false;
+    }
+}
+
+// Returns true as soon as any character of |string| is not tree-builder
+// whitespace; an empty string yields false.
+inline bool hasNonWhitespace(const String& string)
+{
+    const UChar* data = string.characters();
+    const unsigned count = string.length();
+    unsigned index = 0;
+    while (index < count) {
+        if (!isTreeBuilderWhitepace(data[index]))
+            return true;
+        ++index;
+    }
+    return false;
+}
+
+// Decides whether tokens should be routed to the legacy tree builder: true
+// when the document has no settings object, or when its settings do not
+// enable the HTML5 tree builder.
+bool shouldUseLegacyTreeBuilder(Document* document)
+{
+    if (!document->settings())
+        return true;
+    return !document->settings()->html5TreeBuilderEnabled();
+}
+
+// Returns true for the numbered heading tags h1 through h6.
+bool isNumberedHeaderTag(const AtomicString& tagName)
+{
+    if (tagName == h1Tag || tagName == h2Tag || tagName == h3Tag)
+        return true;
+    return tagName == h4Tag || tagName == h5Tag || tagName == h6Tag;
+}
+
+// Returns true for caption, col and colgroup.
+bool isCaptionColOrColgroupTag(const AtomicString& tagName)
+{
+    if (tagName == captionTag)
+        return true;
+    return tagName == colTag || tagName == colgroupTag;
+}
+
+// Returns true for the two table cell tags, th and td.
+bool isTableCellContextTag(const AtomicString& tagName)
+{
+    if (tagName == thTag)
+        return true;
+    return tagName == tdTag;
+}
+
+// Returns true for the table row-group tags: tbody, tfoot and thead.
+bool isTableBodyContextTag(const AtomicString& tagName)
+{
+    if (tagName == tbodyTag)
+        return true;
+    return tagName == tfootTag || tagName == theadTag;
+}
+
+// Returns true when tagName is one of the tags listed below, which follow the "special" category of the HTML5 parsing spec: http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#special
+bool isSpecialTag(const AtomicString& tagName)
+{
+ return tagName == addressTag
+ || tagName == articleTag
+ || tagName == asideTag
+ || tagName == baseTag
+ || tagName == basefontTag
+ || tagName == "bgsound" // NOTE(review): compared against a string literal; presumably HTMLNames has no constant for this tag - confirm.
+ || tagName == blockquoteTag
+ || tagName == bodyTag
+ || tagName == brTag
+ || tagName == buttonTag
+ || tagName == centerTag
+ || tagName == colTag
+ || tagName == colgroupTag
+ || tagName == "command" // String literal comparison, as for "bgsound".
+ || tagName == ddTag
+ || tagName == "details" // String literal comparison, as for "bgsound".
+ || tagName == dirTag
+ || tagName == divTag
+ || tagName == dlTag
+ || tagName == dtTag
+ || tagName == embedTag
+ || tagName == fieldsetTag
+ || tagName == "figure" // String literal comparison, as for "bgsound".
+ || tagName == footerTag
+ || tagName == formTag
+ || tagName == frameTag
+ || tagName == framesetTag
+ || isNumberedHeaderTag(tagName) // h1 through h6.
+ || tagName == headTag
+ || tagName == headerTag
+ || tagName == hgroupTag
+ || tagName == hrTag
+ || tagName == iframeTag
+ || tagName == imgTag
+ || tagName == inputTag
+ || tagName == isindexTag
+ || tagName == liTag
+ || tagName == linkTag
+ || tagName == listingTag
+ || tagName == menuTag
+ || tagName == metaTag
+ || tagName == navTag
+ || tagName == noembedTag
+ || tagName == noframesTag
+ || tagName == noscriptTag
+ || tagName == olTag
+ || tagName == pTag
+ || tagName == paramTag
+ || tagName == plaintextTag
+ || tagName == preTag
+ || tagName == scriptTag
+ || tagName == sectionTag
+ || tagName == selectTag
+ || tagName == styleTag
+ || isTableBodyContextTag(tagName) // tbody, tfoot and thead.
+ || tagName == textareaTag
+ || tagName == titleTag
+ || tagName == trTag
+ || tagName == ulTag
+ || tagName == wbrTag
+ || tagName == xmpTag;
+}
+
+// Returns true for tags in the "scoping" category of the HTML5 parsing spec:
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#scoping
+// Same as isScopingTag in LegacyHTMLTreeBuilder.cpp
+// and isScopeMarker in HTMLElementStack.cpp
+bool isScopingTag(const AtomicString& tagName)
+{
+    if (tagName == appletTag || tagName == captionTag)
+        return true;
+    if (tagName == SVGNames::foreignObjectTag || tagName == htmlTag)
+        return true;
+    if (tagName == marqueeTag || tagName == objectTag || tagName == tableTag)
+        return true;
+    // th and td.
+    return isTableCellContextTag(tagName);
+}
+
+// Returns true for the formatting tags other than a and nobr:
+// b, big, code, em, font, i, s, small, strike, strong, tt and u.
+bool isNonAnchorNonNobrFormattingTag(const AtomicString& tagName)
+{
+    if (tagName == bTag || tagName == bigTag || tagName == codeTag || tagName == emTag)
+        return true;
+    if (tagName == fontTag || tagName == iTag || tagName == sTag || tagName == smallTag)
+        return true;
+    return tagName == strikeTag || tagName == strongTag || tagName == ttTag || tagName == uTag;
+}
+
+// Returns true for nobr and for every tag accepted by
+// isNonAnchorNonNobrFormattingTag, i.e. all formatting tags except a.
+bool isNonAnchorFormattingTag(const AtomicString& tagName)
+{
+    if (tagName == nobrTag)
+        return true;
+    return isNonAnchorNonNobrFormattingTag(tagName);
+}
+
+// Returns true for tags in the "formatting" category of the HTML5 parsing spec:
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#formatting
+// That is the a tag plus everything isNonAnchorFormattingTag accepts.
+bool isFormattingTag(const AtomicString& tagName)
+{
+    if (tagName == aTag)
+        return true;
+    return isNonAnchorFormattingTag(tagName);
+}
+
+// Returns true for tags in the "phrasing" category of the HTML5 parsing spec:
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#phrasing
+// Computed by exclusion: a tag is phrasing when it is neither special, nor
+// scoping, nor formatting.
+bool isPhrasingTag(const AtomicString& tagName)
+{
+    return !(isSpecialTag(tagName) || isScopingTag(tagName) || isFormattingTag(tagName));
+}
+
+// Answers the spec's recurring question "node is not in the formatting
+// category, and is not in the phrasing category". Since every tag is exactly
+// one of special, scoping, formatting or phrasing, that is equivalent to
+// "scoping or special", which is cheaper to evaluate.
+bool isNotFormattingAndNotPhrasing(const Element* element)
+{
+    // FIXME: localName() is wrong for non-html content.
+    const AtomicString& localName = element->localName();
+    if (isScopingTag(localName))
+        return true;
+    return isSpecialTag(localName);
+}
+
+} // namespace
+
+// Cursor over the UChar data of a character token (or a String) that the
+// tree builder consumes piece by piece. The buffer copies nothing: it only
+// keeps pointers into the storage of the token or string it was constructed
+// from, so that source has to outlive the buffer.
+//
+// Contract checked by the ASSERTs: the buffer must be constructed from
+// non-empty data, the take*/skip* operations must not be called once it is
+// empty, and it must have been fully consumed by the time it is destroyed.
+class HTMLTreeBuilder::ExternalCharacterTokenBuffer : public Noncopyable {
+public:
+    // Wraps the character data of |token|.
+    explicit ExternalCharacterTokenBuffer(AtomicHTMLToken& token)
+        : m_current(token.characters().data())
+        , m_end(m_current + token.characters().size())
+    {
+        ASSERT(!isEmpty());
+    }
+
+    // Wraps the characters of |string|.
+    explicit ExternalCharacterTokenBuffer(const String& string)
+        : m_current(string.characters())
+        , m_end(m_current + string.length())
+    {
+        ASSERT(!isEmpty());
+    }
+
+    ~ExternalCharacterTokenBuffer()
+    {
+        // Catches callers that forgot to consume part of the token.
+        ASSERT(isEmpty());
+    }
+
+    // True once every character has been consumed.
+    bool isEmpty() const { return m_current == m_end; }
+
+    // Advances past any run of tree-builder whitespace at the cursor. May
+    // leave the buffer empty if only whitespace remained.
+    void skipLeadingWhitespace()
+    {
+        ASSERT(!isEmpty());
+        while (isTreeBuilderWhitepace(*m_current)) {
+            if (++m_current == m_end)
+                return;
+        }
+    }
+
+    // Consumes the leading whitespace run and returns it. Returns the null
+    // String when the next character is not whitespace.
+    String takeLeadingWhitespace()
+    {
+        ASSERT(!isEmpty());
+        const UChar* start = m_current;
+        skipLeadingWhitespace();
+        if (start == m_current)
+            return String();
+        return String(start, m_current - start);
+    }
+
+    // Consumes and returns everything from the cursor to the end.
+    String takeRemaining()
+    {
+        ASSERT(!isEmpty());
+        const UChar* start = m_current;
+        m_current = m_end;
+        return String(start, m_current - start);
+    }
+
+    // Appends everything from the cursor to the end onto |recipient| and
+    // empties the buffer. Unlike the take* functions this does not assert
+    // that the buffer is non-empty.
+    void giveRemainingTo(Vector<UChar>& recipient)
+    {
+        recipient.append(m_current, m_end - m_current);
+        m_current = m_end;
+    }
+
+    // Consumes everything from the cursor to the end and returns only the
+    // whitespace characters found there, in order; non-whitespace characters
+    // are dropped.
+    String takeRemainingWhitespace()
+    {
+        ASSERT(!isEmpty());
+        Vector<UChar> whitespace;
+        do {
+            UChar cc = *m_current++;
+            if (isTreeBuilderWhitepace(cc))
+                whitespace.append(cc);
+        } while (m_current < m_end);
+        // Returning the null string when there aren't any whitespace
+        // characters is slightly cleaner semantically because we don't want
+        // to insert a text node (as opposed to inserting an empty text node).
+        if (whitespace.isEmpty())
+            return String();
+        return String::adopt(whitespace);
+    }
+
+private:
+    // Next character to consume. Declared before m_end because both
+    // constructors compute m_end from m_current in their initializer lists.
+    const UChar* m_current;
+    // One past the last character.
+    const UChar* m_end;
+};
+
+
+HTMLTreeBuilder::HTMLTreeBuilder(HTMLTokenizer* tokenizer, HTMLDocument* document, bool reportErrors) // Sets up a tree builder that parses into a whole document (m_isParsingFragment is false).
+ : m_framesetOk(true)
+ , m_document(document)
+ , m_tree(document, FragmentScriptingAllowed, false) // NOTE(review): the trailing bool is presumably the "is parsing fragment" flag - confirm against the m_tree type's constructor.
+ , m_reportErrors(reportErrors)
+ , m_isPaused(false)
+ , m_insertionMode(InitialMode)
+ , m_originalInsertionMode(InitialMode)
+ , m_secondaryInsertionMode(InitialMode)
+ , m_tokenizer(tokenizer)
+ , m_legacyTreeBuilder(shouldUseLegacyTreeBuilder(document) ? new LegacyHTMLTreeBuilder(document, reportErrors) : 0) // Created only when the document has no settings or the HTML5 tree builder setting is off; otherwise null.
+ , m_lastScriptElementStartLine(uninitializedLineNumberValue)
+ , m_scriptToProcessStartLine(uninitializedLineNumberValue)
+ , m_fragmentScriptingPermission(FragmentScriptingAllowed)
+ , m_isParsingFragment(false)
+{
+}
+
+// FIXME: Member variables should be grouped into self-initializing structs to
+// minimize code duplication between these constructors.
+HTMLTreeBuilder::HTMLTreeBuilder(HTMLTokenizer* tokenizer, DocumentFragment* fragment, FragmentScriptingPermission scriptingPermission) // Sets up a tree builder that parses into a DocumentFragment (m_isParsingFragment is true).
+ : m_framesetOk(true)
+ , m_document(fragment->document())
+ , m_tree(fragment->document(), scriptingPermission, true) // NOTE(review): the trailing bool is presumably the "is parsing fragment" flag - confirm against the m_tree type's constructor.
+ , m_reportErrors(false) // FIXME: Why not report errors in fragments?
+ , m_isPaused(false)
+ , m_insertionMode(InitialMode)
+ , m_originalInsertionMode(InitialMode)
+ , m_secondaryInsertionMode(InitialMode)
+ , m_tokenizer(tokenizer)
+ , m_legacyTreeBuilder(new LegacyHTMLTreeBuilder(fragment, scriptingPermission)) // Unlike the document constructor, a legacy tree builder is always created here.
+ , m_lastScriptElementStartLine(uninitializedLineNumberValue)
+ , m_scriptToProcessStartLine(uninitializedLineNumberValue)
+ , m_fragmentScriptingPermission(scriptingPermission)
+ , m_isParsingFragment(true)
+{
+}
+
+HTMLTreeBuilder::~HTMLTreeBuilder() // Out-of-line destructor with an intentionally empty body.
+{
+}
+
+// Fills |oldStyleToken|, the token structure consumed by the legacy tree
+// builder, from |token|. Only StartTag, EndTag, Comment and Character tokens
+// are expected here; any other token type trips ASSERT_NOT_REACHED and leaves
+// |oldStyleToken| untouched.
+static void convertToOldStyle(AtomicHTMLToken& token, Token& oldStyleToken)
+{
+    switch (token.type()) {
+    case HTMLToken::Character:
+        oldStyleToken.tagName = textAtom;
+        oldStyleToken.text = StringImpl::create(token.characters().data(), token.characters().size());
+        break;
+    case HTMLToken::Comment:
+        oldStyleToken.tagName = commentAtom;
+        oldStyleToken.text = token.comment().impl();
+        break;
+    case HTMLToken::StartTag:
+    case HTMLToken::EndTag:
+        // The LegacyHTMLTreeBuilder seems to work better if we lie here and
+        // say that tags are never self closing. As a wise man once said:
+        // "You can't handle the truth!"
+        oldStyleToken.selfClosingTag = false;
+        oldStyleToken.beginTag = (token.type() == HTMLToken::StartTag);
+        oldStyleToken.tagName = token.name();
+        oldStyleToken.attrs = token.takeAtributes();
+        break;
+    case HTMLToken::EndOfFile:
+        ASSERT_NOT_REACHED();
+        notImplemented();
+        break;
+    case HTMLToken::Uninitialized:
+    case HTMLToken::DOCTYPE:
+        ASSERT_NOT_REACHED();
+        break;
+    }
+}
+
+void HTMLTreeBuilder::handleScriptStartTag() // Switches the tokenizer to ScriptDataState; the remaining steps are still stubs.
+{
+ notImplemented(); // The HTML fragment case?
+ m_tokenizer->setState(HTMLTokenizer::ScriptDataState);
+ notImplemented(); // Save insertion mode.
+}
+
+// Records |scriptElement| as the script awaiting execution and pauses the tree
+// builder so the caller can run it. |scriptStartLine| is the 0-based line
+// number on which the script started.
+void HTMLTreeBuilder::handleScriptEndTag(Element* scriptElement, int scriptStartLine)
+{
+    // Both asserts fire when a previous script is still pending, i.e. the
+    // caller never called takeScriptToProcess.
+    ASSERT(!m_scriptToProcess);
+    ASSERT(m_scriptToProcessStartLine == uninitializedLineNumberValue);
+    notImplemented(); // Save insertion mode and insertion point?
+
+    m_scriptToProcess = scriptElement;
+    // Lexer line numbers are 0-based while ScriptSourceCode expects 1-based
+    // lines, so convert before the number is handed to HTMLScriptRunner.
+    m_scriptToProcessStartLine = 1 + scriptStartLine;
+    // Stop parsing until the caller has processed the script.
+    m_isPaused = true;
+}
+
+// Transfers the pending script element to the caller and unpauses the tree
+// builder.
+//
+// scriptStartLine: out-parameter that receives the start line stored by
+//     handleScriptEndTag (1-based). The stored value is then reset to
+//     uninitializedLineNumberValue.
+// Returns the element stored in m_scriptToProcess, which is released (left
+// null) by this call.
+PassRefPtr<Element> HTMLTreeBuilder::takeScriptToProcess(int& scriptStartLine)
+{
+    // Unpause ourselves, callers may pause us again when processing the script.
+    // The HTML5 spec is written as though scripts are executed inside the tree
+    // builder. We pause the parser to exit the tree builder, and then resume
+    // before running scripts.
+    m_isPaused = false;
+    scriptStartLine = m_scriptToProcessStartLine;
+    m_scriptToProcessStartLine = uninitializedLineNumberValue;
+    return m_scriptToProcess.release();
+}
+
+// Maps a start tag name to the tokenizer state that should be used for its
+// content:
+//   textarea, title                              -> RCDATAState
+//   style, iframe, xmp, noembed, noframes        -> RAWTEXTState
+//   noscript (only when scripting is enabled)    -> RAWTEXTState
+//   plaintext                                    -> PLAINTEXTState
+// Every other tag leaves |state| unchanged.
+HTMLTokenizer::State HTMLTreeBuilder::adjustedLexerState(HTMLTokenizer::State state, const AtomicString& tagName, Frame* frame)
+{
+    if (tagName == plaintextTag)
+        return HTMLTokenizer::PLAINTEXTState;
+
+    if (tagName == titleTag || tagName == textareaTag)
+        return HTMLTokenizer::RCDATAState;
+
+    // The scripting flag is only consulted for noscript.
+    if (tagName == noscriptTag)
+        return isScriptingFlagEnabled(frame) ? HTMLTokenizer::RAWTEXTState : state;
+
+    if (tagName == styleTag || tagName == iframeTag || tagName == xmpTag)
+        return HTMLTokenizer::RAWTEXTState;
+    if (tagName == noembedTag || tagName == noframesTag)
+        return HTMLTokenizer::RAWTEXTState;
+
+    return state;
+}
+
+void HTMLTreeBuilder::passTokenToLegacyParser(HTMLToken& token)
+{
+ if (token.type() == HTMLToken::DOCTYPE) {
+ DoctypeToken doctypeToken;
+ doctypeToken.m_name.append(token.name().data(), token.name().size());
+ doctypeToken.m_publicID = token.publicIdentifier();
+ doctypeToken.m_systemID = token.systemIdentifier();
+ doctypeToken.m_forceQuirks = token.forceQuirks();
+
+ m_legacyTreeBuilder->parseDoctypeToken(&doctypeToken);
+ return;
+ }
+
+ if (token.type() == HTMLToken::EndOfFile)
+ return;
+
+ // For now, we translate into an old-style token for testing.
+ Token oldStyleToken;
+ AtomicHTMLToken atomicToken(token);
+ convertToOldStyle(atomicToken, oldStyleToken);
+
+ RefPtr result = m_legacyTreeBuilder->parseToken(&oldStyleToken);
+ if (token.type() == HTMLToken::StartTag) {
+ // This work is supposed to be done by the parser, but
+ // when using the old parser for we have to do this manually.
+ if (oldStyleToken.tagName == scriptTag) {
+ handleScriptStartTag();
+ m_lastScriptElement = static_pointer_cast(result);
+ m_lastScriptElementStartLine = m_tokenizer->lineNumber();
+ } else if (oldStyleToken.tagName == preTag || oldStyleToken.tagName == listingTag)
+ m_tokenizer->skipLeadingNewLineForListing();
+ else
+ m_tokenizer->setState(adjustedLexerState(m_tokenizer->state(), oldStyleToken.tagName, m_document->frame()));
+ } else if (token.type() == HTMLToken::EndTag) {
+ if (oldStyleToken.tagName == scriptTag) {
+ if (m_lastScriptElement) {
+ ASSERT(m_lastScriptElementStartLine != uninitializedLineNumberValue);
+ if (m_fragmentScriptingPermission == FragmentScriptingNotAllowed) {
+ // FIXME: This is a horrible hack for platform/Pasteboard.
+ // Clear the