diff -r 000000000000 -r 4f2f89ce4247 WebCore/html/HTMLTreeBuilder.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/WebCore/html/HTMLTreeBuilder.cpp Fri Sep 17 09:02:29 2010 +0300 @@ -0,0 +1,2833 @@ +/* + * Copyright (C) 2010 Google, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "config.h"
+#include "HTMLTreeBuilder.h"
+
+#include "Comment.h"
+#include "DocumentFragment.h"
+#include "DocumentType.h"
+#include "Element.h"
+#include "Frame.h"
+#include "HTMLDocument.h"
+#include "HTMLElementFactory.h"
+#include "HTMLFormElement.h"
+#include "HTMLHtmlElement.h"
+#include "HTMLNames.h"
+#include "HTMLScriptElement.h"
+#include "HTMLToken.h"
+#include "HTMLTokenizer.h"
+#include "LegacyHTMLDocumentParser.h"
+#include "LegacyHTMLTreeBuilder.h"
+#include "LocalizedStrings.h"
+#include "MathMLNames.h"
+#include "NotImplemented.h"
+#include "SVGNames.h"
+#include "ScriptController.h"
+#include "Settings.h"
+#include "Text.h"
+#include "XLinkNames.h"
+#include "XMLNSNames.h"
+#include "XMLNames.h"
+#include // NOTE(review): the header name is missing in this copy (presumably an angle-bracket include) -- restore from upstream.
+
+namespace WebCore {
+
+using namespace HTMLNames;
+
+// Sentinel held by m_lastScriptElementStartLine and m_scriptToProcessStartLine
+// while no line number has been recorded.
+static const int uninitializedLineNumberValue = -1;
+
+namespace {
+
+// Returns true for tab, LF (U+000A), FF (U+000C), CR (U+000D) and space.
+inline bool isTreeBuilderWhitepace(UChar cc)
+{
+    return cc == '\t' || cc == '\x0A' || cc == '\x0C' || cc == '\x0D' || cc == ' ';
+}
+
+// Returns true if |string| contains at least one character for which
+// isTreeBuilderWhitepace() is false. An empty string yields false.
+inline bool hasNonWhitespace(const String& string)
+{
+    const UChar* characters = string.characters();
+    const unsigned length = string.length();
+    for (unsigned i = 0; i < length; ++i) {
+        if (!isTreeBuilderWhitepace(characters[i]))
+            return true;
+    }
+    return false;
+}
+
+// Returns true when |document| has no Settings object or when its
+// html5TreeBuilderEnabled() setting is off. |document| is dereferenced
+// unconditionally.
+bool shouldUseLegacyTreeBuilder(Document* document)
+{
+    return !document->settings() || !document->settings()->html5TreeBuilderEnabled();
+}
+
+// Returns true for h1 through h6.
+bool isNumberedHeaderTag(const AtomicString& tagName)
+{
+    return tagName == h1Tag
+        || tagName == h2Tag
+        || tagName == h3Tag
+        || tagName == h4Tag
+        || tagName == h5Tag
+        || tagName == h6Tag;
+}
+
+// Returns true for caption, col and colgroup.
+bool isCaptionColOrColgroupTag(const AtomicString& tagName)
+{
+    return tagName == captionTag
+        || tagName == colTag
+        || tagName == colgroupTag;
+}
+
+// Returns true for th and td.
+bool isTableCellContextTag(const AtomicString& tagName)
+{
+    return tagName == thTag || tagName == tdTag;
+}
+
+// Returns true for tbody, tfoot and thead.
+bool isTableBodyContextTag(const AtomicString& tagName)
+{
+    return tagName == tbodyTag
+        || tagName == tfootTag
+        || tagName == theadTag;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#special
+// Returns true if |tagName| is one of the tags listed below. Note that
+// "bgsound", "command", "details" and "figure" are compared as string literals
+// rather than against HTMLNames constants.
+bool isSpecialTag(const AtomicString& tagName)
+{
+    return tagName == addressTag
+        || tagName == articleTag
+        || tagName == asideTag
+        || tagName == baseTag
+        || tagName == basefontTag
+        || tagName == "bgsound"
+        || tagName == blockquoteTag
+        || tagName == bodyTag
+        || tagName == brTag
+        || tagName == buttonTag
+        || tagName == centerTag
+        || tagName == colTag
+        || tagName == colgroupTag
+        || tagName == "command"
+        || tagName == ddTag
+        || tagName == "details"
+        || tagName == dirTag
+        || tagName == divTag
+        || tagName == dlTag
+        || tagName == dtTag
+        || tagName == embedTag
+        || tagName == fieldsetTag
+        || tagName == "figure"
+        || tagName == footerTag
+        || tagName == formTag
+        || tagName == frameTag
+        || tagName == framesetTag
+        || isNumberedHeaderTag(tagName)
+        || tagName == headTag
+        || tagName == headerTag
+        || tagName == hgroupTag
+        || tagName == hrTag
+        || tagName == iframeTag
+        || tagName == imgTag
+        || tagName == inputTag
+        || tagName == isindexTag
+        || tagName == liTag
+        || tagName == linkTag
+        || tagName == listingTag
+        || tagName == menuTag
+        || tagName == metaTag
+        || tagName == navTag
+        || tagName == noembedTag
+        || tagName == noframesTag
+        || tagName == noscriptTag
+        || tagName == olTag
+        || tagName == pTag
+        || tagName == paramTag
+        || tagName == plaintextTag
+        || tagName == preTag
+        || tagName == scriptTag
+        || tagName == sectionTag
+        || tagName == selectTag
+        || tagName == styleTag
+        || isTableBodyContextTag(tagName)
+        || tagName == textareaTag
+        || tagName == titleTag
+        || tagName == trTag
+        || tagName == ulTag
+        || tagName == wbrTag
+        || tagName == xmpTag;
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#scoping
+// Same as isScopingTag in LegacyHTMLTreeBuilder.cpp
+// and isScopeMarker in HTMLElementStack.cpp
+// Returns true for applet, caption, SVG foreignObject, html, marquee, object,
+// table, th and td.
+bool isScopingTag(const AtomicString& tagName)
+{
+    return tagName == appletTag
+        || tagName == captionTag
+        || tagName == SVGNames::foreignObjectTag
+        || tagName == htmlTag
+        || tagName == marqueeTag
+        || tagName == objectTag
+        || tagName == tableTag
+        || isTableCellContextTag(tagName);
+}
+
+// Returns true for the formatting tags other than a and nobr:
+// b, big, code, em, font, i, s, small, strike, strong, tt and u.
+bool isNonAnchorNonNobrFormattingTag(const AtomicString& tagName)
+{
+    return tagName == bTag
+        || tagName == bigTag
+        || tagName == codeTag
+        || tagName == emTag
+        || tagName == fontTag
+        || tagName == iTag
+        || tagName == sTag
+        || tagName == smallTag
+        || tagName == strikeTag
+        || tagName == strongTag
+        || tagName == ttTag
+        || tagName == uTag;
+}
+
+// Returns true for nobr and for every tag accepted by
+// isNonAnchorNonNobrFormattingTag().
+bool isNonAnchorFormattingTag(const AtomicString& tagName)
+{
+    return tagName == nobrTag
+        || isNonAnchorNonNobrFormattingTag(tagName);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#formatting
+// Returns true for a and for every tag accepted by isNonAnchorFormattingTag().
+bool isFormattingTag(const AtomicString& tagName)
+{
+    return tagName == aTag || isNonAnchorFormattingTag(tagName);
+}
+
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#phrasing
+// Returns true for any tag that is neither special, scoping nor formatting.
+bool isPhrasingTag(const AtomicString& tagName)
+{
+    return !isSpecialTag(tagName) && !isScopingTag(tagName) && !isFormattingTag(tagName);
+}
+
+// Returns true if |element|'s local name is a scoping or a special tag.
+// |element| is dereferenced unconditionally.
+bool isNotFormattingAndNotPhrasing(const Element* element)
+{
+    // The spec often says "node is not in the formatting category, and is not
+    // in the phrasing category".  !phrasing && !formatting == scoping || special
+    // scoping || special is easier to compute.
+    // FIXME: localName() is wrong for non-html content.
+    const AtomicString& tagName = element->localName();
+    return isScopingTag(tagName) || isSpecialTag(tagName);
+}
+
+} // namespace
+
+// A cursor over a run of UChars belonging to a character token or a String.
+// The characters are not copied: only pointers into the source's buffer are
+// kept. The buffer must be non-empty when constructed and must have been fully
+// consumed by the time it is destroyed (both are ASSERTed).
+class HTMLTreeBuilder::ExternalCharacterTokenBuffer : public Noncopyable {
+public:
+    explicit ExternalCharacterTokenBuffer(AtomicHTMLToken& token)
+        : m_current(token.characters().data())
+        , m_end(m_current + token.characters().size())
+    {
+        ASSERT(!isEmpty());
+    }
+
+    explicit ExternalCharacterTokenBuffer(const String& string)
+        : m_current(string.characters())
+        , m_end(m_current + string.length())
+    {
+        ASSERT(!isEmpty());
+    }
+
+    ~ExternalCharacterTokenBuffer()
+    {
+        ASSERT(isEmpty());
+    }
+
+    // True once every character has been consumed.
+    bool isEmpty() const { return m_current == m_end; }
+
+    // Advances past leading whitespace, stopping at the first non-whitespace
+    // character or at the end of the buffer.
+    void skipLeadingWhitespace()
+    {
+        ASSERT(!isEmpty());
+        while (isTreeBuilderWhitepace(*m_current)) {
+            // Check for the end before the loop condition reads *m_current again.
+            if (++m_current == m_end)
+                return;
+        }
+    }
+
+    // Consumes the leading whitespace run and returns it, or returns the null
+    // String when the buffer does not start with whitespace.
+    String takeLeadingWhitespace()
+    {
+        ASSERT(!isEmpty());
+        const UChar* start = m_current;
+        skipLeadingWhitespace();
+        if (start == m_current)
+            return String();
+        return String(start, m_current - start);
+    }
+
+    // Consumes and returns everything that is left.
+    String takeRemaining()
+    {
+        ASSERT(!isEmpty());
+        const UChar* start = m_current;
+        m_current = m_end;
+        return String(start, m_current - start);
+    }
+
+    // Appends everything that is left to |recipient| and empties the buffer.
+    // Unlike the take*() functions this does not ASSERT that the buffer is
+    // non-empty.
+    // NOTE(review): the Vector template arguments were missing in this copy;
+    // UChar is inferred from the const UChar* range appended below -- confirm
+    // the exact arguments against upstream.
+    void giveRemainingTo(Vector<UChar>& recipient)
+    {
+        recipient.append(m_current, m_end - m_current);
+        m_current = m_end;
+    }
+
+    // Consumes the rest of the buffer and returns only the whitespace
+    // characters found in it, in their original order. Non-whitespace
+    // characters are dropped.
+    String takeRemainingWhitespace()
+    {
+        ASSERT(!isEmpty());
+        // NOTE(review): element type restored from the UChar values appended
+        // below; the template arguments were missing in this copy.
+        Vector<UChar> whitespace;
+        do {
+            UChar cc = *m_current++;
+            if (isTreeBuilderWhitepace(cc))
+                whitespace.append(cc);
+        } while (m_current < m_end);
+        // Returning the null string when there aren't any whitespace
+        // characters is slightly cleaner semantically because we don't want
+        // to insert a text node (as opposed to inserting an empty text node).
+        if (whitespace.isEmpty())
+            return String();
+        return String::adopt(whitespace);
+    }
+
+private:
+    const UChar* m_current; // Next character to consume.
+    const UChar* m_end; // One past the last character.
+};
+
+
+// Document-parsing constructor. m_isParsingFragment is false, and a
+// LegacyHTMLTreeBuilder is created only when shouldUseLegacyTreeBuilder(document)
+// returns true; otherwise m_legacyTreeBuilder is initialized with 0.
+HTMLTreeBuilder::HTMLTreeBuilder(HTMLTokenizer* tokenizer, HTMLDocument* document, bool reportErrors)
+    : m_framesetOk(true)
+    , m_document(document)
+    , m_tree(document, FragmentScriptingAllowed, false)
+    , m_reportErrors(reportErrors)
+    , m_isPaused(false)
+    , m_insertionMode(InitialMode)
+    , m_originalInsertionMode(InitialMode)
+    , m_secondaryInsertionMode(InitialMode)
+    , m_tokenizer(tokenizer)
+    , m_legacyTreeBuilder(shouldUseLegacyTreeBuilder(document) ? new LegacyHTMLTreeBuilder(document, reportErrors) : 0)
+    , m_lastScriptElementStartLine(uninitializedLineNumberValue)
+    , m_scriptToProcessStartLine(uninitializedLineNumberValue)
+    , m_fragmentScriptingPermission(FragmentScriptingAllowed)
+    , m_isParsingFragment(false)
+{
+}
+
+// Fragment-parsing constructor. It uses fragment->document() as the document,
+// always creates a LegacyHTMLTreeBuilder, never reports errors and sets
+// m_isParsingFragment to true. |fragment| is dereferenced unconditionally.
+// FIXME: Member variables should be grouped into self-initializing structs to
+// minimize code duplication between these constructors.
+HTMLTreeBuilder::HTMLTreeBuilder(HTMLTokenizer* tokenizer, DocumentFragment* fragment, FragmentScriptingPermission scriptingPermission)
+    : m_framesetOk(true)
+    , m_document(fragment->document())
+    , m_tree(fragment->document(), scriptingPermission, true)
+    , m_reportErrors(false) // FIXME: Why not report errors in fragments?
+    , m_isPaused(false)
+    , m_insertionMode(InitialMode)
+    , m_originalInsertionMode(InitialMode)
+    , m_secondaryInsertionMode(InitialMode)
+    , m_tokenizer(tokenizer)
+    , m_legacyTreeBuilder(new LegacyHTMLTreeBuilder(fragment, scriptingPermission))
+    , m_lastScriptElementStartLine(uninitializedLineNumberValue)
+    , m_scriptToProcessStartLine(uninitializedLineNumberValue)
+    , m_fragmentScriptingPermission(scriptingPermission)
+    , m_isParsingFragment(true)
+{
+}
+
+HTMLTreeBuilder::~HTMLTreeBuilder()
+{
+}
+
+// Fills the legacy-style |oldStyleToken| from |token|. Only StartTag, EndTag,
+// Comment and Character tokens are converted; Uninitialized, DOCTYPE and
+// EndOfFile hit ASSERT_NOT_REACHED(). For tag tokens the attributes are taken
+// from |token| via takeAtributes().
+static void convertToOldStyle(AtomicHTMLToken& token, Token& oldStyleToken)
+{
+    switch (token.type()) {
+    case HTMLToken::Uninitialized:
+    case HTMLToken::DOCTYPE:
+        ASSERT_NOT_REACHED();
+        break;
+    case HTMLToken::EndOfFile:
+        ASSERT_NOT_REACHED();
+        notImplemented();
+        break;
+    case HTMLToken::StartTag:
+    case HTMLToken::EndTag: {
+        oldStyleToken.beginTag = (token.type() == HTMLToken::StartTag);
+        // The LegacyHTMLTreeBuilder seems to work better if we lie here and
+        // say that tags are never self closing.  As a wise man once said:
+        // "You can't handle the truth!"
+        oldStyleToken.selfClosingTag = false;
+        oldStyleToken.tagName = token.name();
+        oldStyleToken.attrs = token.takeAtributes();
+        break;
+    }
+    case HTMLToken::Comment:
+        oldStyleToken.tagName = commentAtom;
+        oldStyleToken.text = token.comment().impl();
+        break;
+    case HTMLToken::Character:
+        oldStyleToken.tagName = textAtom;
+        oldStyleToken.text = StringImpl::create(token.characters().data(), token.characters().size());
+        break;
+    }
+}
+
+// Switches the tokenizer to ScriptDataState. The remaining steps are still
+// marked notImplemented().
+void HTMLTreeBuilder::handleScriptStartTag()
+{
+    notImplemented(); // The HTML fragment case?
+    m_tokenizer->setState(HTMLTokenizer::ScriptDataState);
+    notImplemented(); // Save insertion mode.
+}
+
+// Records |scriptElement| as the script waiting to be processed and pauses the
+// tree builder. |scriptStartLine| is the 0-based line of the script; it is
+// stored 1-based. ASSERTs that no earlier script is still waiting.
+void HTMLTreeBuilder::handleScriptEndTag(Element* scriptElement, int scriptStartLine)
+{
+    ASSERT(!m_scriptToProcess); // Caller never called takeScriptToProcess!
+    ASSERT(m_scriptToProcessStartLine == uninitializedLineNumberValue); // Caller never called takeScriptToProcess!
+    notImplemented(); // Save insertion mode and insertion point?
+
+    // Pause ourselves so that parsing stops until the script can be processed by the caller.
+    m_isPaused = true;
+    m_scriptToProcess = scriptElement;
+    // Lexer line numbers are 0-based, ScriptSourceCode expects 1-based lines,
+    // so we convert here before passing the line number off to HTMLScriptRunner.
+    m_scriptToProcessStartLine = scriptStartLine + 1;
+}
+
+// Hands the waiting script element to the caller and unpauses the tree builder.
+// |scriptStartLine| receives the stored (1-based) start line, and the stored
+// line is reset to uninitializedLineNumberValue.
+// NOTE(review): the PassRefPtr template argument was missing in this copy;
+// Element is inferred from handleScriptEndTag() storing an Element* into
+// m_scriptToProcess -- confirm against HTMLTreeBuilder.h.
+PassRefPtr<Element> HTMLTreeBuilder::takeScriptToProcess(int& scriptStartLine)
+{
+    // Unpause ourselves, callers may pause us again when processing the script.
+    // The HTML5 spec is written as though scripts are executed inside the tree
+    // builder.  We pause the parser to exit the tree builder, and then resume
+    // before running scripts.
+    m_isPaused = false;
+    scriptStartLine = m_scriptToProcessStartLine;
+    m_scriptToProcessStartLine = uninitializedLineNumberValue;
+    return m_scriptToProcess.release();
+}
+
+// Returns the tokenizer state selected by |tagName|:
+//   textarea, title                              -> RCDATAState
+//   style, iframe, xmp, noembed, noframes, and
+//   noscript when isScriptingFlagEnabled(frame)  -> RAWTEXTState
+//   plaintext                                    -> PLAINTEXTState
+// Any other tag name returns |state| unchanged.
+HTMLTokenizer::State HTMLTreeBuilder::adjustedLexerState(HTMLTokenizer::State state, const AtomicString& tagName, Frame* frame)
+{
+    if (tagName == textareaTag || tagName == titleTag)
+        return HTMLTokenizer::RCDATAState;
+
+    if (tagName == styleTag
+        || tagName == iframeTag
+        || tagName == xmpTag
+        || tagName == noembedTag
+        || tagName == noframesTag
+        || (tagName == noscriptTag && isScriptingFlagEnabled(frame)))
+        return HTMLTokenizer::RAWTEXTState;
+
+    if (tagName == plaintextTag)
+        return HTMLTokenizer::PLAINTEXTState;
+
+    return state;
+}
+
+void HTMLTreeBuilder::passTokenToLegacyParser(HTMLToken& token)
+{
+    if (token.type() == HTMLToken::DOCTYPE) {
+        DoctypeToken doctypeToken;
+        doctypeToken.m_name.append(token.name().data(), token.name().size());
+        doctypeToken.m_publicID = token.publicIdentifier();
+        doctypeToken.m_systemID = token.systemIdentifier();
+        doctypeToken.m_forceQuirks = token.forceQuirks();
+
+        m_legacyTreeBuilder->parseDoctypeToken(&doctypeToken);
+        return;
+    }
+
+    if (token.type() == HTMLToken::EndOfFile)
+        return;
+
+    // For now, we translate into an old-style token for testing.
+    Token oldStyleToken;
+    AtomicHTMLToken atomicToken(token);
+    convertToOldStyle(atomicToken, oldStyleToken);
+
+    RefPtr result = m_legacyTreeBuilder->parseToken(&oldStyleToken);
+    if (token.type() == HTMLToken::StartTag) {
+        // This work is supposed to be done by the parser, but
+        // when using the old parser for we have to do this manually.
+        if (oldStyleToken.tagName == scriptTag) {
+            handleScriptStartTag();
+            m_lastScriptElement = static_pointer_cast(result);
+            m_lastScriptElementStartLine = m_tokenizer->lineNumber();
+        } else if (oldStyleToken.tagName == preTag || oldStyleToken.tagName == listingTag)
+            m_tokenizer->skipLeadingNewLineForListing();
+        else
+            m_tokenizer->setState(adjustedLexerState(m_tokenizer->state(), oldStyleToken.tagName, m_document->frame()));
+    } else if (token.type() == HTMLToken::EndTag) {
+        if (oldStyleToken.tagName == scriptTag) {
+            if (m_lastScriptElement) {
+                ASSERT(m_lastScriptElementStartLine != uninitializedLineNumberValue);
+                if (m_fragmentScriptingPermission == FragmentScriptingNotAllowed) {
+                    // FIXME: This is a horrible hack for platform/Pasteboard.
+                    // Clear the