symbian-qemu-0.9.1-12/python-2.6.1/Lib/test/test_sgmllib.py
changeset 1 2fb8b9db1c86
equal deleted inserted replaced
0:ffa851df0825 1:2fb8b9db1c86
       
     1 import pprint
       
     2 import re
       
     3 import unittest
       
     4 from test import test_support
       
     5 sgmllib = test_support.import_module('sgmllib', deprecated=True)
       
     6 
       
     7 
       
     8 class EventCollector(sgmllib.SGMLParser):
       
     9 
       
    10     def __init__(self):
       
    11         self.events = []
       
    12         self.append = self.events.append
       
    13         sgmllib.SGMLParser.__init__(self)
       
    14 
       
    15     def get_events(self):
       
    16         # Normalize the list of events so that buffer artefacts don't
       
    17         # separate runs of contiguous characters.
       
    18         L = []
       
    19         prevtype = None
       
    20         for event in self.events:
       
    21             type = event[0]
       
    22             if type == prevtype == "data":
       
    23                 L[-1] = ("data", L[-1][1] + event[1])
       
    24             else:
       
    25                 L.append(event)
       
    26             prevtype = type
       
    27         self.events = L
       
    28         return L
       
    29 
       
    30     # structure markup
       
    31 
       
    32     def unknown_starttag(self, tag, attrs):
       
    33         self.append(("starttag", tag, attrs))
       
    34 
       
    35     def unknown_endtag(self, tag):
       
    36         self.append(("endtag", tag))
       
    37 
       
    38     # all other markup
       
    39 
       
    40     def handle_comment(self, data):
       
    41         self.append(("comment", data))
       
    42 
       
    43     def handle_charref(self, data):
       
    44         self.append(("charref", data))
       
    45 
       
    46     def handle_data(self, data):
       
    47         self.append(("data", data))
       
    48 
       
    49     def handle_decl(self, decl):
       
    50         self.append(("decl", decl))
       
    51 
       
    52     def handle_entityref(self, data):
       
    53         self.append(("entityref", data))
       
    54 
       
    55     def handle_pi(self, data):
       
    56         self.append(("pi", data))
       
    57 
       
    58     def unknown_decl(self, decl):
       
    59         self.append(("unknown decl", decl))
       
    60 
       
    61 
       
    62 class CDATAEventCollector(EventCollector):
       
    63     def start_cdata(self, attrs):
       
    64         self.append(("starttag", "cdata", attrs))
       
    65         self.setliteral()
       
    66 
       
    67 
       
    68 class HTMLEntityCollector(EventCollector):
       
    69 
       
    70     entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)'
       
    71         '|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)')
       
    72 
       
    73     def convert_charref(self, name):
       
    74         self.append(("charref", "convert", name))
       
    75         if name[0] != "x":
       
    76             return EventCollector.convert_charref(self, name)
       
    77 
       
    78     def convert_codepoint(self, codepoint):
       
    79         self.append(("codepoint", "convert", codepoint))
       
    80         EventCollector.convert_codepoint(self, codepoint)
       
    81 
       
    82     def convert_entityref(self, name):
       
    83         self.append(("entityref", "convert", name))
       
    84         return EventCollector.convert_entityref(self, name)
       
    85 
       
    86     # These to record that they were called, then pass the call along
       
    87     # to the default implementation so that it's actions can be
       
    88     # recorded.
       
    89 
       
    90     def handle_charref(self, data):
       
    91         self.append(("charref", data))
       
    92         sgmllib.SGMLParser.handle_charref(self, data)
       
    93 
       
    94     def handle_entityref(self, data):
       
    95         self.append(("entityref", data))
       
    96         sgmllib.SGMLParser.handle_entityref(self, data)
       
    97 
       
    98 
       
    99 class SGMLParserTestCase(unittest.TestCase):
       
   100 
       
   101     collector = EventCollector
       
   102 
       
   103     def get_events(self, source):
       
   104         parser = self.collector()
       
   105         try:
       
   106             for s in source:
       
   107                 parser.feed(s)
       
   108             parser.close()
       
   109         except:
       
   110             #self.events = parser.events
       
   111             raise
       
   112         return parser.get_events()
       
   113 
       
   114     def check_events(self, source, expected_events):
       
   115         try:
       
   116             events = self.get_events(source)
       
   117         except:
       
   118             #import sys
       
   119             #print >>sys.stderr, pprint.pformat(self.events)
       
   120             raise
       
   121         if events != expected_events:
       
   122             self.fail("received events did not match expected events\n"
       
   123                       "Expected:\n" + pprint.pformat(expected_events) +
       
   124                       "\nReceived:\n" + pprint.pformat(events))
       
   125 
       
   126     def check_parse_error(self, source):
       
   127         parser = EventCollector()
       
   128         try:
       
   129             parser.feed(source)
       
   130             parser.close()
       
   131         except sgmllib.SGMLParseError:
       
   132             pass
       
   133         else:
       
   134             self.fail("expected SGMLParseError for %r\nReceived:\n%s"
       
   135                       % (source, pprint.pformat(parser.get_events())))
       
   136 
       
   137     def test_doctype_decl_internal(self):
       
   138         inside = """\
       
   139 DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
       
   140              SYSTEM 'http://www.w3.org/TR/html401/strict.dtd' [
       
   141   <!ELEMENT html - O EMPTY>
       
   142   <!ATTLIST html
       
   143       version CDATA #IMPLIED
       
   144       profile CDATA 'DublinCore'>
       
   145   <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
       
   146   <!ENTITY myEntity 'internal parsed entity'>
       
   147   <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
       
   148   <!ENTITY % paramEntity 'name|name|name'>
       
   149   %paramEntity;
       
   150   <!-- comment -->
       
   151 ]"""
       
   152         self.check_events(["<!%s>" % inside], [
       
   153             ("decl", inside),
       
   154             ])
       
   155 
       
   156     def test_doctype_decl_external(self):
       
   157         inside = "DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'"
       
   158         self.check_events("<!%s>" % inside, [
       
   159             ("decl", inside),
       
   160             ])
       
   161 
       
   162     def test_underscore_in_attrname(self):
       
   163         # SF bug #436621
       
   164         """Make sure attribute names with underscores are accepted"""
       
   165         self.check_events("<a has_under _under>", [
       
   166             ("starttag", "a", [("has_under", "has_under"),
       
   167                                ("_under", "_under")]),
       
   168             ])
       
   169 
       
   170     def test_underscore_in_tagname(self):
       
   171         # SF bug #436621
       
   172         """Make sure tag names with underscores are accepted"""
       
   173         self.check_events("<has_under></has_under>", [
       
   174             ("starttag", "has_under", []),
       
   175             ("endtag", "has_under"),
       
   176             ])
       
   177 
       
   178     def test_quotes_in_unquoted_attrs(self):
       
   179         # SF bug #436621
       
   180         """Be sure quotes in unquoted attributes are made part of the value"""
       
   181         self.check_events("<a href=foo'bar\"baz>", [
       
   182             ("starttag", "a", [("href", "foo'bar\"baz")]),
       
   183             ])
       
   184 
       
   185     def test_xhtml_empty_tag(self):
       
   186         """Handling of XHTML-style empty start tags"""
       
   187         self.check_events("<br />text<i></i>", [
       
   188             ("starttag", "br", []),
       
   189             ("data", "text"),
       
   190             ("starttag", "i", []),
       
   191             ("endtag", "i"),
       
   192             ])
       
   193 
       
   194     def test_processing_instruction_only(self):
       
   195         self.check_events("<?processing instruction>", [
       
   196             ("pi", "processing instruction"),
       
   197             ])
       
   198 
       
   199     def test_bad_nesting(self):
       
   200         self.check_events("<a><b></a></b>", [
       
   201             ("starttag", "a", []),
       
   202             ("starttag", "b", []),
       
   203             ("endtag", "a"),
       
   204             ("endtag", "b"),
       
   205             ])
       
   206 
       
   207     def test_bare_ampersands(self):
       
   208         self.check_events("this text & contains & ampersands &", [
       
   209             ("data", "this text & contains & ampersands &"),
       
   210             ])
       
   211 
       
   212     def test_bare_pointy_brackets(self):
       
   213         self.check_events("this < text > contains < bare>pointy< brackets", [
       
   214             ("data", "this < text > contains < bare>pointy< brackets"),
       
   215             ])
       
   216 
       
   217     def test_attr_syntax(self):
       
   218         output = [
       
   219           ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", "e")])
       
   220           ]
       
   221         self.check_events("""<a b='v' c="v" d=v e>""", output)
       
   222         self.check_events("""<a  b = 'v' c = "v" d = v e>""", output)
       
   223         self.check_events("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
       
   224         self.check_events("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)
       
   225 
       
   226     def test_attr_values(self):
       
   227         self.check_events("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
       
   228                         [("starttag", "a", [("b", "xxx\n\txxx"),
       
   229                                             ("c", "yyy\t\nyyy"),
       
   230                                             ("d", "\txyz\n")])
       
   231                          ])
       
   232         self.check_events("""<a b='' c="">""", [
       
   233             ("starttag", "a", [("b", ""), ("c", "")]),
       
   234             ])
       
   235         # URL construction stuff from RFC 1808:
       
   236         safe = "$-_.+"
       
   237         extra = "!*'(),"
       
   238         reserved = ";/?:@&="
       
   239         url = "http://example.com:8080/path/to/file?%s%s%s" % (
       
   240             safe, extra, reserved)
       
   241         self.check_events("""<e a=%s>""" % url, [
       
   242             ("starttag", "e", [("a", url)]),
       
   243             ])
       
   244         # Regression test for SF patch #669683.
       
   245         self.check_events("<e a=rgb(1,2,3)>", [
       
   246             ("starttag", "e", [("a", "rgb(1,2,3)")]),
       
   247             ])
       
   248 
       
   249     def test_attr_values_entities(self):
       
   250         """Substitution of entities and charrefs in attribute values"""
       
   251         # SF bug #1452246
       
   252         self.check_events("""<a b=&lt; c=&lt;&gt; d=&lt-&gt; e='&lt; '
       
   253                                 f="&xxx;" g='&#32;&#33;' h='&#500;'
       
   254                                 i='x?a=b&c=d;'
       
   255                                 j='&amp;#42;' k='&#38;#42;'>""",
       
   256             [("starttag", "a", [("b", "<"),
       
   257                                 ("c", "<>"),
       
   258                                 ("d", "&lt->"),
       
   259                                 ("e", "< "),
       
   260                                 ("f", "&xxx;"),
       
   261                                 ("g", " !"),
       
   262                                 ("h", "&#500;"),
       
   263                                 ("i", "x?a=b&c=d;"),
       
   264                                 ("j", "&#42;"),
       
   265                                 ("k", "&#42;"),
       
   266                                 ])])
       
   267 
       
   268     def test_convert_overrides(self):
       
   269         # This checks that the character and entity reference
       
   270         # conversion helpers are called at the documented times.  No
       
   271         # attempt is made to really change what the parser accepts.
       
   272         #
       
   273         self.collector = HTMLEntityCollector
       
   274         self.check_events(('<a title="&ldquo;test&#x201d;">foo</a>'
       
   275                            '&foobar;&#42;'), [
       
   276             ('entityref', 'convert', 'ldquo'),
       
   277             ('charref', 'convert', 'x201d'),
       
   278             ('starttag', 'a', [('title', '&ldquo;test&#x201d;')]),
       
   279             ('data', 'foo'),
       
   280             ('endtag', 'a'),
       
   281             ('entityref', 'foobar'),
       
   282             ('entityref', 'convert', 'foobar'),
       
   283             ('charref', '42'),
       
   284             ('charref', 'convert', '42'),
       
   285             ('codepoint', 'convert', 42),
       
   286             ])
       
   287 
       
   288     def test_attr_funky_names(self):
       
   289         self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
       
   290             ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
       
   291             ])
       
   292 
       
   293     def test_attr_value_ip6_url(self):
       
   294         # http://www.python.org/sf/853506
       
   295         self.check_events(("<a href='http://[1080::8:800:200C:417A]/'>"
       
   296                            "<a href=http://[1080::8:800:200C:417A]/>"), [
       
   297             ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
       
   298             ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
       
   299             ])
       
   300 
       
   301     def test_illegal_declarations(self):
       
   302         s = 'abc<!spacer type="block" height="25">def'
       
   303         self.check_events(s, [
       
   304             ("data", "abc"),
       
   305             ("unknown decl", 'spacer type="block" height="25"'),
       
   306             ("data", "def"),
       
   307             ])
       
   308 
       
   309     def test_weird_starttags(self):
       
   310         self.check_events("<a<a>", [
       
   311             ("starttag", "a", []),
       
   312             ("starttag", "a", []),
       
   313             ])
       
   314         self.check_events("</a<a>", [
       
   315             ("endtag", "a"),
       
   316             ("starttag", "a", []),
       
   317             ])
       
   318 
       
   319     def test_declaration_junk_chars(self):
       
   320         self.check_parse_error("<!DOCTYPE foo $ >")
       
   321 
       
   322     def test_get_starttag_text(self):
       
   323         s = """<foobar   \n   one="1"\ttwo=2   >"""
       
   324         self.check_events(s, [
       
   325             ("starttag", "foobar", [("one", "1"), ("two", "2")]),
       
   326             ])
       
   327 
       
   328     def test_cdata_content(self):
       
   329         s = ("<cdata> <!-- not a comment --> &not-an-entity-ref; </cdata>"
       
   330              "<notcdata> <!-- comment --> </notcdata>")
       
   331         self.collector = CDATAEventCollector
       
   332         self.check_events(s, [
       
   333             ("starttag", "cdata", []),
       
   334             ("data", " <!-- not a comment --> &not-an-entity-ref; "),
       
   335             ("endtag", "cdata"),
       
   336             ("starttag", "notcdata", []),
       
   337             ("data", " "),
       
   338             ("comment", " comment "),
       
   339             ("data", " "),
       
   340             ("endtag", "notcdata"),
       
   341             ])
       
   342         s = """<cdata> <not a='start tag'> </cdata>"""
       
   343         self.check_events(s, [
       
   344             ("starttag", "cdata", []),
       
   345             ("data", " <not a='start tag'> "),
       
   346             ("endtag", "cdata"),
       
   347             ])
       
   348 
       
   349     def test_illegal_declarations(self):
       
   350         s = 'abc<!spacer type="block" height="25">def'
       
   351         self.check_events(s, [
       
   352             ("data", "abc"),
       
   353             ("unknown decl", 'spacer type="block" height="25"'),
       
   354             ("data", "def"),
       
   355             ])
       
   356 
       
   357     def test_enumerated_attr_type(self):
       
   358         s = "<!DOCTYPE doc [<!ATTLIST doc attr (a | b) >]>"
       
   359         self.check_events(s, [
       
   360             ('decl', 'DOCTYPE doc [<!ATTLIST doc attr (a | b) >]'),
       
   361             ])
       
   362 
       
   363     def test_read_chunks(self):
       
   364         # SF bug #1541697, this caused sgml parser to hang
       
   365         # Just verify this code doesn't cause a hang.
       
   366         CHUNK = 1024  # increasing this to 8212 makes the problem go away
       
   367 
       
   368         f = open(test_support.findfile('sgml_input.html'))
       
   369         fp = sgmllib.SGMLParser()
       
   370         while 1:
       
   371             data = f.read(CHUNK)
       
   372             fp.feed(data)
       
   373             if len(data) != CHUNK:
       
   374                 break
       
   375 
       
   376     # XXX These tests have been disabled by prefixing their names with
       
   377     # an underscore.  The first two exercise outstanding bugs in the
       
   378     # sgmllib module, and the third exhibits questionable behavior
       
   379     # that needs to be carefully considered before changing it.
       
   380 
       
   381     def _test_starttag_end_boundary(self):
       
   382         self.check_events("<a b='<'>", [("starttag", "a", [("b", "<")])])
       
   383         self.check_events("<a b='>'>", [("starttag", "a", [("b", ">")])])
       
   384 
       
   385     def _test_buffer_artefacts(self):
       
   386         output = [("starttag", "a", [("b", "<")])]
       
   387         self.check_events(["<a b='<'>"], output)
       
   388         self.check_events(["<a ", "b='<'>"], output)
       
   389         self.check_events(["<a b", "='<'>"], output)
       
   390         self.check_events(["<a b=", "'<'>"], output)
       
   391         self.check_events(["<a b='<", "'>"], output)
       
   392         self.check_events(["<a b='<'", ">"], output)
       
   393 
       
   394         output = [("starttag", "a", [("b", ">")])]
       
   395         self.check_events(["<a b='>'>"], output)
       
   396         self.check_events(["<a ", "b='>'>"], output)
       
   397         self.check_events(["<a b", "='>'>"], output)
       
   398         self.check_events(["<a b=", "'>'>"], output)
       
   399         self.check_events(["<a b='>", "'>"], output)
       
   400         self.check_events(["<a b='>'", ">"], output)
       
   401 
       
   402         output = [("comment", "abc")]
       
   403         self.check_events(["", "<!--abc-->"], output)
       
   404         self.check_events(["<", "!--abc-->"], output)
       
   405         self.check_events(["<!", "--abc-->"], output)
       
   406         self.check_events(["<!-", "-abc-->"], output)
       
   407         self.check_events(["<!--", "abc-->"], output)
       
   408         self.check_events(["<!--a", "bc-->"], output)
       
   409         self.check_events(["<!--ab", "c-->"], output)
       
   410         self.check_events(["<!--abc", "-->"], output)
       
   411         self.check_events(["<!--abc-", "->"], output)
       
   412         self.check_events(["<!--abc--", ">"], output)
       
   413         self.check_events(["<!--abc-->", ""], output)
       
   414 
       
   415     def _test_starttag_junk_chars(self):
       
   416         self.check_parse_error("<")
       
   417         self.check_parse_error("<>")
       
   418         self.check_parse_error("</$>")
       
   419         self.check_parse_error("</")
       
   420         self.check_parse_error("</a")
       
   421         self.check_parse_error("<$")
       
   422         self.check_parse_error("<$>")
       
   423         self.check_parse_error("<!")
       
   424         self.check_parse_error("<a $>")
       
   425         self.check_parse_error("<a")
       
   426         self.check_parse_error("<a foo='bar'")
       
   427         self.check_parse_error("<a foo='bar")
       
   428         self.check_parse_error("<a foo='>'")
       
   429         self.check_parse_error("<a foo='>")
       
   430         self.check_parse_error("<a foo=>")
       
   431 
       
   432 
       
   433 def test_main():
       
   434     test_support.run_unittest(SGMLParserTestCase)
       
   435 
       
   436 
       
   437 if __name__ == "__main__":
       
   438     test_main()