python-2.5.2/win32/Lib/test/test_htmlparser.py
changeset 0 ae805ac0140d
equal deleted inserted replaced
-1:000000000000 0:ae805ac0140d
       
     1 """Tests for HTMLParser.py."""
       
     2 
       
     3 import HTMLParser
       
     4 import pprint
       
     5 import sys
       
     6 import unittest
       
     7 from test import test_support
       
     8 
       
     9 
       
    10 class EventCollector(HTMLParser.HTMLParser):
       
    11 
       
    12     def __init__(self):
       
    13         self.events = []
       
    14         self.append = self.events.append
       
    15         HTMLParser.HTMLParser.__init__(self)
       
    16 
       
    17     def get_events(self):
       
    18         # Normalize the list of events so that buffer artefacts don't
       
    19         # separate runs of contiguous characters.
       
    20         L = []
       
    21         prevtype = None
       
    22         for event in self.events:
       
    23             type = event[0]
       
    24             if type == prevtype == "data":
       
    25                 L[-1] = ("data", L[-1][1] + event[1])
       
    26             else:
       
    27                 L.append(event)
       
    28             prevtype = type
       
    29         self.events = L
       
    30         return L
       
    31 
       
    32     # structure markup
       
    33 
       
    34     def handle_starttag(self, tag, attrs):
       
    35         self.append(("starttag", tag, attrs))
       
    36 
       
    37     def handle_startendtag(self, tag, attrs):
       
    38         self.append(("startendtag", tag, attrs))
       
    39 
       
    40     def handle_endtag(self, tag):
       
    41         self.append(("endtag", tag))
       
    42 
       
    43     # all other markup
       
    44 
       
    45     def handle_comment(self, data):
       
    46         self.append(("comment", data))
       
    47 
       
    48     def handle_charref(self, data):
       
    49         self.append(("charref", data))
       
    50 
       
    51     def handle_data(self, data):
       
    52         self.append(("data", data))
       
    53 
       
    54     def handle_decl(self, data):
       
    55         self.append(("decl", data))
       
    56 
       
    57     def handle_entityref(self, data):
       
    58         self.append(("entityref", data))
       
    59 
       
    60     def handle_pi(self, data):
       
    61         self.append(("pi", data))
       
    62 
       
    63     def unknown_decl(self, decl):
       
    64         self.append(("unknown decl", decl))
       
    65 
       
    66 
       
    67 class EventCollectorExtra(EventCollector):
       
    68 
       
    69     def handle_starttag(self, tag, attrs):
       
    70         EventCollector.handle_starttag(self, tag, attrs)
       
    71         self.append(("starttag_text", self.get_starttag_text()))
       
    72 
       
    73 
       
    74 class TestCaseBase(unittest.TestCase):
       
    75 
       
    76     def _run_check(self, source, expected_events, collector=EventCollector):
       
    77         parser = collector()
       
    78         for s in source:
       
    79             parser.feed(s)
       
    80         parser.close()
       
    81         events = parser.get_events()
       
    82         if events != expected_events:
       
    83             self.fail("received events did not match expected events\n"
       
    84                       "Expected:\n" + pprint.pformat(expected_events) +
       
    85                       "\nReceived:\n" + pprint.pformat(events))
       
    86 
       
    87     def _run_check_extra(self, source, events):
       
    88         self._run_check(source, events, EventCollectorExtra)
       
    89 
       
    90     def _parse_error(self, source):
       
    91         def parse(source=source):
       
    92             parser = HTMLParser.HTMLParser()
       
    93             parser.feed(source)
       
    94             parser.close()
       
    95         self.assertRaises(HTMLParser.HTMLParseError, parse)
       
    96 
       
    97 
       
    98 class HTMLParserTestCase(TestCaseBase):
       
    99 
       
   100     def test_processing_instruction_only(self):
       
   101         self._run_check("<?processing instruction>", [
       
   102             ("pi", "processing instruction"),
       
   103             ])
       
   104         self._run_check("<?processing instruction ?>", [
       
   105             ("pi", "processing instruction ?"),
       
   106             ])
       
   107 
       
   108     def test_simple_html(self):
       
   109         self._run_check("""
       
   110 <!DOCTYPE html PUBLIC 'foo'>
       
   111 <HTML>&entity;&#32;
       
   112 <!--comment1a
       
   113 -></foo><bar>&lt;<?pi?></foo<bar
       
   114 comment1b-->
       
   115 <Img sRc='Bar' isMAP>sample
       
   116 text
       
   117 &#x201C;
       
   118 <!--comment2a-- --comment2b--><!>
       
   119 </Html>
       
   120 """, [
       
   121     ("data", "\n"),
       
   122     ("decl", "DOCTYPE html PUBLIC 'foo'"),
       
   123     ("data", "\n"),
       
   124     ("starttag", "html", []),
       
   125     ("entityref", "entity"),
       
   126     ("charref", "32"),
       
   127     ("data", "\n"),
       
   128     ("comment", "comment1a\n-></foo><bar>&lt;<?pi?></foo<bar\ncomment1b"),
       
   129     ("data", "\n"),
       
   130     ("starttag", "img", [("src", "Bar"), ("ismap", None)]),
       
   131     ("data", "sample\ntext\n"),
       
   132     ("charref", "x201C"),
       
   133     ("data", "\n"),
       
   134     ("comment", "comment2a-- --comment2b"),
       
   135     ("data", "\n"),
       
   136     ("endtag", "html"),
       
   137     ("data", "\n"),
       
   138     ])
       
   139 
       
   140     def test_unclosed_entityref(self):
       
   141         self._run_check("&entityref foo", [
       
   142             ("entityref", "entityref"),
       
   143             ("data", " foo"),
       
   144             ])
       
   145 
       
   146     def test_doctype_decl(self):
       
   147         inside = """\
       
   148 DOCTYPE html [
       
   149   <!ELEMENT html - O EMPTY>
       
   150   <!ATTLIST html
       
   151       version CDATA #IMPLIED
       
   152       profile CDATA 'DublinCore'>
       
   153   <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
       
   154   <!ENTITY myEntity 'internal parsed entity'>
       
   155   <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
       
   156   <!ENTITY % paramEntity 'name|name|name'>
       
   157   %paramEntity;
       
   158   <!-- comment -->
       
   159 ]"""
       
   160         self._run_check("<!%s>" % inside, [
       
   161             ("decl", inside),
       
   162             ])
       
   163 
       
   164     def test_bad_nesting(self):
       
   165         # Strangely, this *is* supposed to test that overlapping
       
   166         # elements are allowed.  HTMLParser is more geared toward
       
   167         # lexing the input that parsing the structure.
       
   168         self._run_check("<a><b></a></b>", [
       
   169             ("starttag", "a", []),
       
   170             ("starttag", "b", []),
       
   171             ("endtag", "a"),
       
   172             ("endtag", "b"),
       
   173             ])
       
   174 
       
   175     def test_bare_ampersands(self):
       
   176         self._run_check("this text & contains & ampersands &", [
       
   177             ("data", "this text & contains & ampersands &"),
       
   178             ])
       
   179 
       
   180     def test_bare_pointy_brackets(self):
       
   181         self._run_check("this < text > contains < bare>pointy< brackets", [
       
   182             ("data", "this < text > contains < bare>pointy< brackets"),
       
   183             ])
       
   184 
       
   185     def test_attr_syntax(self):
       
   186         output = [
       
   187           ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
       
   188           ]
       
   189         self._run_check("""<a b='v' c="v" d=v e>""", output)
       
   190         self._run_check("""<a  b = 'v' c = "v" d = v e>""", output)
       
   191         self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
       
   192         self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)
       
   193 
       
   194     def test_attr_values(self):
       
   195         self._run_check("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
       
   196                         [("starttag", "a", [("b", "xxx\n\txxx"),
       
   197                                             ("c", "yyy\t\nyyy"),
       
   198                                             ("d", "\txyz\n")])
       
   199                          ])
       
   200         self._run_check("""<a b='' c="">""", [
       
   201             ("starttag", "a", [("b", ""), ("c", "")]),
       
   202             ])
       
   203         # Regression test for SF patch #669683.
       
   204         self._run_check("<e a=rgb(1,2,3)>", [
       
   205             ("starttag", "e", [("a", "rgb(1,2,3)")]),
       
   206             ])
       
   207         # Regression test for SF bug #921657.
       
   208         self._run_check("<a href=mailto:xyz@example.com>", [
       
   209             ("starttag", "a", [("href", "mailto:xyz@example.com")]),
       
   210             ])
       
   211 
       
   212     def test_attr_entity_replacement(self):
       
   213         self._run_check("""<a b='&amp;&gt;&lt;&quot;&apos;'>""", [
       
   214             ("starttag", "a", [("b", "&><\"'")]),
       
   215             ])
       
   216 
       
   217     def test_attr_funky_names(self):
       
   218         self._run_check("""<a a.b='v' c:d=v e-f=v>""", [
       
   219             ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
       
   220             ])
       
   221 
       
   222     def test_illegal_declarations(self):
       
   223         self._parse_error('<!spacer type="block" height="25">')
       
   224 
       
   225     def test_starttag_end_boundary(self):
       
   226         self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
       
   227         self._run_check("""<a b='>'>""", [("starttag", "a", [("b", ">")])])
       
   228 
       
   229     def test_buffer_artefacts(self):
       
   230         output = [("starttag", "a", [("b", "<")])]
       
   231         self._run_check(["<a b='<'>"], output)
       
   232         self._run_check(["<a ", "b='<'>"], output)
       
   233         self._run_check(["<a b", "='<'>"], output)
       
   234         self._run_check(["<a b=", "'<'>"], output)
       
   235         self._run_check(["<a b='<", "'>"], output)
       
   236         self._run_check(["<a b='<'", ">"], output)
       
   237 
       
   238         output = [("starttag", "a", [("b", ">")])]
       
   239         self._run_check(["<a b='>'>"], output)
       
   240         self._run_check(["<a ", "b='>'>"], output)
       
   241         self._run_check(["<a b", "='>'>"], output)
       
   242         self._run_check(["<a b=", "'>'>"], output)
       
   243         self._run_check(["<a b='>", "'>"], output)
       
   244         self._run_check(["<a b='>'", ">"], output)
       
   245 
       
   246         output = [("comment", "abc")]
       
   247         self._run_check(["", "<!--abc-->"], output)
       
   248         self._run_check(["<", "!--abc-->"], output)
       
   249         self._run_check(["<!", "--abc-->"], output)
       
   250         self._run_check(["<!-", "-abc-->"], output)
       
   251         self._run_check(["<!--", "abc-->"], output)
       
   252         self._run_check(["<!--a", "bc-->"], output)
       
   253         self._run_check(["<!--ab", "c-->"], output)
       
   254         self._run_check(["<!--abc", "-->"], output)
       
   255         self._run_check(["<!--abc-", "->"], output)
       
   256         self._run_check(["<!--abc--", ">"], output)
       
   257         self._run_check(["<!--abc-->", ""], output)
       
   258 
       
   259     def test_starttag_junk_chars(self):
       
   260         self._parse_error("</>")
       
   261         self._parse_error("</$>")
       
   262         self._parse_error("</")
       
   263         self._parse_error("</a")
       
   264         self._parse_error("<a<a>")
       
   265         self._parse_error("</a<a>")
       
   266         self._parse_error("<!")
       
   267         self._parse_error("<a $>")
       
   268         self._parse_error("<a")
       
   269         self._parse_error("<a foo='bar'")
       
   270         self._parse_error("<a foo='bar")
       
   271         self._parse_error("<a foo='>'")
       
   272         self._parse_error("<a foo='>")
       
   273         self._parse_error("<a foo=>")
       
   274 
       
   275     def test_declaration_junk_chars(self):
       
   276         self._parse_error("<!DOCTYPE foo $ >")
       
   277 
       
   278     def test_startendtag(self):
       
   279         self._run_check("<p/>", [
       
   280             ("startendtag", "p", []),
       
   281             ])
       
   282         self._run_check("<p></p>", [
       
   283             ("starttag", "p", []),
       
   284             ("endtag", "p"),
       
   285             ])
       
   286         self._run_check("<p><img src='foo' /></p>", [
       
   287             ("starttag", "p", []),
       
   288             ("startendtag", "img", [("src", "foo")]),
       
   289             ("endtag", "p"),
       
   290             ])
       
   291 
       
   292     def test_get_starttag_text(self):
       
   293         s = """<foo:bar   \n   one="1"\ttwo=2   >"""
       
   294         self._run_check_extra(s, [
       
   295             ("starttag", "foo:bar", [("one", "1"), ("two", "2")]),
       
   296             ("starttag_text", s)])
       
   297 
       
   298     def test_cdata_content(self):
       
   299         s = """<script> <!-- not a comment --> &not-an-entity-ref; </script>"""
       
   300         self._run_check(s, [
       
   301             ("starttag", "script", []),
       
   302             ("data", " <!-- not a comment --> &not-an-entity-ref; "),
       
   303             ("endtag", "script"),
       
   304             ])
       
   305         s = """<script> <not a='start tag'> </script>"""
       
   306         self._run_check(s, [
       
   307             ("starttag", "script", []),
       
   308             ("data", " <not a='start tag'> "),
       
   309             ("endtag", "script"),
       
   310             ])
       
   311 
       
   312 
       
   313 def test_main():
       
   314     test_support.run_unittest(HTMLParserTestCase)
       
   315 
       
   316 
       
   317 if __name__ == "__main__":
       
   318     test_main()