python-2.5.2/win32/Lib/htmllib.py
changeset 0 ae805ac0140d
equal deleted inserted replaced
-1:000000000000 0:ae805ac0140d
       
     1 """HTML 2.0 parser.
       
     2 
       
     3 See the HTML 2.0 specification:
       
     4 http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
       
     5 """
       
     6 
       
     7 import sgmllib
       
     8 
       
     9 from formatter import AS_IS
       
    10 
       
    # Public names exported by ``from htmllib import *``.
    __all__ = ["HTMLParser", "HTMLParseError"]
       
    12 
       
    13 
       
    class HTMLParseError(sgmllib.SGMLParseError):
        """Error raised when an HTML document can't be parsed.

        HTMLParser.error() raises this exception with the message it is given.
        """
       
    16 
       
    17 
       
    class HTMLParser(sgmllib.SGMLParser):
        """This is the basic HTML parser class.

        It supports all entity names required by the XHTML 1.0 Recommendation.
        It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
        elements.

        Handlers follow the sgmllib naming scheme: start_TAG() / end_TAG() for
        container elements and do_TAG() for elements that need no end tag.
        Rendering is delegated to the formatter object given to the
        constructor.

        """

        from htmlentitydefs import entitydefs

        def __init__(self, formatter, verbose=0):
            """Creates an instance of the HTMLParser class.

            The formatter parameter is the formatter instance associated with
            the parser.  verbose is passed through to sgmllib.SGMLParser.

            """
            sgmllib.SGMLParser.__init__(self, verbose)
            self.formatter = formatter

        def error(self, message):
            """Raise HTMLParseError carrying the given message."""
            raise HTMLParseError(message)

        def reset(self):
            """Reset the SGML parser and all per-document state."""
            sgmllib.SGMLParser.reset(self)
            self.savedata = None    # buffer active between save_bgn()/save_end()
            self.isindex = 0        # set to 1 once <ISINDEX> is seen
            self.title = None       # text collected from <TITLE>
            self.base = None        # HREF of the <BASE> tag
            self.anchor = None      # HREF of the anchor currently open
            self.anchorlist = []    # every non-empty HREF seen so far
            self.nofill = 0         # nesting depth of literal (<PRE>-like) text
            self.list_stack = []    # open lists, each [kind, label, counter]

        # ------ Methods used internally; some may be overridden

        # --- Formatter interface, taking care of 'savedata' mode;
        # shouldn't need to be overridden

        def handle_data(self, data):
            """Route character data to the save buffer or to the formatter.

            While a save_bgn() / save_end() pair is open the data is
            accumulated; otherwise it is sent to the formatter as literal
            data when nofill is set and as flowing data when it is not.

            """
            if self.savedata is not None:
                self.savedata = self.savedata + data
            elif self.nofill:
                self.formatter.add_literal_data(data)
            else:
                self.formatter.add_flowing_data(data)

        # --- Hooks to save data; shouldn't need to be overridden

        def save_bgn(self):
            """Begins saving character data in a buffer instead of sending it
            to the formatter object.

            Retrieve the stored data via the save_end() method.  Use of the
            save_bgn() / save_end() pair may not be nested.

            """
            self.savedata = ''

        def save_end(self):
            """Ends buffering character data and returns all data saved since
            the preceding call to the save_bgn() method.

            If the nofill flag is false, whitespace is collapsed to single
            spaces.  A call to this method without a preceding call to the
            save_bgn() method will raise a TypeError exception.

            """
            data = self.savedata
            if data is None:
                # Enforce the documented contract explicitly.  Without this
                # check the call failed with AttributeError (None has no
                # split()) or, in nofill mode, silently returned None.
                raise TypeError(
                    "save_end() called without a preceding save_bgn()")
            self.savedata = None
            if not self.nofill:
                data = ' '.join(data.split())
            return data

        # --- Hooks for anchors; should probably be overridden

        def anchor_bgn(self, href, name, type):
            """This method is called at the start of an anchor region.

            The arguments correspond to the attributes of the <A> tag with
            the same names.  The default implementation maintains a list of
            hyperlinks (defined by the HREF attribute for <A> tags) within
            the document.  The list of hyperlinks is available as the data
            attribute anchorlist.

            """
            self.anchor = href
            if self.anchor:
                self.anchorlist.append(href)

        def anchor_end(self):
            """This method is called at the end of an anchor region.

            The default implementation adds a textual footnote marker using an
            index into the list of hyperlinks created by the anchor_bgn()
            method.  Nothing is emitted for anchors without an HREF.

            """
            if self.anchor:
                self.handle_data("[%d]" % len(self.anchorlist))
                self.anchor = None

        # --- Hook for images; should probably be overridden

        def handle_image(self, src, alt, *args):
            """This method is called to handle images.

            The default implementation simply passes the alt value to the
            handle_data() method.  do_img() supplies ismap, align, width and
            height as the extra positional arguments.

            """
            self.handle_data(alt)

        # --------- Top level elements

        # Recognized so they do not reach unknown_starttag(); no action taken.

        def start_html(self, attrs): pass
        def end_html(self): pass

        def start_head(self, attrs): pass
        def end_head(self): pass

        def start_body(self, attrs): pass
        def end_body(self): pass

        # ------ Head elements

        def start_title(self, attrs):
            """Start buffering the document title."""
            self.save_bgn()

        def end_title(self):
            """Store the buffered title text in the title attribute."""
            self.title = self.save_end()

        def do_base(self, attrs):
            """Record the HREF attribute of <BASE> in the base attribute."""
            for a, v in attrs:
                if a == 'href':
                    self.base = v

        def do_isindex(self, attrs):
            """Flag that the document contains an <ISINDEX> element."""
            self.isindex = 1

        def do_link(self, attrs):
            """Ignore <LINK>."""
            pass

        def do_meta(self, attrs):
            """Ignore <META>."""
            pass

        def do_nextid(self, attrs): # Deprecated
            """Ignore <NEXTID>."""
            pass

        # ------ Body elements

        # --- Headings

        # Font specifications handed to formatter.push_font() are
        # (size, italic, bold, teletype) tuples; AS_IS keeps the current value.

        def _heading_bgn(self, tag):
            """Close the current paragraph and push the bold heading font."""
            self.formatter.end_paragraph(1)
            self.formatter.push_font((tag, 0, 1, 0))

        def _heading_end(self):
            """Close the heading paragraph and restore the previous font."""
            self.formatter.end_paragraph(1)
            self.formatter.pop_font()

        def start_h1(self, attrs): self._heading_bgn('h1')
        def end_h1(self): self._heading_end()

        def start_h2(self, attrs): self._heading_bgn('h2')
        def end_h2(self): self._heading_end()

        def start_h3(self, attrs): self._heading_bgn('h3')
        def end_h3(self): self._heading_end()

        def start_h4(self, attrs): self._heading_bgn('h4')
        def end_h4(self): self._heading_end()

        def start_h5(self, attrs): self._heading_bgn('h5')
        def end_h5(self): self._heading_end()

        def start_h6(self, attrs): self._heading_bgn('h6')
        def end_h6(self): self._heading_end()

        # --- Block Structuring Elements

        def do_p(self, attrs):
            """End the current paragraph, leaving one blank line."""
            self.formatter.end_paragraph(1)

        def start_pre(self, attrs):
            """Enter preformatted text: teletype font and no filling."""
            self.formatter.end_paragraph(1)
            self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
            self.nofill = self.nofill + 1

        def end_pre(self):
            """Leave preformatted text; nofill never drops below zero."""
            self.formatter.end_paragraph(1)
            self.formatter.pop_font()
            self.nofill = max(0, self.nofill - 1)

        def start_xmp(self, attrs):
            """Treat <XMP> like <PRE> and switch the parser to literal mode."""
            self.start_pre(attrs)
            self.setliteral('xmp') # Tell SGML parser

        def end_xmp(self):
            self.end_pre()

        def start_listing(self, attrs):
            """Treat <LISTING> like <PRE> and switch the parser to literal mode."""
            self.start_pre(attrs)
            self.setliteral('listing') # Tell SGML parser

        def end_listing(self):
            self.end_pre()

        def start_address(self, attrs):
            """Render <ADDRESS> in italics, without surrounding blank lines."""
            self.formatter.end_paragraph(0)
            self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

        def end_address(self):
            self.formatter.end_paragraph(0)
            self.formatter.pop_font()

        def start_blockquote(self, attrs):
            """Open a 'blockquote' margin level."""
            self.formatter.end_paragraph(1)
            self.formatter.push_margin('blockquote')

        def end_blockquote(self):
            self.formatter.end_paragraph(1)
            self.formatter.pop_margin()

        # --- List Elements

        # A blank line is requested around a list only when it is not nested
        # inside another list (list_stack empty).

        def start_ul(self, attrs):
            """Open an unordered list whose items are labelled '*'."""
            self.formatter.end_paragraph(not self.list_stack)
            self.formatter.push_margin('ul')
            self.list_stack.append(['ul', '*', 0])

        def end_ul(self):
            """Close the innermost list entry and its margin."""
            if self.list_stack: del self.list_stack[-1]
            self.formatter.end_paragraph(not self.list_stack)
            self.formatter.pop_margin()

        def do_li(self, attrs):
            """Emit the label for the next item of the innermost list.

            The item counter of the enclosing list entry is incremented
            first.  Outside of any list the label '*' with counter 0 is used.

            """
            self.formatter.end_paragraph(0)
            if self.list_stack:
                top = self.list_stack[-1]
                top[2] = top[2] + 1
                label, counter = top[1], top[2]
            else:
                label, counter = '*', 0
            self.formatter.add_label_data(label, counter)

        def start_ol(self, attrs):
            """Open an ordered list.

            The label format defaults to '1.'; a TYPE attribute overrides it,
            and a single-character TYPE gets a trailing '.' appended.

            """
            self.formatter.end_paragraph(not self.list_stack)
            self.formatter.push_margin('ol')
            label = '1.'
            for a, v in attrs:
                if a == 'type':
                    if len(v) == 1: v = v + '.'
                    label = v
            self.list_stack.append(['ol', label, 0])

        def end_ol(self):
            """Close the innermost list entry and its margin."""
            if self.list_stack: del self.list_stack[-1]
            self.formatter.end_paragraph(not self.list_stack)
            self.formatter.pop_margin()

        def start_menu(self, attrs):
            self.start_ul(attrs)

        def end_menu(self):
            self.end_ul()

        def start_dir(self, attrs):
            self.start_ul(attrs)

        def end_dir(self):
            self.end_ul()

        def start_dl(self, attrs):
            """Open a definition list (no margin of its own)."""
            self.formatter.end_paragraph(1)
            self.list_stack.append(['dl', '', 0])

        def end_dl(self):
            """Close any open <DD> and then the definition list entry."""
            self.ddpop(1)
            if self.list_stack: del self.list_stack[-1]

        def do_dt(self, attrs):
            """Start a term: close the <DD> that may still be open."""
            self.ddpop()

        def do_dd(self, attrs):
            """Start a definition: close a previous <DD>, open a new margin."""
            self.ddpop()
            self.formatter.push_margin('dd')
            self.list_stack.append(['dd', '', 0])

        def ddpop(self, bl=0):
            """End the paragraph and pop a 'dd' entry if it is innermost.

            bl is the number of blank lines passed to end_paragraph().

            """
            self.formatter.end_paragraph(bl)
            if self.list_stack:
                if self.list_stack[-1][0] == 'dd':
                    del self.list_stack[-1]
                    self.formatter.pop_margin()

        # --- Phrase Markup

        # Idiomatic Elements

        def start_cite(self, attrs): self.start_i(attrs)
        def end_cite(self): self.end_i()

        def start_code(self, attrs): self.start_tt(attrs)
        def end_code(self): self.end_tt()

        def start_em(self, attrs): self.start_i(attrs)
        def end_em(self): self.end_i()

        def start_kbd(self, attrs): self.start_tt(attrs)
        def end_kbd(self): self.end_tt()

        def start_samp(self, attrs): self.start_tt(attrs)
        def end_samp(self): self.end_tt()

        def start_strong(self, attrs): self.start_b(attrs)
        def end_strong(self): self.end_b()

        def start_var(self, attrs): self.start_i(attrs)
        def end_var(self): self.end_i()

        # Typographic Elements

        def start_i(self, attrs):
            """Switch italics on, leaving the other font fields unchanged."""
            self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
        def end_i(self):
            self.formatter.pop_font()

        def start_b(self, attrs):
            """Switch bold on, leaving the other font fields unchanged."""
            self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
        def end_b(self):
            self.formatter.pop_font()

        def start_tt(self, attrs):
            """Switch teletype on, leaving the other font fields unchanged."""
            self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
        def end_tt(self):
            self.formatter.pop_font()

        def start_a(self, attrs):
            """Extract HREF, NAME and TYPE from <A> and call anchor_bgn().

            Attribute values are stripped of surrounding whitespace; TYPE is
            also lower-cased.  Missing attributes are passed as ''.

            """
            href = ''
            name = ''
            anchor_type = ''    # not called 'type', to keep the builtin visible
            for attrname, value in attrs:
                value = value.strip()
                if attrname == 'href':
                    href = value
                elif attrname == 'name':
                    name = value
                elif attrname == 'type':
                    anchor_type = value.lower()
            self.anchor_bgn(href, name, anchor_type)

        def end_a(self):
            self.anchor_end()

        # --- Line Break

        def do_br(self, attrs):
            self.formatter.add_line_break()

        # --- Horizontal Rule

        def do_hr(self, attrs):
            self.formatter.add_hor_rule()

        # --- Image

        def do_img(self, attrs):
            """Collect the <IMG> attributes and call handle_image().

            alt defaults to '(image)'; align, ismap and src default to ''.
            width and height default to 0 and keep that value when the
            attribute is not a valid integer.

            """
            align = ''
            alt = '(image)'
            ismap = ''
            src = ''
            width = 0
            height = 0
            for attrname, value in attrs:
                if attrname == 'align':
                    align = value
                elif attrname == 'alt':
                    alt = value
                elif attrname == 'ismap':
                    ismap = value
                elif attrname == 'src':
                    src = value
                elif attrname == 'width':
                    try: width = int(value)
                    except ValueError: pass
                elif attrname == 'height':
                    try: height = int(value)
                    except ValueError: pass
            self.handle_image(src, alt, ismap, align, width, height)

        # --- Really Old Unofficial Deprecated Stuff

        def do_plaintext(self, attrs):
            """Treat the rest of the input as preformatted text without tags."""
            self.start_pre(attrs)
            self.setnomoretags() # Tell SGML parser

        # --- Unhandled tags

        def unknown_starttag(self, tag, attrs):
            """Ignore start tags that have no handler."""
            pass

        def unknown_endtag(self, tag):
            """Ignore end tags that have no handler."""
            pass
       
   444 
       
   445 
       
   446 def test(args = None):
       
   447     import sys, formatter
       
   448 
       
   449     if not args:
       
   450         args = sys.argv[1:]
       
   451 
       
   452     silent = args and args[0] == '-s'
       
   453     if silent:
       
   454         del args[0]
       
   455 
       
   456     if args:
       
   457         file = args[0]
       
   458     else:
       
   459         file = 'test.html'
       
   460 
       
   461     if file == '-':
       
   462         f = sys.stdin
       
   463     else:
       
   464         try:
       
   465             f = open(file, 'r')
       
   466         except IOError, msg:
       
   467             print file, ":", msg
       
   468             sys.exit(1)
       
   469 
       
   470     data = f.read()
       
   471 
       
   472     if f is not sys.stdin:
       
   473         f.close()
       
   474 
       
   475     if silent:
       
   476         f = formatter.NullFormatter()
       
   477     else:
       
   478         f = formatter.AbstractFormatter(formatter.DumbWriter())
       
   479 
       
   480     p = HTMLParser(f)
       
   481     p.feed(data)
       
   482     p.close()
       
   483 
       
   484 
       
   # Run the command line test driver when executed as a script.
   if __name__ == '__main__':
       test()