python-2.5.2/win32/Lib/email/feedparser.py
changeset 0 ae805ac0140d
equal deleted inserted replaced
-1:000000000000 0:ae805ac0140d
       
     1 # Copyright (C) 2004-2006 Python Software Foundation
       
     2 # Authors: Baxter, Wouters and Warsaw
       
     3 # Contact: email-sig@python.org
       
     4 
       
     5 """FeedParser - An email feed parser.
       
     6 
       
     7 The feed parser implements an interface for incrementally parsing an email
       
     8 message, line by line.  This has advantages for certain applications, such as
       
     9 those reading email messages off a socket.
       
    10 
       
    11 FeedParser.feed() is the primary interface for pushing new data into the
       
    12 parser.  It returns when there's nothing more it can do with the available
       
    13 data.  When you have no more data to push into the parser, call .close().
       
    14 This completes the parsing and returns the root message object.
       
    15 
       
    16 The other advantage of this parser is that it will never throw a parsing
       
    17 exception.  Instead, when it finds something unexpected, it adds a 'defect' to
       
    18 the current message.  Defects are just instances that live on the message
       
    19 object's .defects attribute.
       
    20 """
       
    21 
       
    22 __all__ = ['FeedParser']
       
    23 
       
    24 import re
       
    25 
       
    26 from email import errors
       
    27 from email import message
       
    28 
       
    29 NLCRE = re.compile('\r\n|\r|\n')
       
    30 NLCRE_bol = re.compile('(\r\n|\r|\n)')
       
    31 NLCRE_eol = re.compile('(\r\n|\r|\n)$')
       
    32 NLCRE_crack = re.compile('(\r\n|\r|\n)')
       
    33 # RFC 2822 $3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any character
       
    34 # except controls, SP, and ":".
       
    35 headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
       
    36 EMPTYSTRING = ''
       
    37 NL = '\n'
       
    38 
       
    39 NeedMoreData = object()
       
    40 
       
    41 
       
    42 
       
    43 class BufferedSubFile(object):
       
    44     """A file-ish object that can have new data loaded into it.
       
    45 
       
    46     You can also push and pop line-matching predicates onto a stack.  When the
       
    47     current predicate matches the current line, a false EOF response
       
    48     (i.e. empty string) is returned instead.  This lets the parser adhere to a
       
    49     simple abstraction -- it parses until EOF closes the current message.
       
    50     """
       
    51     def __init__(self):
       
    52         # The last partial line pushed into this object.
       
    53         self._partial = ''
       
    54         # The list of full, pushed lines, in reverse order
       
    55         self._lines = []
       
    56         # The stack of false-EOF checking predicates.
       
    57         self._eofstack = []
       
    58         # A flag indicating whether the file has been closed or not.
       
    59         self._closed = False
       
    60 
       
    61     def push_eof_matcher(self, pred):
       
    62         self._eofstack.append(pred)
       
    63 
       
    64     def pop_eof_matcher(self):
       
    65         return self._eofstack.pop()
       
    66 
       
    67     def close(self):
       
    68         # Don't forget any trailing partial line.
       
    69         self._lines.append(self._partial)
       
    70         self._partial = ''
       
    71         self._closed = True
       
    72 
       
    73     def readline(self):
       
    74         if not self._lines:
       
    75             if self._closed:
       
    76                 return ''
       
    77             return NeedMoreData
       
    78         # Pop the line off the stack and see if it matches the current
       
    79         # false-EOF predicate.
       
    80         line = self._lines.pop()
       
    81         # RFC 2046, section 5.1.2 requires us to recognize outer level
       
    82         # boundaries at any level of inner nesting.  Do this, but be sure it's
       
    83         # in the order of most to least nested.
       
    84         for ateof in self._eofstack[::-1]:
       
    85             if ateof(line):
       
    86                 # We're at the false EOF.  But push the last line back first.
       
    87                 self._lines.append(line)
       
    88                 return ''
       
    89         return line
       
    90 
       
    91     def unreadline(self, line):
       
    92         # Let the consumer push a line back into the buffer.
       
    93         assert line is not NeedMoreData
       
    94         self._lines.append(line)
       
    95 
       
    96     def push(self, data):
       
    97         """Push some new data into this object."""
       
    98         # Handle any previous leftovers
       
    99         data, self._partial = self._partial + data, ''
       
   100         # Crack into lines, but preserve the newlines on the end of each
       
   101         parts = NLCRE_crack.split(data)
       
   102         # The *ahem* interesting behaviour of re.split when supplied grouping
       
   103         # parentheses is that the last element of the resulting list is the
       
   104         # data after the final RE.  In the case of a NL/CR terminated string,
       
   105         # this is the empty string.
       
   106         self._partial = parts.pop()
       
   107         # parts is a list of strings, alternating between the line contents
       
   108         # and the eol character(s).  Gather up a list of lines after
       
   109         # re-attaching the newlines.
       
   110         lines = []
       
   111         for i in range(len(parts) // 2):
       
   112             lines.append(parts[i*2] + parts[i*2+1])
       
   113         self.pushlines(lines)
       
   114 
       
   115     def pushlines(self, lines):
       
   116         # Reverse and insert at the front of the lines.
       
   117         self._lines[:0] = lines[::-1]
       
   118 
       
   119     def is_closed(self):
       
   120         return self._closed
       
   121 
       
   122     def __iter__(self):
       
   123         return self
       
   124 
       
   125     def next(self):
       
   126         line = self.readline()
       
   127         if line == '':
       
   128             raise StopIteration
       
   129         return line
       
   130 
       
   131 
       
   132 
       
   133 class FeedParser:
       
   134     """A feed-style parser of email."""
       
   135 
       
   136     def __init__(self, _factory=message.Message):
       
   137         """_factory is called with no arguments to create a new message obj"""
       
   138         self._factory = _factory
       
   139         self._input = BufferedSubFile()
       
   140         self._msgstack = []
       
   141         self._parse = self._parsegen().next
       
   142         self._cur = None
       
   143         self._last = None
       
   144         self._headersonly = False
       
   145 
       
   146     # Non-public interface for supporting Parser's headersonly flag
       
   147     def _set_headersonly(self):
       
   148         self._headersonly = True
       
   149 
       
   150     def feed(self, data):
       
   151         """Push more data into the parser."""
       
   152         self._input.push(data)
       
   153         self._call_parse()
       
   154 
       
   155     def _call_parse(self):
       
   156         try:
       
   157             self._parse()
       
   158         except StopIteration:
       
   159             pass
       
   160 
       
   161     def close(self):
       
   162         """Parse all remaining data and return the root message object."""
       
   163         self._input.close()
       
   164         self._call_parse()
       
   165         root = self._pop_message()
       
   166         assert not self._msgstack
       
   167         # Look for final set of defects
       
   168         if root.get_content_maintype() == 'multipart' \
       
   169                and not root.is_multipart():
       
   170             root.defects.append(errors.MultipartInvariantViolationDefect())
       
   171         return root
       
   172 
       
   173     def _new_message(self):
       
   174         msg = self._factory()
       
   175         if self._cur and self._cur.get_content_type() == 'multipart/digest':
       
   176             msg.set_default_type('message/rfc822')
       
   177         if self._msgstack:
       
   178             self._msgstack[-1].attach(msg)
       
   179         self._msgstack.append(msg)
       
   180         self._cur = msg
       
   181         self._last = msg
       
   182 
       
   183     def _pop_message(self):
       
   184         retval = self._msgstack.pop()
       
   185         if self._msgstack:
       
   186             self._cur = self._msgstack[-1]
       
   187         else:
       
   188             self._cur = None
       
   189         return retval
       
   190 
       
   191     def _parsegen(self):
       
   192         # Create a new message and start by parsing headers.
       
   193         self._new_message()
       
   194         headers = []
       
   195         # Collect the headers, searching for a line that doesn't match the RFC
       
   196         # 2822 header or continuation pattern (including an empty line).
       
   197         for line in self._input:
       
   198             if line is NeedMoreData:
       
   199                 yield NeedMoreData
       
   200                 continue
       
   201             if not headerRE.match(line):
       
   202                 # If we saw the RFC defined header/body separator
       
   203                 # (i.e. newline), just throw it away. Otherwise the line is
       
   204                 # part of the body so push it back.
       
   205                 if not NLCRE.match(line):
       
   206                     self._input.unreadline(line)
       
   207                 break
       
   208             headers.append(line)
       
   209         # Done with the headers, so parse them and figure out what we're
       
   210         # supposed to see in the body of the message.
       
   211         self._parse_headers(headers)
       
   212         # Headers-only parsing is a backwards compatibility hack, which was
       
   213         # necessary in the older parser, which could throw errors.  All
       
   214         # remaining lines in the input are thrown into the message body.
       
   215         if self._headersonly:
       
   216             lines = []
       
   217             while True:
       
   218                 line = self._input.readline()
       
   219                 if line is NeedMoreData:
       
   220                     yield NeedMoreData
       
   221                     continue
       
   222                 if line == '':
       
   223                     break
       
   224                 lines.append(line)
       
   225             self._cur.set_payload(EMPTYSTRING.join(lines))
       
   226             return
       
   227         if self._cur.get_content_type() == 'message/delivery-status':
       
   228             # message/delivery-status contains blocks of headers separated by
       
   229             # a blank line.  We'll represent each header block as a separate
       
   230             # nested message object, but the processing is a bit different
       
   231             # than standard message/* types because there is no body for the
       
   232             # nested messages.  A blank line separates the subparts.
       
   233             while True:
       
   234                 self._input.push_eof_matcher(NLCRE.match)
       
   235                 for retval in self._parsegen():
       
   236                     if retval is NeedMoreData:
       
   237                         yield NeedMoreData
       
   238                         continue
       
   239                     break
       
   240                 msg = self._pop_message()
       
   241                 # We need to pop the EOF matcher in order to tell if we're at
       
   242                 # the end of the current file, not the end of the last block
       
   243                 # of message headers.
       
   244                 self._input.pop_eof_matcher()
       
   245                 # The input stream must be sitting at the newline or at the
       
   246                 # EOF.  We want to see if we're at the end of this subpart, so
       
   247                 # first consume the blank line, then test the next line to see
       
   248                 # if we're at this subpart's EOF.
       
   249                 while True:
       
   250                     line = self._input.readline()
       
   251                     if line is NeedMoreData:
       
   252                         yield NeedMoreData
       
   253                         continue
       
   254                     break
       
   255                 while True:
       
   256                     line = self._input.readline()
       
   257                     if line is NeedMoreData:
       
   258                         yield NeedMoreData
       
   259                         continue
       
   260                     break
       
   261                 if line == '':
       
   262                     break
       
   263                 # Not at EOF so this is a line we're going to need.
       
   264                 self._input.unreadline(line)
       
   265             return
       
   266         if self._cur.get_content_maintype() == 'message':
       
   267             # The message claims to be a message/* type, then what follows is
       
   268             # another RFC 2822 message.
       
   269             for retval in self._parsegen():
       
   270                 if retval is NeedMoreData:
       
   271                     yield NeedMoreData
       
   272                     continue
       
   273                 break
       
   274             self._pop_message()
       
   275             return
       
   276         if self._cur.get_content_maintype() == 'multipart':
       
   277             boundary = self._cur.get_boundary()
       
   278             if boundary is None:
       
   279                 # The message /claims/ to be a multipart but it has not
       
   280                 # defined a boundary.  That's a problem which we'll handle by
       
   281                 # reading everything until the EOF and marking the message as
       
   282                 # defective.
       
   283                 self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
       
   284                 lines = []
       
   285                 for line in self._input:
       
   286                     if line is NeedMoreData:
       
   287                         yield NeedMoreData
       
   288                         continue
       
   289                     lines.append(line)
       
   290                 self._cur.set_payload(EMPTYSTRING.join(lines))
       
   291                 return
       
   292             # Create a line match predicate which matches the inter-part
       
   293             # boundary as well as the end-of-multipart boundary.  Don't push
       
   294             # this onto the input stream until we've scanned past the
       
   295             # preamble.
       
   296             separator = '--' + boundary
       
   297             boundaryre = re.compile(
       
   298                 '(?P<sep>' + re.escape(separator) +
       
   299                 r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
       
   300             capturing_preamble = True
       
   301             preamble = []
       
   302             linesep = False
       
   303             while True:
       
   304                 line = self._input.readline()
       
   305                 if line is NeedMoreData:
       
   306                     yield NeedMoreData
       
   307                     continue
       
   308                 if line == '':
       
   309                     break
       
   310                 mo = boundaryre.match(line)
       
   311                 if mo:
       
   312                     # If we're looking at the end boundary, we're done with
       
   313                     # this multipart.  If there was a newline at the end of
       
   314                     # the closing boundary, then we need to initialize the
       
   315                     # epilogue with the empty string (see below).
       
   316                     if mo.group('end'):
       
   317                         linesep = mo.group('linesep')
       
   318                         break
       
   319                     # We saw an inter-part boundary.  Were we in the preamble?
       
   320                     if capturing_preamble:
       
   321                         if preamble:
       
   322                             # According to RFC 2046, the last newline belongs
       
   323                             # to the boundary.
       
   324                             lastline = preamble[-1]
       
   325                             eolmo = NLCRE_eol.search(lastline)
       
   326                             if eolmo:
       
   327                                 preamble[-1] = lastline[:-len(eolmo.group(0))]
       
   328                             self._cur.preamble = EMPTYSTRING.join(preamble)
       
   329                         capturing_preamble = False
       
   330                         self._input.unreadline(line)
       
   331                         continue
       
   332                     # We saw a boundary separating two parts.  Consume any
       
   333                     # multiple boundary lines that may be following.  Our
       
   334                     # interpretation of RFC 2046 BNF grammar does not produce
       
   335                     # body parts within such double boundaries.
       
   336                     while True:
       
   337                         line = self._input.readline()
       
   338                         if line is NeedMoreData:
       
   339                             yield NeedMoreData
       
   340                             continue
       
   341                         mo = boundaryre.match(line)
       
   342                         if not mo:
       
   343                             self._input.unreadline(line)
       
   344                             break
       
   345                     # Recurse to parse this subpart; the input stream points
       
   346                     # at the subpart's first line.
       
   347                     self._input.push_eof_matcher(boundaryre.match)
       
   348                     for retval in self._parsegen():
       
   349                         if retval is NeedMoreData:
       
   350                             yield NeedMoreData
       
   351                             continue
       
   352                         break
       
   353                     # Because of RFC 2046, the newline preceding the boundary
       
   354                     # separator actually belongs to the boundary, not the
       
   355                     # previous subpart's payload (or epilogue if the previous
       
   356                     # part is a multipart).
       
   357                     if self._last.get_content_maintype() == 'multipart':
       
   358                         epilogue = self._last.epilogue
       
   359                         if epilogue == '':
       
   360                             self._last.epilogue = None
       
   361                         elif epilogue is not None:
       
   362                             mo = NLCRE_eol.search(epilogue)
       
   363                             if mo:
       
   364                                 end = len(mo.group(0))
       
   365                                 self._last.epilogue = epilogue[:-end]
       
   366                     else:
       
   367                         payload = self._last.get_payload()
       
   368                         if isinstance(payload, basestring):
       
   369                             mo = NLCRE_eol.search(payload)
       
   370                             if mo:
       
   371                                 payload = payload[:-len(mo.group(0))]
       
   372                                 self._last.set_payload(payload)
       
   373                     self._input.pop_eof_matcher()
       
   374                     self._pop_message()
       
   375                     # Set the multipart up for newline cleansing, which will
       
   376                     # happen if we're in a nested multipart.
       
   377                     self._last = self._cur
       
   378                 else:
       
   379                     # I think we must be in the preamble
       
   380                     assert capturing_preamble
       
   381                     preamble.append(line)
       
   382             # We've seen either the EOF or the end boundary.  If we're still
       
   383             # capturing the preamble, we never saw the start boundary.  Note
       
   384             # that as a defect and store the captured text as the payload.
       
   385             # Everything from here to the EOF is epilogue.
       
   386             if capturing_preamble:
       
   387                 self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
       
   388                 self._cur.set_payload(EMPTYSTRING.join(preamble))
       
   389                 epilogue = []
       
   390                 for line in self._input:
       
   391                     if line is NeedMoreData:
       
   392                         yield NeedMoreData
       
   393                         continue
       
   394                 self._cur.epilogue = EMPTYSTRING.join(epilogue)
       
   395                 return
       
   396             # If the end boundary ended in a newline, we'll need to make sure
       
   397             # the epilogue isn't None
       
   398             if linesep:
       
   399                 epilogue = ['']
       
   400             else:
       
   401                 epilogue = []
       
   402             for line in self._input:
       
   403                 if line is NeedMoreData:
       
   404                     yield NeedMoreData
       
   405                     continue
       
   406                 epilogue.append(line)
       
   407             # Any CRLF at the front of the epilogue is not technically part of
       
   408             # the epilogue.  Also, watch out for an empty string epilogue,
       
   409             # which means a single newline.
       
   410             if epilogue:
       
   411                 firstline = epilogue[0]
       
   412                 bolmo = NLCRE_bol.match(firstline)
       
   413                 if bolmo:
       
   414                     epilogue[0] = firstline[len(bolmo.group(0)):]
       
   415             self._cur.epilogue = EMPTYSTRING.join(epilogue)
       
   416             return
       
   417         # Otherwise, it's some non-multipart type, so the entire rest of the
       
   418         # file contents becomes the payload.
       
   419         lines = []
       
   420         for line in self._input:
       
   421             if line is NeedMoreData:
       
   422                 yield NeedMoreData
       
   423                 continue
       
   424             lines.append(line)
       
   425         self._cur.set_payload(EMPTYSTRING.join(lines))
       
   426 
       
   427     def _parse_headers(self, lines):
       
   428         # Passed a list of lines that make up the headers for the current msg
       
   429         lastheader = ''
       
   430         lastvalue = []
       
   431         for lineno, line in enumerate(lines):
       
   432             # Check for continuation
       
   433             if line[0] in ' \t':
       
   434                 if not lastheader:
       
   435                     # The first line of the headers was a continuation.  This
       
   436                     # is illegal, so let's note the defect, store the illegal
       
   437                     # line, and ignore it for purposes of headers.
       
   438                     defect = errors.FirstHeaderLineIsContinuationDefect(line)
       
   439                     self._cur.defects.append(defect)
       
   440                     continue
       
   441                 lastvalue.append(line)
       
   442                 continue
       
   443             if lastheader:
       
   444                 # XXX reconsider the joining of folded lines
       
   445                 lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
       
   446                 self._cur[lastheader] = lhdr
       
   447                 lastheader, lastvalue = '', []
       
   448             # Check for envelope header, i.e. unix-from
       
   449             if line.startswith('From '):
       
   450                 if lineno == 0:
       
   451                     # Strip off the trailing newline
       
   452                     mo = NLCRE_eol.search(line)
       
   453                     if mo:
       
   454                         line = line[:-len(mo.group(0))]
       
   455                     self._cur.set_unixfrom(line)
       
   456                     continue
       
   457                 elif lineno == len(lines) - 1:
       
   458                     # Something looking like a unix-from at the end - it's
       
   459                     # probably the first line of the body, so push back the
       
   460                     # line and stop.
       
   461                     self._input.unreadline(line)
       
   462                     return
       
   463                 else:
       
   464                     # Weirdly placed unix-from line.  Note this as a defect
       
   465                     # and ignore it.
       
   466                     defect = errors.MisplacedEnvelopeHeaderDefect(line)
       
   467                     self._cur.defects.append(defect)
       
   468                     continue
       
   469             # Split the line on the colon separating field name from value.
       
   470             i = line.find(':')
       
   471             if i < 0:
       
   472                 defect = errors.MalformedHeaderDefect(line)
       
   473                 self._cur.defects.append(defect)
       
   474                 continue
       
   475             lastheader = line[:i]
       
   476             lastvalue = [line[i+1:].lstrip()]
       
   477         # Done with all the lines, so handle the last header.
       
   478         if lastheader:
       
   479             # XXX reconsider the joining of folded lines
       
   480             self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')