|
# Copyright (C) 2004-2006 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw
# Contact: email-sig@python.org

"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line.  This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser.  It returns when there's nothing more it can do with the available
data.  When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never throw a parsing
exception.  Instead, when it finds something unexpected, it adds a 'defect' to
the current message.  Defects are just instances that live on the message
object's .defects attribute.
"""
|
21 |
|
22 __all__ = ['FeedParser'] |
|
23 |
|
24 import re |
|
25 |
|
26 from email import errors |
|
27 from email import message |
|
28 |
|
# The three line-ending conventions recognised by the parser: CRLF, bare CR
# and bare LF.  CRLF is listed first so that it is matched as one terminator.
_EOL_ALTERNATIVES = '\r\n|\r|\n'

# Matches a line ending (used with .match(), i.e. at the start of a string).
NLCRE = re.compile(_EOL_ALTERNATIVES)
# Captures a line ending; used with .match() to find one at the beginning of
# a string.
NLCRE_bol = re.compile('(' + _EOL_ALTERNATIVES + ')')
# Captures a line ending anchored at the end of a string.
NLCRE_eol = re.compile('(' + _EOL_ALTERNATIVES + ')$')
# Captures a line ending; used with .split() to crack data into lines while
# keeping the terminators.
NLCRE_crack = re.compile('(' + _EOL_ALTERNATIVES + ')')

# RFC 2822 section 3.6.8, Optional fields: ftext is %d33-57 / %d59-126, i.e.
# any character except controls, SP, and ":".  The pattern also accepts a
# unix-from line ("From ") and a continuation line (leading tab or space).
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')

EMPTYSTRING = ''
NL = '\n'

# Unique sentinel meaning "no complete line is available yet".
NeedMoreData = object()
|
40 |
|
41 |
|
42 |
|
class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """

    def __init__(self):
        # The last partial (not yet newline-terminated) text pushed into this
        # object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order: the next line to
        # be read is the *last* element of the list.
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a false-EOF predicate; readline() calls it with each line."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the end of input, queueing any trailing partial line."""
        # The partial line is the newest data, so it must be read *after*
        # every line still pending.  pushlines() queues it at the far end of
        # the reversed list; appending it to self._lines would instead make
        # it the very next line returned by readline().
        if self._partial:
            self.pushlines([self._partial])
            self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line.

        Returns the empty string at EOF (closed and drained) or at a false
        EOF (an EOF predicate matched the line), and the NeedMoreData
        sentinel when no line is available but the file is still open.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push a line back so that it is the next one readline() returns."""
        assert line is not NeedMoreData
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data = self._partial + data
        # Crack into lines, but preserve the newlines on the end of each
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE.  In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        # If the data ends in a bare '\r' we cannot yet tell whether it is a
        # complete CR line ending or the first half of a CRLF pair whose '\n'
        # arrives with the next push.  Hold that line back as the partial so
        # a split CRLF is not turned into a line plus a spurious blank line.
        if not self._partial and parts and parts[-1].endswith('\r'):
            eol = parts.pop()
            self._partial = parts.pop() + eol
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s).  Gather up a list of lines after
        # re-attaching the newlines.
        lines = [text + eol for text, eol in zip(parts[::2], parts[1::2])]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue complete lines so they are read after all pending ones."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Return the next line; stop iterating at a (false) EOF.

        NeedMoreData is passed through to the caller rather than ending the
        iteration.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line

    # Iterator protocol spelling used by Python 3.
    __next__ = next
|
130 |
|
131 |
|
132 |
|
class FeedParser:
    """A feed-style parser of email.

    Data is pushed in with feed(); close() finishes the parse and returns the
    root message object.  Problems found while parsing are recorded as
    instances appended to the affected message's .defects list.
    """

    def __init__(self, _factory=message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        # Line buffer that all parsing reads from.
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed; the root is at the bottom.
        self._msgstack = []
        # Bound .next of the parsing generator: each call resumes parsing
        # until the generator yields NeedMoreData or finishes.
        self._parse = self._parsegen().next
        # The message on top of the stack, and the most recently created
        # message (the latter is used for trailing-newline cleanup).
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        # Resume the parsing generator; StopIteration only means that the
        # generator has already run to completion.
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects: a message whose main type is
        # multipart but which did not end up with subparts.
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        # Create a message with the factory, attach it to the message on top
        # of the stack (if any), push it, and make it both the current and
        # the last message.
        msg = self._factory()
        # Parts created under a multipart/digest get message/rfc822 as their
        # default type.
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        # Pop and return the top message; the current message becomes the new
        # top of the stack, or None when the stack is empty.
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        # Generator that parses one message (recursing for nested messages
        # and subparts).  It yields NeedMoreData whenever the input buffer has
        # no complete line available, and is resumed by the next feed() or
        # close().
        #
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away.  Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                # Drive the nested parse, relaying its requests for more
                # data; the loop is left at the first other yielded value or
                # when the nested generator finishes.
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                # The popped header block stays attached to its parent; the
                # local name itself is not used afterwards.
                msg = self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            # Becomes the end boundary's line separator (a true value) if the
            # end boundary line was newline-terminated.
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        # Re-read the boundary line on the next iteration,
                        # this time as a part separator.
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # NOTE(review): the lines read by this loop are not stored,
                # so the epilogue set below is always the empty string --
                # confirm that discarding them is intended.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        # Passed a list of lines that make up the headers for the current msg.
        # Each header is stored on self._cur once its last (possibly folded)
        # line has been seen; lastheader/lastvalue hold the header currently
        # being accumulated.
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            # A non-continuation line ends the header being accumulated.
            if lastheader:
                # XXX reconsider the joining of folded lines
                lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')