Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | # Copyright (C) 2004 Python Software Foundation |
2 | # Authors: Baxter, Wouters and Warsaw | |
3 | # Contact: email-sig@python.org | |
4 | ||
5 | """FeedParser - An email feed parser. | |
6 | ||
7 | The feed parser implements an interface for incrementally parsing an email | |
8 | message, line by line. This has advantages for certain applications, such as | |
9 | those reading email messages off a socket. | |
10 | ||
11 | FeedParser.feed() is the primary interface for pushing new data into the | |
12 | parser. It returns when there's nothing more it can do with the available | |
13 | data. When you have no more data to push into the parser, call .close(). | |
14 | This completes the parsing and returns the root message object. | |
15 | ||
16 | The other advantage of this parser is that it will never throw a parsing | |
17 | exception. Instead, when it finds something unexpected, it adds a 'defect' to | |
18 | the current message. Defects are just instances that live on the message | |
19 | object's .defects attribute. | |
20 | """ | |
21 | ||
22 | import re | |
23 | from email import Errors | |
24 | from email import Message | |
25 | ||
# Line-ending patterns.  CRLF is listed first in each alternation so that it
# is matched as one terminator rather than as a CR followed by an LF.
NLCRE = re.compile('\r\n|\r|\n')
NLCRE_bol = re.compile('(\r\n|\r|\n)')
NLCRE_eol = re.compile('(\r\n|\r|\n)$')
NLCRE_crack = re.compile('(\r\n|\r|\n)')
# RFC 2822 $3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any character
# except controls, SP, and ":".  The RFC defines field-name as 1*ftext, so a
# field name of a single character is legal and must be accepted here.
# The pattern also accepts a Unix-From envelope line and continuation lines
# (lines starting with a tab or a space).
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel returned by BufferedSubFile.readline() (and yielded by the
# parser generator) when no complete line is available yet.
NeedMoreData = object()
37 | ||
38 | ||
39 | \f | |
class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # The last partial line pushed into this object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order: the next line to
        # be read is the last element, the most recently pushed is the first.
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a predicate; a line for which it is true reads as EOF."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Flush any trailing partial line and mark the input as finished."""
        # Don't forget any trailing partial line.  It is the newest data, so
        # it must be queued *behind* every line still waiting to be read; the
        # read end of self._lines is its tail, hence the insert at the front.
        if self._partial:
            self._lines.insert(0, self._partial)
        self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when no complete line is buffered and the
        object has not been closed yet.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in self._eofstack[::-1]:
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push a previously read line back so it is returned next."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE.  In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        # If the data ends in a bare CR we cannot tell yet whether it is a
        # complete line or the first half of a CRLF whose LF arrives with the
        # next chunk.  Hold that line back as the partial line; it is either
        # completed by the next push() or flushed by close().
        if not self._partial and parts and parts[-1].endswith('\r'):
            self._partial = parts.pop(-2) + parts.pop()
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s).  Gather up a list of lines after
        # re-attaching the newlines.
        lines = [parts[i] + parts[i + 1] for i in range(0, len(parts), 2)]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue complete lines, in order, behind the ones already buffered."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Return the next line; stop iterating at (false) EOF.

        NeedMoreData is passed through to the caller like any other value.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line
127 | ||
128 | ||
129 | \f | |
class FeedParser:
    """A feed-style parser of email.

    Data is supplied incrementally with feed(); close() finishes parsing and
    returns the root message object.  Problems found in the input are
    recorded as defects on the affected message instead of being raised.
    """

    def __init__(self, _factory=Message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed, outermost first.
        self._msgstack = []
        # Each call to self._parse() resumes the parsing generator until it
        # needs more input or finishes.
        self._parse = self._parsegen().next
        # The message on top of the stack, and the most recently created one.
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Resume the parser; a finished generator is silently ignored."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(Errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        msg = self._factory()
        # Compare against None explicitly: message objects define __len__, so
        # a plain truth test would depend on how many headers are present.
        if self._cur is not None \
               and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (recursively, its subparts).

        Yields NeedMoreData whenever the input buffer runs dry; the caller
        resumes the generator after more data has been pushed.  The parsed
        message is left on top of the message stack for the caller to pop.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away.  Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                # The nested message is already attached to its parent; just
                # take it off the stack.
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(Errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        # Re-read the boundary line on the next iteration, now
                        # that the preamble has been dealt with.
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            if capturing_preamble:
                self._cur.defects.append(Errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                # NOTE(review): the remaining input is consumed here but not
                # stored, so the epilogue is always set to the empty string.
                # Kept unchanged; confirm whether the text should be retained.
                epilogue = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header lines of the current message on self._cur.

        lines is the list of raw header lines (each normally still carrying
        its line ending).  Continuation lines are joined onto the preceding
        header; malformed or misplaced lines are recorded as defects.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = Errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                # XXX reconsider the joining of folded lines
                # Strip only the trailing line ending, exactly as is done for
                # the final header below, so no real character can be lost.
                lhdr = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = Errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = Errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')