Initial commit of OpenSPARC T2 architecture model.
[OpenSPARC-T2-SAM] / sam-t2 / devtools / v9 / lib / python2.4 / email / FeedParser.py
CommitLineData
920dae64
AT
1# Copyright (C) 2004 Python Software Foundation
2# Authors: Baxter, Wouters and Warsaw
3# Contact: email-sig@python.org
4
5"""FeedParser - An email feed parser.
6
7The feed parser implements an interface for incrementally parsing an email
8message, line by line. This has advantages for certain applications, such as
9those reading email messages off a socket.
10
11FeedParser.feed() is the primary interface for pushing new data into the
12parser. It returns when there's nothing more it can do with the available
13data. When you have no more data to push into the parser, call .close().
14This completes the parsing and returns the root message object.
15
16The other advantage of this parser is that it will never throw a parsing
17exception. Instead, when it finds something unexpected, it adds a 'defect' to
18the current message. Defects are just instances that live on the message
19object's .defects attribute.
20"""
21
22import re
23from email import Errors
24from email import Message
25
# Line-ending patterns.  Each one recognises CRLF, a bare CR, or a bare LF.
#
# NLCRE       - no anchoring of its own; used with .match() to test whether a
#               line *starts* with a line ending (the blank header/body
#               separator line, and the false-EOF test for
#               message/delivery-status blocks).
# NLCRE_bol   - same alternatives inside a capturing group; used with
#               .match() to strip a leading newline from the epilogue.
# NLCRE_eol   - anchored with '$'; used with .search() to find (and then
#               strip) the newline at the very end of a string.
# NLCRE_crack - capturing group so that re.split() keeps the separators in
#               its result; used to crack pushed data into lines.
NLCRE = re.compile('\r\n|\r|\n')
NLCRE_bol = re.compile('(\r\n|\r|\n)')
NLCRE_eol = re.compile('(\r\n|\r|\n)$')
NLCRE_crack = re.compile('(\r\n|\r|\n)')
# RFC 2822 section 3.6.8, Optional fields: ftext is %d33-57 / %d59-126, i.e.
# any character except controls, SP, and ":".
#
# headerRE matches a line that belongs to the header block: either a
# "From " envelope line, or a field name of at least two ftext characters
# followed by a colon, or a continuation line (leading tab or space).
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{2,}:|[\t ])')
# Joiner used to concatenate collected lines back into one string.
EMPTYSTRING = ''
# Not referenced by the code visible in this file.
NL = '\n'

# Unique sentinel returned by BufferedSubFile.readline() (and yielded by the
# parser generator) when no complete line is available and the input has not
# been closed yet.  Always compared by identity ("is"), never by equality.
NeedMoreData = object()
37
38
39\f
class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    Data arrives in arbitrary chunks through push() and is handed back one
    complete line at a time through readline() (or by iterating).

    You can also push and pop line-matching predicates onto a stack.  When
    any pushed predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead and the line stays buffered.
    This lets the parser adhere to a simple abstraction -- it parses until
    EOF closes the current message.
    """
    def __init__(self):
        # The last partial line pushed into this object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order: the next line to
        # be read is the *last* element, so reading is a cheap list.pop().
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push *pred*, a callable taking a line, onto the false-EOF stack."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the input as complete; no more data will be pushed.

        Any trailing partial line becomes the final readable line.
        """
        # Don't forget any trailing partial line.  It must be queued *behind*
        # every line that is still unread; because self._lines is stored in
        # reverse order, a plain append() would hand it out first.  An empty
        # partial is not queued at all, since an empty string in the buffer
        # would read as a premature EOF.
        if self._partial:
            self.pushlines([self._partial])
        self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false or real) EOF, or NeedMoreData.

        NeedMoreData is returned when the buffer is empty but the object has
        not been closed.  When a false-EOF predicate matches, the matching
        line is left in the buffer so it can be read again once the
        predicate has been popped.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in self._eofstack[::-1]:
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push *line* back so that it is the next line readline() sees."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE.  In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        # If the chunk ends in a bare CR we cannot yet tell whether it is a
        # complete line ending or the first half of a CRLF whose LF arrives
        # with the next chunk.  Hold that last line back as the partial line
        # so the pair is not split into two lines; close() flushes it if no
        # more data ever arrives.
        if not self._partial and parts and parts[-1] == '\r':
            eol = parts.pop()
            self._partial = parts.pop() + eol
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s).  Gather up a list of lines after
        # re-attaching the newlines.
        lines = []
        for i in range(len(parts) // 2):
            lines.append(parts[i*2] + parts[i*2+1])
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue *lines* (in reading order) behind all unread lines."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Iterator protocol: stop at EOF, but pass NeedMoreData through."""
        line = self.readline()
        if line == '':
            raise StopIteration
        return line
127
128
129\f
class FeedParser:
    """A feed-style parser of email.

    Data is supplied incrementally with feed(); close() finishes parsing and
    returns the root message object.  Parsing is driven by the generator
    returned from _parsegen(), which yields NeedMoreData whenever the
    buffered input runs dry and is resumed by the next feed()/close() call.
    Problems found in the input are recorded on the message objects'
    .defects lists.
    """

    def __init__(self, _factory=Message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        # Line buffer that all input passes through.
        self._input = BufferedSubFile()
        # Stack of messages under construction; the root is at index 0.
        self._msgstack = []
        # Bound .next of the top-level parsing generator; each call resumes
        # parsing until more data is needed or parsing is finished.
        self._parse = self._parsegen().next
        # Message currently being parsed (top of _msgstack), or None.
        self._cur = None
        # Most recently created message; used for the post-subpart newline
        # clean-up in _parsegen().
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        """Make the parser store everything after the headers as payload."""
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Resume the parsing generator once.

        StopIteration (the generator has already finished) is ignored.
        """
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(Errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        msg = self._factory()
        # Parts of a multipart/digest default to message/rfc822.
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current.

        self._cur is set to None when the stack becomes empty.
        """
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (recursively, its subparts).

        Yields NeedMoreData each time the input buffer is empty but not yet
        closed; the caller resumes the generator after pushing more data.
        Nested messages are parsed by recursive _parsegen() generators whose
        NeedMoreData values are re-yielded to the caller.  The new message is
        left on the message stack; the caller is responsible for popping it.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away.  Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                # A line starting with a newline acts as EOF for the nested
                # header block.
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                msg = self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(Errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            # Becomes the line ending of the closing boundary (truthy) if the
            # closing boundary is seen and is newline-terminated.
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        # Re-read the boundary line on the next iteration,
                        # this time as the start of the first subpart.
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(Errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # NOTE(review): the remaining lines are read but never
                # appended, so the epilogue stored below is always the empty
                # string -- confirm that discarding them is intended.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header block *lines* on the current message.

        lines is the list of raw header lines (each normally still carrying
        its line ending) collected by _parsegen().  Continuation lines are
        folded into the preceding header's value, a leading "From " line is
        stored as the unix-from envelope, and malformed input is recorded as
        defects on the current message instead of raising.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = Errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            # A non-continuation line ends the header collected so far.
            if lastheader:
                # XXX reconsider the joining of folded lines
                # The [:-1] drops the final character of the joined value
                # (the last line's line-ending character) before any
                # remaining CR/LF characters are stripped.
                lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = Errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = Errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')