[OpenSPARC-T2-SAM] / sam-t2 / devtools / v8plus / lib / python2.4 / email / Header.py

# Copyright (C) 2002-2004 Python Software Foundation
# Author: Ben Gertzfield, Barry Warsaw
# Contact: email-sig@python.org

"""Header encoding and decoding functionality."""

import re
import binascii

import email.quopriMIME
import email.base64MIME
from email.Errors import HeaderParseError
from email.Charset import Charset

# String constants used when joining and measuring header text.
NL = '\n'
SPACE = ' '
USPACE = u' '
# A hard tab in continuation whitespace is counted as this many columns.
SPACE8 = ' ' * 8
UEMPTYSTRING = u''

# Line length used by Header when the caller does not pass maxlinelen.
MAXLINELEN = 76

# Charsets shared by Header: us-ascii is the default charset and the first
# one tried when encoding unicode input; utf-8 is the last one tried.
USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')

# Match encoded-word strings in the form =?charset?q?Hello_World?=
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)

# Field name regexp, including trailing colon, but not separating whitespace,
# according to RFC 2822.  Character range is from exclamation mark (octal
# 041) to tilde (octal 176).  For use with .match()
fcre = re.compile(r'[\041-\176]+:$')


\f
# Helpers
_max_append = email.quopriMIME._max_append


\f
def decode_header(header):
    """Decode a message header value without converting charset.

    Returns a list of (decoded_string, charset) pairs containing each of the
    decoded parts of the header.  Charset is None for non-encoded parts of the
    header, otherwise a lower-case string containing the name of the character
    set specified in the encoded string.

    Consecutive encoded words that name the same charset are concatenated
    into a single pair.  Within a line that contains encoded words, an
    unencoded fragment that follows an unencoded pair is appended to that
    pair after a single space.

    Base64 encoded words whose trailing `=' padding has been dropped are
    padded out before decoding instead of being rejected.

    An email.Errors.HeaderParseError may be raised when certain decoding error
    occurs (e.g. a base64 decoding exception).
    """
    # If no encoding, just return the header
    header = str(header)
    if not ecre.search(header):
        return [(header, None)]
    decoded = []
    for line in header.splitlines():
        # This line might not have an encoding in it
        if not ecre.search(line):
            decoded.append((line, None))
            continue
        # ecre has three groups, so split() produces a flat list shaped like
        # [unencoded, charset, encoding, encoded, unencoded, ...].
        parts = ecre.split(line)
        while parts:
            unenc = parts.pop(0).strip()
            if unenc:
                # Should we continue a long line?
                if decoded and decoded[-1][1] is None:
                    decoded[-1] = (decoded[-1][0] + SPACE + unenc, None)
                else:
                    decoded.append((unenc, None))
            if parts:
                charset, encoding = [s.lower() for s in parts[0:2]]
                encoded = parts[2]
                dec = None
                if encoding == 'q':
                    dec = email.quopriMIME.header_decode(encoded)
                elif encoding == 'b':
                    # Be liberal in what we accept: some mailers strip the
                    # trailing `=' padding, so restore it before decoding.
                    paderr = len(encoded) % 4
                    if paderr:
                        encoded += '==='[:4 - paderr]
                    try:
                        dec = email.base64MIME.decode(encoded)
                    except binascii.Error:
                        # Turn this into a higher level exception.  BAW: Right
                        # now we throw the lower level exception away but
                        # when/if we get exception chaining, we'll preserve it.
                        raise HeaderParseError
                if dec is None:
                    dec = encoded

                if decoded and decoded[-1][1] == charset:
                    decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
                else:
                    decoded.append((dec, charset))
            # Drop the charset/encoding/encoded triple just handled; the next
            # element, if any, is the following unencoded fragment.
            del parts[0:3]
    return decoded


\f
def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header from (decoded_string, charset) pairs.

    decoded_seq is a sequence of pairs in the format returned by
    decode_header(): each pair is (decoded_string, charset), where charset is
    None, the string name of a character set, or a Charset instance.

    Optional maxlinelen, header_name, and continuation_ws are handed to the
    Header constructor unchanged.  Returns the new Header instance.
    """
    result = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, cs in decoded_seq:
        if cs is None or isinstance(cs, Charset):
            # None is passed straight through; Header.append() substitutes
            # the header's default charset for it.
            result.append(text, cs)
        else:
            result.append(text, Charset(cs))
    return result


\f
class Header:
    """A MIME header value assembled from chunks in one or more charsets.

    Every chunk is kept as a (string, Charset) pair in self._chunks.
    encode() splits the chunks to fit the configured line lengths and
    returns them as a single string.
    """

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value.  If None, the initial header
        value is not set.  You can later append to the header with .append()
        method calls.  s may be a byte string or a Unicode string, but see the
        .append() documentation for semantics.

        Optional charset serves two purposes: it has the same meaning as the
        charset argument to the .append() method.  It also sets the default
        character set for all subsequent .append() calls that omit the charset
        argument.  If charset is not provided in the constructor, the us-ascii
        charset is used both as s's initial charset and as the default for
        subsequent .append() calls.

        The maximum line length can be specified explicit via maxlinelen.  For
        splitting the first line to a shorter value (to account for the field
        header which isn't included in s, e.g. `Subject') pass in the name of
        the field in header_name.  The default maxlinelen is 76.

        continuation_ws must be RFC 2822 compliant folding whitespace (usually
        either a space or a hard tab) which will be prepended to continuation
        lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        if not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        # A hard tab in the continuation whitespace counts as eight columns.
        cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
        # BAW: I believe `chunks' and `maxlinelen' should be non-public.
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        if header_name is None:
            # We don't know anything about the field header so the first line
            # is the same length as subsequent lines.
            self._firstlinelen = maxlinelen
        else:
            # The first line should be shorter to take into account the field
            # header.  Also subtract off 2 extra for the colon and space.
            self._firstlinelen = maxlinelen - len(header_name) - 2
        # Second and subsequent lines should subtract off the length in
        # columns of the continuation whitespace prefix.
        self._maxlinelen = maxlinelen - cws_expanded_len

    def __str__(self):
        """A synonym for self.encode()."""
        return self.encode()

    def __unicode__(self):
        """Helper for the built-in unicode function."""
        uchunks = []
        lastcs = None
        for s, charset in self._chunks:
            # We must preserve spaces between encoded and non-encoded word
            # boundaries, which means for us we need to add a space when we go
            # from a charset to None/us-ascii, or from None/us-ascii to a
            # charset.  Only do this for the second and subsequent chunks.
            nextcs = charset
            if uchunks:
                if lastcs not in (None, 'us-ascii'):
                    if nextcs in (None, 'us-ascii'):
                        uchunks.append(USPACE)
                        nextcs = None
                elif nextcs not in (None, 'us-ascii'):
                    uchunks.append(USPACE)
            lastcs = nextcs
            uchunks.append(unicode(s, str(charset)))
        return UEMPTYSTRING.join(uchunks)

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        """Compare the encoded form of this header with other."""
        # other may be a Header or a string.  Both are fine so coerce
        # ourselves to a string, swap the args and do another comparison.
        return other == self.encode()

    def __ne__(self, other):
        """Return the negation of __eq__."""
        return not self == other

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset, if given, should be a Charset instance or the name
        of a character set (which will be converted to a Charset instance).  A
        value of None (the default) means that the charset given in the
        constructor is used.

        s may be a byte string or a Unicode string.  If it is a byte string
        (i.e. isinstance(s, str) is true), then charset is the encoding of
        that byte string, and a UnicodeError will be raised if the string
        cannot be decoded with that charset.  If s is a Unicode string, then
        charset is a hint specifying the character set of the characters in
        the string.  In this case, when producing an RFC 2822 compliant header
        using RFC 2047 rules, the Unicode string will be encoded using the
        following charsets in order: us-ascii, the charset hint, utf-8.  The
        first character set not to provoke a UnicodeError is used.  If all
        three fail, UnicodeError is raised.

        Optional `errors' is passed as the third argument to any unicode() or
        ustr.encode() call.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        # If the charset is our faux 8bit charset, leave the string unchanged
        if charset != '8bit':
            # We need to test that the string can be converted to unicode and
            # back to a byte string, given the input and output codecs of the
            # charset.
            if isinstance(s, str):
                # Possibly raise UnicodeError if the byte string can't be
                # converted to a unicode with the input codec of the charset.
                incodec = charset.input_codec or 'us-ascii'
                ustr = unicode(s, incodec, errors)
                # Now make sure that the unicode could be converted back to a
                # byte string with the output codec, which may be different
                # than the input codec.  Still, use the original byte string.
                outcodec = charset.output_codec or 'us-ascii'
                ustr.encode(outcodec, errors)
            elif isinstance(s, unicode):
                # Now we have to be sure the unicode string can be converted
                # to a byte string with a reasonable output codec.  We want to
                # use the byte string in the chunk, paired with whichever
                # charset managed to encode it.
                for candidate in USASCII, charset, UTF8:
                    try:
                        outcodec = candidate.output_codec or 'us-ascii'
                        s = s.encode(outcodec, errors)
                    except UnicodeError:
                        continue
                    charset = candidate
                    break
                else:
                    # Raise explicitly rather than assert, so that running
                    # with -O can't store an unencoded unicode chunk.
                    raise UnicodeError('utf-8 conversion failed')
        self._chunks.append((s, charset))

    def _split(self, s, charset, maxlinelen, splitchars):
        # Split up a header safely for use with encode_chunks.  Returns a
        # list of (string, charset) pairs.  maxlinelen applies to the first
        # piece only; the remainder is split against self._maxlinelen.
        splittable = charset.to_splittable(s)
        encoded = charset.from_splittable(splittable, True)
        elen = charset.encoded_header_len(encoded)
        # If the line's encoded length fits, just return it
        if elen <= maxlinelen:
            return [(encoded, charset)]
        # If we have undetermined raw 8bit characters sitting in a byte
        # string, we really don't know what the right thing to do is.  We
        # can't really split it because it might be multibyte data which we
        # could break if we split it between pairs.  The least harm seems to
        # be to not split the header at all, but that means they could go out
        # longer than maxlinelen.
        if charset == '8bit':
            return [(s, charset)]
        # BAW: I'm not sure what the right test here is.  What we're trying to
        # do is be faithful to RFC 2822's recommendation that ($2.2.3):
        #
        # "Note: Though structured field bodies are defined in such a way that
        #  folding can take place between many of the lexical tokens (and even
        #  within some of the lexical tokens), folding SHOULD be limited to
        #  placing the CRLF at higher-level syntactic breaks."
        #
        # For now, I can only imagine doing this when the charset is us-ascii,
        # although it's possible that other charsets may also benefit from the
        # higher-level syntactic breaks.
        elif charset == 'us-ascii':
            return self._split_ascii(s, charset, maxlinelen, splitchars)
        # BAW: should we use encoded?
        elif elen == len(s):
            # We can split on _maxlinelen boundaries because we know that the
            # encoding won't change the size of the string
            splitpnt = maxlinelen
            first = charset.from_splittable(splittable[:splitpnt], False)
            last = charset.from_splittable(splittable[splitpnt:], False)
        else:
            # Binary search for split point
            first, last = _binsplit(splittable, charset, maxlinelen)
        # first is of the proper length so just wrap it in the appropriate
        # chrome.  last must be recursively split.
        fsplittable = charset.to_splittable(first)
        fencoded = charset.from_splittable(fsplittable, True)
        chunk = [(fencoded, charset)]
        return chunk + self._split(last, charset, self._maxlinelen, splitchars)

    def _split_ascii(self, s, charset, firstlen, splitchars):
        # Delegate to the module-level _split_ascii() (the name resolves to
        # the global function, not this method) and pair every resulting
        # line with charset.
        chunks = _split_ascii(s, firstlen, self._maxlinelen,
                              self._continuation_ws, splitchars)
        return zip(chunks, [charset]*len(chunks))

    def _encode_chunks(self, newchunks, maxlinelen):
        # MIME-encode a header with many different charsets and/or encodings.
        #
        # Given a list of pairs (string, charset), return a MIME-encoded
        # string suitable for use in a header field.  Each pair may have
        # different charsets and/or encodings, and the resulting header will
        # accurately reflect each setting.
        #
        # Each encoding can be email.Utils.QP (quoted-printable, for
        # ASCII-like character sets like iso-8859-1), email.Utils.BASE64
        # (Base64, for non-ASCII like character sets like KOI8-R and
        # iso-2022-jp), or None (no encoding).
        #
        # Each pair will be represented on a separate line; the resulting
        # string will be in the format:
        #
        # =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
        #  =?charset2?b?SvxyZ2VuIEL2aW5n?="
        chunks = []
        for header, charset in newchunks:
            # Empty pieces contribute nothing to the output.
            if not header:
                continue
            if charset is None or charset.header_encoding is None:
                s = header
            else:
                s = charset.header_encode(header)
            # Don't add more folding whitespace than necessary
            if chunks and chunks[-1].endswith(' '):
                extra = ''
            else:
                extra = ' '
            # _max_append comes from email.quopriMIME; presumably it either
            # extends the last entry of chunks or starts a new one depending
            # on maxlinelen -- confirm against that module.
            _max_append(chunks, s, maxlinelen, extra)
        joiner = NL + self._continuation_ws
        return joiner.join(chunks)

    def encode(self, splitchars=';, '):
        """Encode a message header into an RFC-compliant format.

        There are many issues involved in converting a given string for use in
        an email header.  Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings.  In addition, there is a
        75-character length limit on any given encoded header field, so
        line-wrapping must be performed, even with double-byte character sets.

        This method will do its best to convert the string to the correct
        character set used in email, and encode and line wrap it safely with
        the appropriate scheme for that character set.

        If the given charset is not known or an error occurs during
        conversion, this function will return the header untouched.

        Optional splitchars is a string containing characters to split long
        ASCII lines on, in rough support of RFC 2822's `highest level
        syntactic breaks'.  This doesn't affect RFC 2047 encoded lines.
        """
        newchunks = []
        maxlinelen = self._firstlinelen
        lastlen = 0
        for s, charset in self._chunks:
            # The first bit of the next chunk should be just long enough to
            # fill the next line.  Don't forget the space separating the
            # encoded words.
            targetlen = maxlinelen - lastlen - 1
            if targetlen < charset.encoded_header_len(''):
                # Stick it on the next line
                targetlen = maxlinelen
            newchunks += self._split(s, charset, targetlen, splitchars)
            lastchunk, lastcharset = newchunks[-1]
            lastlen = lastcharset.encoded_header_len(lastchunk)
        return self._encode_chunks(newchunks, maxlinelen)


\f
def _split_ascii(s, firstlen, restlen, continuation_ws, splitchars):
    lines = []
    maxlen = firstlen
    for line in s.splitlines():
        # Ignore any leading whitespace (i.e. continuation whitespace) already
        # on the line, since we'll be adding our own.
        line = line.lstrip()
        if len(line) < maxlen:
            lines.append(line)
            maxlen = restlen
            continue
        # Attempt to split the line at the highest-level syntactic break
        # possible.  Note that we don't have a lot of smarts about field
        # syntax; we just try to break on semi-colons, then commas, then
        # whitespace.
        for ch in splitchars:
            if ch in line:
                break
        else:
            # There's nothing useful to split the line on, not even spaces, so
            # just append this line unchanged
            lines.append(line)
            maxlen = restlen
            continue
        # Now split the line on the character plus trailing whitespace
        cre = re.compile(r'%s\s*' % ch)
        if ch in ';,':
            eol = ch
        else:
            eol = ''
        joiner = eol + ' '
        joinlen = len(joiner)
        wslen = len(continuation_ws.replace('\t', SPACE8))
        this = []
        linelen = 0
        for part in cre.split(line):
            curlen = linelen + max(0, len(this)-1) * joinlen
            partlen = len(part)
            onfirstline = not lines
            # We don't want to split after the field name, if we're on the
            # first line and the field name is present in the header string.
            if ch == ' ' and onfirstline and \
                   len(this) == 1 and fcre.match(this[0]):
                this.append(part)
                linelen += partlen
            elif curlen + partlen > maxlen:
                if this:
                    lines.append(joiner.join(this) + eol)
                # If this part is longer than maxlen and we aren't already
                # splitting on whitespace, try to recursively split this line
                # on whitespace.
                if partlen > maxlen and ch <> ' ':
                    subl = _split_ascii(part, maxlen, restlen,
                                        continuation_ws, ' ')
                    lines.extend(subl[:-1])
                    this = [subl[-1]]
                else:
                    this = [part]
                linelen = wslen + len(this[-1])
                maxlen = restlen
            else:
                this.append(part)
                linelen += partlen
        # Put any left over parts on a line by themselves
        if this:
            lines.append(joiner.join(this))
    return lines


\f
def _binsplit(splittable, charset, maxlinelen):
    i = 0
    j = len(splittable)
    while i < j:
        # Invariants:
        # 1. splittable[:k] fits for all k <= i (note that we *assume*,
        #    at the start, that splittable[:0] fits).
        # 2. splittable[:k] does not fit for any k > j (at the start,
        #    this means we shouldn't look at any k > len(splittable)).
        # 3. We don't know about splittable[:k] for k in i+1..j.
        # 4. We want to set i to the largest k that fits, with i <= k <= j.
        #
        m = (i+j+1) >> 1  # ceiling((i+j)/2); i < m <= j
        chunk = charset.from_splittable(splittable[:m], True)
        chunklen = charset.encoded_header_len(chunk)
        if chunklen <= maxlinelen:
            # m is acceptable, so is a new lower bound.
            i = m
        else:
            # m is not acceptable, so final i must be < m.
            j = m - 1
    # i == j.  Invariant #1 implies that splittable[:i] fits, and
    # invariant #2 implies that splittable[:i+1] does not fit, so i
    # is what we're looking for.
    first = charset.from_splittable(splittable[:i], False)
    last  = charset.from_splittable(splittable[i:], False)
    return first, last
Commit	Line	Data
920dae64 AT	1	# Copyright (C) 2002-2004 Python Software Foundation
	2	# Author: Ben Gertzfield, Barry Warsaw
	3	# Contact: email-sig@python.org
	4
	5	"""Header encoding and decoding functionality."""
	6
	7	import re
	8	import binascii
	9
	10	import email.quopriMIME
	11	import email.base64MIME
	12	from email.Errors import HeaderParseError
	13	from email.Charset import Charset
	14
	15	NL = '\n'
	16	SPACE = ' '
	17	USPACE = u' '
	18	SPACE8 = ' ' * 8
	19	UEMPTYSTRING = u''
	20
	21	MAXLINELEN = 76
	22
	23	USASCII = Charset('us-ascii')
	24	UTF8 = Charset('utf-8')
	25
	26	# Match encoded-word strings in the form =?charset?q?Hello_World?=
	27	ecre = re.compile(r'''
	28	=\? # literal =?
	29	(?P<charset>[^?]*?) # non-greedy up to the next ? is the charset
	30	\? # literal ?
	31	(?P<encoding>[qb]) # either a "q" or a "b", case insensitive
	32	\? # literal ?
	33	(?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string
	34	\?= # literal ?=
	35	''', re.VERBOSE \| re.IGNORECASE)
	36
	37	# Field name regexp, including trailing colon, but not separating whitespace,
	38	# according to RFC 2822. Character range is from tilde to exclamation mark.
	39	# For use with .match()
	40	fcre = re.compile(r'[\041-\176]+:$')
	41
	42
	43	\f
	44	# Helpers
	45	_max_append = email.quopriMIME._max_append
	46
	47
	48	\f
	49	def decode_header(header):
	50	"""Decode a message header value without converting charset.
	51
	52	Returns a list of (decoded_string, charset) pairs containing each of the
	53	decoded parts of the header. Charset is None for non-encoded parts of the
	54	header, otherwise a lower-case string containing the name of the character
	55	set specified in the encoded string.
	56
	57	An email.Errors.HeaderParseError may be raised when certain decoding error
	58	occurs (e.g. a base64 decoding exception).
	59	"""
	60	# If no encoding, just return the header
	61	header = str(header)
	62	if not ecre.search(header):
	63	return [(header, None)]
	64	decoded = []
65	dec = ''
66	for line in header.splitlines():
67	# This line might not have an encoding in it
68	if not ecre.search(line):
69	decoded.append((line, None))
70	continue
71	parts = ecre.split(line)
72	while parts:
73	unenc = parts.pop(0).strip()
74	if unenc:
75	# Should we continue a long line?
76	if decoded and decoded[-1][1] is None:
77	decoded[-1] = (decoded[-1][0] + SPACE + unenc, None)
78	else:
79	decoded.append((unenc, None))
80	if parts:
81	charset, encoding = [s.lower() for s in parts[0:2]]
82	encoded = parts[2]
83	dec = None
84	if encoding == 'q':
85	dec = email.quopriMIME.header_decode(encoded)
86	elif encoding == 'b':
87	try:
88	dec = email.base64MIME.decode(encoded)
89	except binascii.Error:
90	# Turn this into a higher level exception. BAW: Right
91	# now we throw the lower level exception away but
92	# when/if we get exception chaining, we'll preserve it.
93	raise HeaderParseError
94	if dec is None:
95	dec = encoded
96
97	if decoded and decoded[-1][1] == charset:
98	decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
99	else:
100	decoded.append((dec, charset))
101	del parts[0:3]
102	return decoded
103
104
105	\f
106	def make_header(decoded_seq, maxlinelen=None, header_name=None,
107	continuation_ws=' '):
108	"""Create a Header from a sequence of pairs as returned by decode_header()
109
110	decode_header() takes a header value string and returns a sequence of
111	pairs of the format (decoded_string, charset) where charset is the string
112	name of the character set.
113
114	This function takes one of those sequence of pairs and returns a Header
115	instance. Optional maxlinelen, header_name, and continuation_ws are as in
116	the Header constructor.
117	"""
118	h = Header(maxlinelen=maxlinelen, header_name=header_name,
119	continuation_ws=continuation_ws)
120	for s, charset in decoded_seq:
121	# None means us-ascii but we can simply pass it on to h.append()
122	if charset is not None and not isinstance(charset, Charset):
123	charset = Charset(charset)
124	h.append(s, charset)
125	return h
126
127
128	\f
129	class Header:
130	def __init__(self, s=None, charset=None,
131	maxlinelen=None, header_name=None,
132	continuation_ws=' ', errors='strict'):
133	"""Create a MIME-compliant header that can contain many character sets.
134
135	Optional s is the initial header value. If None, the initial header
136	value is not set. You can later append to the header with .append()
137	method calls. s may be a byte string or a Unicode string, but see the
138	.append() documentation for semantics.
139
140	Optional charset serves two purposes: it has the same meaning as the
141	charset argument to the .append() method. It also sets the default
142	character set for all subsequent .append() calls that omit the charset
143	argument. If charset is not provided in the constructor, the us-ascii
144	charset is used both as s's initial charset and as the default for
145	subsequent .append() calls.
146
147	The maximum line length can be specified explicit via maxlinelen. For
148	splitting the first line to a shorter value (to account for the field
149	header which isn't included in s, e.g. `Subject') pass in the name of
150	the field in header_name. The default maxlinelen is 76.
151
152	continuation_ws must be RFC 2822 compliant folding whitespace (usually
153	either a space or a hard tab) which will be prepended to continuation
154	lines.
155
156	errors is passed through to the .append() call.
157	"""
158	if charset is None:
159	charset = USASCII
160	if not isinstance(charset, Charset):
161	charset = Charset(charset)
162	self._charset = charset
163	self._continuation_ws = continuation_ws
164	cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
165	# BAW: I believe `chunks' and `maxlinelen' should be non-public.
166	self._chunks = []
167	if s is not None:
168	self.append(s, charset, errors)
169	if maxlinelen is None:
170	maxlinelen = MAXLINELEN
171	if header_name is None:
172	# We don't know anything about the field header so the first line
173	# is the same length as subsequent lines.
174	self._firstlinelen = maxlinelen
175	else:
176	# The first line should be shorter to take into account the field
177	# header. Also subtract off 2 extra for the colon and space.
178	self._firstlinelen = maxlinelen - len(header_name) - 2
179	# Second and subsequent lines should subtract off the length in
180	# columns of the continuation whitespace prefix.
181	self._maxlinelen = maxlinelen - cws_expanded_len
182
183	def __str__(self):
184	"""A synonym for self.encode()."""
185	return self.encode()
186
187	def __unicode__(self):
188	"""Helper for the built-in unicode function."""
189	uchunks = []
190	lastcs = None
191	for s, charset in self._chunks:
192	# We must preserve spaces between encoded and non-encoded word
193	# boundaries, which means for us we need to add a space when we go
194	# from a charset to None/us-ascii, or from None/us-ascii to a
195	# charset. Only do this for the second and subsequent chunks.
196	nextcs = charset
197	if uchunks:
198	if lastcs not in (None, 'us-ascii'):
199	if nextcs in (None, 'us-ascii'):
200	uchunks.append(USPACE)
201	nextcs = None
202	elif nextcs not in (None, 'us-ascii'):
203	uchunks.append(USPACE)
204	lastcs = nextcs
205	uchunks.append(unicode(s, str(charset)))
206	return UEMPTYSTRING.join(uchunks)
207
208	# Rich comparison operators for equality only. BAW: does it make sense to
209	# have or explicitly disable <, <=, >, >= operators?
210	def __eq__(self, other):
211	# other may be a Header or a string. Both are fine so coerce
212	# ourselves to a string, swap the args and do another comparison.
213	return other == self.encode()
214
215	def __ne__(self, other):
216	return not self == other
217
218	def append(self, s, charset=None, errors='strict'):
219	"""Append a string to the MIME header.
220
221	Optional charset, if given, should be a Charset instance or the name
222	of a character set (which will be converted to a Charset instance). A
223	value of None (the default) means that the charset given in the
224	constructor is used.
225
226	s may be a byte string or a Unicode string. If it is a byte string
227	(i.e. isinstance(s, str) is true), then charset is the encoding of
228	that byte string, and a UnicodeError will be raised if the string
229	cannot be decoded with that charset. If s is a Unicode string, then
230	charset is a hint specifying the character set of the characters in
231	the string. In this case, when producing an RFC 2822 compliant header
232	using RFC 2047 rules, the Unicode string will be encoded using the
233	following charsets in order: us-ascii, the charset hint, utf-8. The
234	first character set not to provoke a UnicodeError is used.
235
236	Optional `errors' is passed as the third argument to any unicode() or
237	ustr.encode() call.
238	"""
239	if charset is None:
240	charset = self._charset
241	elif not isinstance(charset, Charset):
242	charset = Charset(charset)
243	# If the charset is our faux 8bit charset, leave the string unchanged
244	if charset <> '8bit':
245	# We need to test that the string can be converted to unicode and
246	# back to a byte string, given the input and output codecs of the
247	# charset.
248	if isinstance(s, str):
249	# Possibly raise UnicodeError if the byte string can't be
250	# converted to a unicode with the input codec of the charset.
251	incodec = charset.input_codec or 'us-ascii'
252	ustr = unicode(s, incodec, errors)
253	# Now make sure that the unicode could be converted back to a
254	# byte string with the output codec, which may be different
255	# than the iput coded. Still, use the original byte string.
256	outcodec = charset.output_codec or 'us-ascii'
257	ustr.encode(outcodec, errors)
258	elif isinstance(s, unicode):
259	# Now we have to be sure the unicode string can be converted
260	# to a byte string with a reasonable output codec. We want to
261	# use the byte string in the chunk.
262	for charset in USASCII, charset, UTF8:
263	try:
264	outcodec = charset.output_codec or 'us-ascii'
265	s = s.encode(outcodec, errors)
266	break
267	except UnicodeError:
268	pass
269	else:
270	assert False, 'utf-8 conversion failed'
271	self._chunks.append((s, charset))
272
def _split(self, s, charset, maxlinelen, splitchars):
    # Break one (string, charset) chunk into a list of (string, charset)
    # pairs that _encode_chunks can lay out line by line.
    #
    # maxlinelen is the room available for the first piece only; every
    # later piece is measured against self._maxlinelen via the recursive
    # call at the bottom.
    pieces = charset.to_splittable(s)
    whole = charset.from_splittable(pieces, True)
    whole_len = charset.encoded_header_len(whole)

    # Everything fits in the space we were given: nothing to split.
    if whole_len <= maxlinelen:
        return [(whole, charset)]

    # Raw 8bit bytes of unknown encoding may be multibyte data, and cutting
    # between the bytes of one character would corrupt it.  Returning the
    # text unsplit is the least harmful option, even though the resulting
    # line can then be longer than maxlinelen.
    if charset == '8bit':
        return [(s, charset)]

    # RFC 2822 section 2.2.3 recommends folding at higher-level syntactic
    # breaks.  BAW: that is only attempted for us-ascii for now, although
    # other charsets might benefit from it as well.
    if charset == 'us-ascii':
        return self._split_ascii(s, charset, maxlinelen, splitchars)

    if whole_len == len(s):
        # Encoding does not change the size of the string, so we can cut
        # exactly at maxlinelen.  BAW: should we use the encoded form here?
        head = charset.from_splittable(pieces[:maxlinelen], False)
        tail = charset.from_splittable(pieces[maxlinelen:], False)
    else:
        # Encoding changes the length; search for the longest prefix whose
        # encoded form still fits.
        head, tail = _binsplit(pieces, charset, maxlinelen)

    # head already has an acceptable length, so it only needs converting to
    # its output form.  tail may still be too long and is split recursively.
    head_encoded = charset.from_splittable(charset.to_splittable(head), True)
    leading = [(head_encoded, charset)]
    return leading + self._split(tail, charset, self._maxlinelen, splitchars)
318
def _split_ascii(self, s, charset, firstlen, splitchars):
    # Fold an ASCII string with the module-level _split_ascii() helper,
    # using this header's line length and continuation whitespace, then
    # pair every resulting line with the same charset.
    restlen = self._maxlinelen
    folded = _split_ascii(s, firstlen, restlen,
                          self._continuation_ws, splitchars)
    charsets = [charset] * len(folded)
    return zip(folded, charsets)
323
def _encode_chunks(self, newchunks, maxlinelen):
    # Turn a list of (string, charset) pairs into the text of a header
    # field.
    #
    # Each pair is handled on its own, so neighbouring pairs may carry
    # different charsets and encodings.  A pair whose charset is None, or
    # whose charset has no header_encoding, is emitted as-is; any other
    # pair is passed through charset.header_encode(), giving encoded words
    # such as:
    #
    # =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
    # =?charset2?b?SvxyZ2VuIEL2aW5n?="
    #
    # Pairs with an empty string are skipped.  The accumulated lines are
    # joined with a newline followed by self._continuation_ws.
    lines = []
    for text, cset in newchunks:
        if not text:
            continue
        if cset is not None and cset.header_encoding is not None:
            piece = cset.header_encode(text)
        else:
            piece = text
        # Don't add more folding whitespace than necessary: only ask for a
        # separating space when the current line doesn't already end in one.
        if lines and lines[-1].endswith(' '):
            separator = ''
        else:
            separator = ' '
        # _max_append is given the line limit; presumably it decides whether
        # the piece extends the last line or starts a new one (it is defined
        # in email.quopriMIME, not here).
        _max_append(lines, piece, maxlinelen, separator)
    return (NL + self._continuation_ws).join(lines)
358
def encode(self, splitchars=';, '):
    """Return the header as a single encoded, line-wrapped string.

    Every (string, charset) chunk stored on this header is cut into pieces
    by self._split(), and the collected pieces are then rendered by
    self._encode_chunks().  The line length used throughout is
    self._firstlinelen.

    The first piece of each chunk is sized to fill whatever room is left
    after the last piece of the previous chunk, less one character for the
    space that separates them.  When that room is smaller than the length
    of an empty encoded word in the chunk's charset, the chunk is given a
    full line instead.

    Optional splitchars is a string of characters on which long ASCII
    lines may be broken, in rough support of RFC 2822's `highest level
    syntactic breaks'.  It is passed unchanged to self._split(), which
    only consults it for us-ascii chunks, so RFC 2047 encoded words are
    not affected by it.
    """
    linelen = self._firstlinelen
    pieces = []
    used = 0
    for text, cset in self._chunks:
        # Room left on the current line, less the separating space.
        room = linelen - used - 1
        if room < cset.encoded_header_len(''):
            # Not even an empty encoded word fits; start on a fresh line.
            room = linelen
        pieces.extend(self._split(text, cset, room, splitchars))
        # Remember how much of its line the most recent piece occupies.
        tail_text, tail_cset = pieces[-1]
        used = tail_cset.encoded_header_len(tail_text)
    return self._encode_chunks(pieces, linelen)
396
397
398	\f
def _split_ascii(s, firstlen, restlen, continuation_ws, splitchars):
    """Fold an ASCII header string into a list of shorter lines.

    s is the header text; it may already contain line breaks, in which
    case each existing line is processed separately and any leading
    whitespace on it is discarded.  firstlen is the length limit applied
    until the first output line has been produced; restlen is the limit
    used after that.  continuation_ws is only used to account for the width
    the caller will prepend to continuation lines (a tab is counted as
    eight columns); it is not added to the returned lines.  splitchars
    lists the candidate break characters in order of preference.

    For each over-long line, the first character of splitchars that occurs
    in the line is chosen and the line is broken at occurrences of that
    character (any whitespace following it is swallowed).  A non-whitespace
    break character is kept at the end of the part it terminates and is
    followed by a single space when parts are re-joined on one line; a
    whitespace break character is simply replaced by a single space.  A
    line containing none of the split characters is returned unchanged,
    even if it is too long.

    Returns the list of folded lines.
    """
    lines = []
    maxlen = firstlen
    for line in s.splitlines():
        # Ignore any leading whitespace (i.e. continuation whitespace)
        # already on the line, since the caller adds its own.
        line = line.lstrip()
        if len(line) < maxlen:
            lines.append(line)
            maxlen = restlen
            continue
        # Attempt to split the line at the highest-level syntactic break
        # possible.  We don't have a lot of smarts about field syntax; we
        # just take the first split character that occurs in the line
        # (by default semi-colons, then commas, then whitespace).
        for ch in splitchars:
            if ch in line:
                break
        else:
            # There's nothing useful to split the line on, not even spaces,
            # so just append this line unchanged.
            lines.append(line)
            maxlen = restlen
            continue
        # Split on the character plus trailing whitespace.  The character
        # comes from the caller-supplied splitchars, so it must be escaped:
        # interpolated raw, a regex metacharacter such as '.', '+' or '('
        # would either match the wrong text or fail to compile.
        cre = re.compile(r'%s\s*' % re.escape(ch))
        # cre.split() consumes the split character.  Put every
        # non-whitespace one back at the end of its part so it is not lost
        # from the header; whitespace is re-created by the joiner instead.
        if ch.isspace():
            eol = ''
        else:
            eol = ch
        joiner = eol + ' '
        joinlen = len(joiner)
        wslen = len(continuation_ws.replace('\t', SPACE8))
        this = []
        linelen = 0
        for part in cre.split(line):
            curlen = linelen + max(0, len(this) - 1) * joinlen
            partlen = len(part)
            onfirstline = not lines
            # We don't want to split after the field name, if we're on the
            # first line and the field name is present in the header string.
            if ch == ' ' and onfirstline and \
                   len(this) == 1 and fcre.match(this[0]):
                this.append(part)
                linelen += partlen
            elif curlen + partlen > maxlen:
                # The pending parts are full; flush them as one line.
                if this:
                    lines.append(joiner.join(this) + eol)
                # If this part is longer than maxlen and we aren't already
                # splitting on whitespace, try to recursively split this
                # part on whitespace.
                if partlen > maxlen and ch != ' ':
                    subl = _split_ascii(part, maxlen, restlen,
                                        continuation_ws, ' ')
                    lines.extend(subl[:-1])
                    this = [subl[-1]]
                else:
                    this = [part]
                # The new pending line is a continuation line, so it starts
                # after the continuation whitespace.
                linelen = wslen + len(this[-1])
                maxlen = restlen
            else:
                this.append(part)
                linelen += partlen
        # Put any left over parts on a line by themselves
        if this:
            lines.append(joiner.join(this))
    return lines
466
467
468	\f
def _binsplit(splittable, charset, maxlinelen):
    # Binary-search for the longest prefix of splittable whose encoded
    # header form is no longer than maxlinelen, and return the pair
    # (prefix, remainder), each converted with from_splittable(..., False).
    #
    # Loop invariants:
    #   1. splittable[:k] fits for all k <= lo (splittable[:0] is assumed
    #      to fit at the start).
    #   2. splittable[:k] does not fit for any k > hi (at the start this
    #      just means we never look past len(splittable)).
    #   3. Nothing is known yet about k in lo+1..hi.
    # The answer is the largest k that fits, with lo <= k <= hi.
    lo = 0
    hi = len(splittable)
    while lo < hi:
        # Round the midpoint up so that lo < mid <= hi; otherwise the loop
        # could stall when hi == lo + 1.
        mid = (lo + hi + 1) // 2
        candidate = charset.from_splittable(splittable[:mid], True)
        if charset.encoded_header_len(candidate) > maxlinelen:
            # mid is too long, so the answer must be below it.
            hi = mid - 1
        else:
            # mid fits, so it becomes the new lower bound.
            lo = mid
    # lo == hi here.  Invariant 1 says splittable[:lo] fits and invariant 2
    # says splittable[:lo+1] does not, so lo is the split point.
    first = charset.from_splittable(splittable[:lo], False)
    last = charset.from_splittable(splittable[lo:], False)
    return first, last