diff -r e0f8bed0435c Lib/email/__init__.py --- a/Lib/email/__init__.py Tue Sep 21 14:28:43 2010 +0200 +++ b/Lib/email/__init__.py Thu Sep 30 22:28:31 2010 -0400 @@ -4,7 +4,7 @@ """A package for parsing, handling, and generating email messages.""" -__version__ = '5.0.0' +__version__ = '5.1.0' __all__ = [ 'base64mime', @@ -36,6 +36,14 @@ from email.parser import Parser return Parser(*args, **kws).parsestr(s) +def message_from_bytes(s, *args, **kws): + """Parse a bytes string into a Message object model. + + Optional _class and strict are passed to the Parser constructor. + """ + from email.parser import Parser + return Parser(*args, **kws).parsebytes(s) + def message_from_file(fp, *args, **kws): """Read a file and parse its contents into a Message object model. diff -r e0f8bed0435c Lib/email/message.py --- a/Lib/email/message.py Tue Sep 21 14:28:43 2010 +0200 +++ b/Lib/email/message.py Thu Sep 30 22:28:31 2010 -0400 @@ -24,8 +24,26 @@ # existence of which force quoting of the parameter value. tspecials = re.compile(r'[ \(\)<>@,;:\\"/\[\]\?=]') +# How to figure out if we are processing strings that come from a byte +# source with undecodable characters. +has_surrogates = re.compile( + '([^\ud800-\udbff]|\A)[\udc00-\udfff]([^\udc00-\udfff]|\Z)').search + # Helper functions +def _sanitize_surrogates(value): + # If the value contains surrogates, re-decode and replace the original + # non-ascii bytes with '?'s. Used to sanitize header values before letting + # them escape as strings. + if not isinstance(value, str): + # Header object + return value + if has_surrogates(value): + original_bytes = value.encode('ascii', 'surrogateescape') + return original_bytes.decode('ascii', 'replace').replace('�', '?') + else: + return value + def _splitparam(param): # Split header parameters. BAW: this may be too simple. It isn't # strictly RFC 2045 (section 5.1) compliant, but it catches most headers @@ -98,7 +116,7 @@ objects, otherwise it is a string. 
Message objects implement part of the `mapping' interface, which assumes - there is exactly one occurrance of the header per message. Some headers + there is exactly one occurrence of the header per message. Some headers do in fact appear multiple times (e.g. Received) and for those headers, you must use the explicit API to set or get all the headers. Not all of the mapping methods are implemented. @@ -184,44 +202,72 @@ If the message is a multipart and the decode flag is True, then None is returned. """ - if i is None: - payload = self._payload - elif not isinstance(self._payload, list): + # Here is the logic table for this code, based on the email5.0.0 code: + # i decode is_multipart result + # ------ ------ ------------ ------------------------------ + # None True True None + # i True True None + # None False True _payload (a list) + # i False True _payload element i (a Message) + # i False False error (not a list) + # i True False error (not a list) + # None False False _payload + # None True False _payload decoded (bytes) + # Note that Barry planned to factor out the 'decode' case, but that + # isn't so easy now that we handle the 8 bit data, which needs to be + # converted in both the decode and non-decode path. + if self.is_multipart(): + if decode: + return None + if i is None: + return self._payload + else: + return self._payload[i] + # For backward compatibility, Use isinstance and this error message + # instead of the more logical is_multipart test. + if i is not None and not isinstance(self._payload, list): raise TypeError('Expected list, got %s' % type(self._payload)) - else: - payload = self._payload[i] + payload = self._payload + cte = self.get('content-transfer-encoding', '').lower() + # payload can be bytes here, (I wonder if that is actually a bug?) 
+ if isinstance(payload, str): + if has_surrogates(payload): + bpayload = payload.encode('ascii', 'surrogateescape') + if not decode: + try: + payload = bpayload.decode(str(self.get_param('charset', 'ascii')), 'replace') + except LookupError: + payload = bpayload.decode('ascii', 'replace') + elif decode: + try: + bpayload = payload.encode('ascii') + except UnicodeError: + # This won't happen for RFC compliant messages (messages + # containing only ASCII codepoints in the unicode input). + # If it does happen, turn the string into bytes in a way + # guaranteed not to fail. + bpayload = payload.encode('raw-unicode-escape') if not decode: return payload - # Decoded payloads always return bytes. XXX split this part out into - # a new method called .get_decoded_payload(). - if self.is_multipart(): - return None - cte = self.get('content-transfer-encoding', '').lower() if cte == 'quoted-printable': - if isinstance(payload, str): - payload = payload.encode('ascii') - return utils._qdecode(payload) + return utils._qdecode(bpayload) elif cte == 'base64': try: - if isinstance(payload, str): - payload = payload.encode('ascii') - return base64.b64decode(payload) + return base64.b64decode(bpayload) except binascii.Error: # Incorrect padding - pass + return bpayload elif cte in ('x-uuencode', 'uuencode', 'uue', 'x-uue'): - in_file = BytesIO(payload.encode('ascii')) + in_file = BytesIO(bpayload) out_file = BytesIO() try: uu.decode(in_file, out_file, quiet=True) return out_file.getvalue() except uu.Error: # Some decoding problem - pass - # Is there a better way to do this? We can't use the bytes - # constructor. + return bpayload if isinstance(payload, str): - return payload.encode('raw-unicode-escape') + return bpayload return payload def set_payload(self, payload, charset=None): @@ -290,7 +336,7 @@ Return None if the header is missing instead of raising an exception. Note that if the header appeared multiple times, exactly which - occurrance gets returned is undefined. 
Use get_all() to get all + occurrence gets returned is undefined. Use get_all() to get all the values matching a header field name. """ return self.get(name) @@ -322,9 +368,6 @@ for field, value in self._headers: yield field - def __len__(self): - return len(self._headers) - def keys(self): """Return a list of all the message's header field names. @@ -343,7 +386,7 @@ Any fields deleted and re-inserted are always appended to the header list. """ - return [v for k, v in self._headers] + return [_sanitize_surrogates(v) for k, v in self._headers] def items(self): """Get all the message's header fields and values. @@ -353,6 +396,7 @@ Any fields deleted and re-inserted are always appended to the header list. """ + return [(k, _sanitize_surrogates(v)) for k, v in self._headers] return self._headers[:] def get(self, name, failobj=None): @@ -364,7 +408,7 @@ name = name.lower() for k, v in self._headers: if k.lower() == name: - return v + return _sanitize_surrogates(v) return failobj # @@ -384,7 +428,7 @@ name = name.lower() for k, v in self._headers: if k.lower() == name: - values.append(v) + values.append(_sanitize_surrogates(v)) if not values: return failobj return values diff -r e0f8bed0435c Lib/email/parser.py --- a/Lib/email/parser.py Tue Sep 21 14:28:43 2010 +0200 +++ b/Lib/email/parser.py Thu Sep 30 22:28:31 2010 -0400 @@ -71,6 +71,17 @@ feedparser.feed(data) return feedparser.close() + def parsebytes(self, text, headersonly=False): + """Create a message structure from a byte string. + + Returns the root of the message structure. Optional headersonly is a + flag specifying whether to stop parsing after reading the headers or + not. The default is False, meaning it parses the entire contents of + the file. + """ + text = text.decode('ASCII', errors='surrogateescape') + return self.parsestr(text, headersonly) + def parsestr(self, text, headersonly=False): """Create a message structure from a string. 
diff -r e0f8bed0435c Lib/email/test/test_email.py --- a/Lib/email/test/test_email.py Tue Sep 21 14:28:43 2010 +0200 +++ b/Lib/email/test/test_email.py Thu Sep 30 22:28:31 2010 -0400 @@ -9,6 +9,7 @@ import difflib import unittest import warnings +import textwrap from io import StringIO from itertools import chain @@ -2663,6 +2664,135 @@ self.assertTrue(msg.get_payload(0).get_payload().endswith('\r\n')) +class Test8BitBytesHandling(unittest.TestCase): + # In Python3 all input is string, but that doesn't work if the actual input + # uses an 8bit transfer encoding. To hack around that, in email 5.1 we + # decode byte streams using the surrogateescape error handler, and + # reconvert to binary at appropriate places if we detect surrogates. This + # doesn't allow us to transform headers with 8bit bytes (they get munged), + # but it does allow us to parse and preserve them, and to decode body + # parts that use an 8bit CTE. + + bodytest_msg = textwrap.dedent("""\ + From: foo@bar.com + To: baz + Mime-Version: 1.0 + Content-Type: text/plain; charset={charset} + Content-Transfer-Encoding: {cte} + + {bodyline} + """) + + def test_known_8bit_CTE(self): + m = self.bodytest_msg.format(charset='utf-8', + cte='8bit', + bodyline='pöstal').encode('utf-8') + msg = email.message_from_bytes(m) + self.assertEqual(msg.get_payload(), "pöstal\n") + + def test_unknown_8bit_CTE(self): + m = self.bodytest_msg.format(charset='notavalidcharset', + cte='8bit', + bodyline='pöstal').encode('utf-8') + msg = email.message_from_bytes(m) + self.assertEqual(msg.get_payload(), "p��stal\n") + + def test_8bit_in_quopri_body(self): + # This is non-RFC compliant data...without 'decode' the library code + # decodes the body using the charset from the headers, and because the + # source byte really is utf-8 this works. This is likely to fail + # against real dirty data (ie: produce mojibake), but the data is + # invalid anyway so it is as good a guess as any. 
But this means that + # this test just confirms the current behavior; that behavior is not + # necessarily the best possible behavior. With 'decode' it is + # returning the raw bytes, so that test should be of correct behavior, + # or at least produce the same result that email4 did. + m = self.bodytest_msg.format(charset='utf-8', + cte='quoted-printable', + bodyline='p=C3=B6stál').encode('utf-8') + msg = email.message_from_bytes(m) + self.assertEqual(msg.get_payload(), 'p=C3=B6stál\n') + self.assertEqual(msg.get_payload(decode=True), + 'pöstál\n'.encode('utf-8')) + + def test_invalid_8bit_in_non_8bit_cte_uses_replace(self): + # This is similar to the previous test, but proves that if the 8bit + # byte is undecodable in the specified charset, it gets replaced + # by the unicode 'unknown' character. Again, this may or may not + # be the ideal behavior. Note that if decode=False none of the + # decoders will get involved, so this is the only test we need + # for this behavior. + m = self.bodytest_msg.format(charset='ascii', + cte='quoted-printable', + bodyline='p=C3=B6stál').encode('utf-8') + msg = email.message_from_bytes(m) + self.assertEqual(msg.get_payload(), 'p=C3=B6st��l\n') + + def test_8bit_in_base64_body(self): + # Sticking an 8bit byte in a base64 block makes it undecodable by + # normal means, so the block is returned undecoded, but as bytes. + m = self.bodytest_msg.format(charset='utf-8', + cte='base64', + bodyline='cMO2c3RhbAá=').encode('utf-8') + msg = email.message_from_bytes(m) + self.assertEqual(msg.get_payload(decode=True), + 'cMO2c3RhbAá=\n'.encode('utf-8')) + + def test_8bit_in_uuencode_body(self): + # Sticking an 8bit byte in a uuencode block makes it undecodable by + # normal means, so the block is returned undecoded, but as bytes. + m = self.bodytest_msg.format(charset='utf-8', + cte='uuencode', + bodyline='<,.V