diff -r e0f8bed0435c Lib/email/__init__.py --- a/Lib/email/__init__.py Tue Sep 21 14:28:43 2010 +0200 +++ b/Lib/email/__init__.py Tue Sep 21 18:03:27 2010 -0400 @@ -36,6 +36,14 @@ from email.parser import Parser return Parser(*args, **kws).parsestr(s) +def message_from_bytes(s, *args, **kws): + """Parse a bytes string into a Message object model. + + Optional _class and strict are passed to the Parser constructor. + """ + from email.parser import Parser + return Parser(*args, **kws).parsebytes(s) + def message_from_file(fp, *args, **kws): """Read a file and parse its contents into a Message object model. diff -r e0f8bed0435c Lib/email/message.py --- a/Lib/email/message.py Tue Sep 21 14:28:43 2010 +0200 +++ b/Lib/email/message.py Tue Sep 21 18:03:27 2010 -0400 @@ -24,6 +24,11 @@ # existence of which force quoting of the parameter value. tspecials = re.compile(r'[ \(\)<>@,;:\\"/\[\]\?=]') +# How to figure out if we are processing strings that come from a byte +# source with undecodable characters. +has_surrogates = re.compile( + r'([^\ud800-\udbff]|\A)[\udc00-\udfff]([^\udc00-\udfff]|\Z)').search + # Helper functions def _splitparam(param): @@ -184,44 +189,71 @@ If the message is a multipart and the decode flag is True, then None is returned. """ - if i is None: - payload = self._payload - elif not isinstance(self._payload, list): + # Here is the logic table for this code, based on the email5.0.0 code: + # i decode is_multipart result + # ------ ------ ------------ ------------------------------ + # None True True None + # i True True None + # None False True _payload (a list) + # i False True _payload element i (a Message) + # i False False error (not a list) + # i True False error (not a list) + # None False False _payload + # None True False _payload decoded (bytes) + # Note that Barry planned to factor out the 'decode' case, but that + # isn't so easy now that we handle the 8 bit data, which needs to be + # converted in both the decode and non-decode path. 
+ if self.is_multipart(): + if decode: + return None + if i is None: + return self._payload + else: + return self._payload[i] + # For backward compatibility, use isinstance and this error message + # instead of the more logical is_multipart test. + if i is not None and not isinstance(self._payload, list): raise TypeError('Expected list, got %s' % type(self._payload)) - else: - payload = self._payload[i] + payload = self._payload + cte = self.get('content-transfer-encoding', '').lower() + # payload can be bytes here (possible bug: bpayload is then unbound) + if isinstance(payload, str): + if has_surrogates(payload): + bpayload = payload.encode('ascii', 'surrogateescape') + if not decode: + try: + payload = bpayload.decode(str(self.get_param('charset', 'ascii')), 'replace') + except LookupError: + payload = bpayload.decode('ascii', 'replace') + elif decode: + try: + bpayload = payload.encode('ascii') + except UnicodeError: + # This won't happen for RFC compliant messages. + # If it does happen, turn the string into bytes in + # a way guaranteed not to fail. + bpayload = payload.encode('raw-unicode-escape') if not decode: return payload - # Decoded payloads always return bytes. XXX split this part out into - # a new method called .get_decoded_payload(). 
- if self.is_multipart(): - return None - cte = self.get('content-transfer-encoding', '').lower() if cte == 'quoted-printable': - if isinstance(payload, str): - payload = payload.encode('ascii') - return utils._qdecode(payload) + return utils._qdecode(bpayload) elif cte == 'base64': try: - if isinstance(payload, str): - payload = payload.encode('ascii') - return base64.b64decode(payload) + return base64.b64decode(bpayload) except binascii.Error: # Incorrect padding - pass + return bpayload elif cte in ('x-uuencode', 'uuencode', 'uue', 'x-uue'): - in_file = BytesIO(payload.encode('ascii')) + in_file = BytesIO(bpayload) out_file = BytesIO() try: uu.decode(in_file, out_file, quiet=True) return out_file.getvalue() except uu.Error: # Some decoding problem - pass - # Is there a better way to do this? We can't use the bytes - # constructor. + return bpayload if isinstance(payload, str): - return payload.encode('raw-unicode-escape') + return bpayload return payload def set_payload(self, payload, charset=None): diff -r e0f8bed0435c Lib/email/parser.py --- a/Lib/email/parser.py Tue Sep 21 14:28:43 2010 +0200 +++ b/Lib/email/parser.py Tue Sep 21 18:03:27 2010 -0400 @@ -71,6 +71,17 @@ feedparser.feed(data) return feedparser.close() + def parsebytes(self, text, headersonly=False): + """Create a message structure from a byte string. + + Returns the root of the message structure. Optional headersonly is a + flag specifying whether to stop parsing after reading the headers or + not. The default is False, meaning it parses the entire contents of + the byte string. + """ + text = text.decode('ASCII', errors='surrogateescape') + return self.parsestr(text, headersonly) + def parsestr(self, text, headersonly=False): """Create a message structure from a string. 
diff -r e0f8bed0435c Lib/email/test/test_email.py --- a/Lib/email/test/test_email.py Tue Sep 21 14:28:43 2010 +0200 +++ b/Lib/email/test/test_email.py Tue Sep 21 18:03:27 2010 -0400 @@ -2662,6 +2662,25 @@ msg = email.message_from_string(m) self.assertTrue(msg.get_payload(0).get_payload().endswith('\r\n')) + def test_8bit_transfer_encoding(self): + # In Python3 all input is string, but that doesn't work if the + # actual input uses an 8bit transfer encoding. To hack around that + # we decode byte streams using the surrogateescape error handler, + # and reconvert to binary before decoding the part. This test makes + # sure 8bit data actually survives the trip. + m = (b"""\ +From: foo@bar.com +To: baz +Mime-Version: 1.0 +Content-Type: text/plain; charset=utf-8 +Content-Transfer-Encoding: 8bit + +p\xc3\xb6stal +""") + msg = email.message_from_bytes(m) + self.assertEqual(msg.get_payload(), "pöstal\n") + + class TestBase64(unittest.TestCase): def test_len(self):