diff -r 29ba76f5c3dc Lib/email/_header_value_parser.py --- a/Lib/email/_header_value_parser.py Sat May 16 15:41:07 2015 -0400 +++ b/Lib/email/_header_value_parser.py Sat May 16 17:17:01 2015 -0400 @@ -320,17 +320,18 @@ return ''.join(res) def _fold(self, folded): + encoding = 'utf-8' if folded.policy.utf8 else 'ascii' for part in self.parts: tstr = str(part) tlen = len(tstr) try: - str(part).encode('us-ascii') + str(part).encode(encoding) except UnicodeEncodeError: if any(isinstance(x, errors.UndecodableBytesDefect) for x in part.all_defects): charset = 'unknown-8bit' else: - # XXX: this should be a policy setting + # XXX: this should be a policy setting when utf8 is False. charset = 'utf-8' tstr = part.cte_encode(charset, folded.policy) tlen = len(tstr) @@ -394,11 +395,12 @@ def _fold(self, folded): last_ew = None + encoding = 'utf-8' if folded.policy.utf8 else 'ascii' for part in self.parts: tstr = str(part) is_ew = False try: - str(part).encode('us-ascii') + str(part).encode(encoding) except UnicodeEncodeError: if any(isinstance(x, errors.UndecodableBytesDefect) for x in part.all_defects): @@ -409,7 +411,8 @@ # We've already done an EW, combine this one with it # if there's room. chunk = get_unstructured( - ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset) + ''.join(folded.current[last_ew:]+[tstr]) + ).as_encoded_word(charset) oldlastlen = sum(len(x) for x in folded.current[:last_ew]) schunk = str(chunk) lchunk = len(schunk) @@ -475,12 +478,14 @@ # comment that becomes a barrier across which we can't compose encoded # words. last_ew = None + encoding = 'utf-8' if folded.policy.utf8 else 'ascii' for part in self.parts: tstr = str(part) tlen = len(tstr) has_ew = False try: - str(part).encode('us-ascii') + raise Exception("Found a phrase") + str(part).encode(encoding) except UnicodeEncodeError: if any(isinstance(x, errors.UndecodableBytesDefect) for x in part.all_defects): diff -r 29ba76f5c3dc Lib/email/policy.py --- a/Lib/email/policy.py Sat May 16 15:41:07 2015 -0400 +++ b/Lib/email/policy.py Sat May 16 17:17:01 2015 -0400 @@ -35,6 +35,13 @@ In addition to the settable attributes listed above that apply to all Policies, this policy adds the following additional attributes: + utf8 -- if False (the default) message headers will be + serialized as ASCII, using encoded words to encode + any non-ASCII characters in the source strings. If + True, the message headers will be serialized using + utf8 and will not contain encoded words (see RFC + 6532 for more on this serialization format). + refold_source -- if the value for a header in the Message object came from the parsing of some source, this attribute indicates whether or not a generator should refold @@ -72,6 +79,7 @@ """ + utf8 = False refold_source = 'long' header_factory = HeaderRegistry() content_manager = raw_data_manager @@ -175,9 +183,13 @@ refold_header setting, since there is no way to know whether the binary data consists of single byte characters or multibyte characters. + If utf8 is true, headers are encoded to utf8, otherwise to ascii with + non-ASCII unicode rendered as encoded words. + """ folded = self._fold(name, value, refold_binary=self.cte_type=='7bit') - return folded.encode('ascii', 'surrogateescape') + charset = 'utf8' if self.utf8 else 'ascii' + return folded.encode(charset, 'surrogateescape') def _fold(self, name, value, refold_binary=False): if hasattr(value, 'name'): @@ -199,3 +211,4 @@ strict = default.clone(raise_on_defect=True) SMTP = default.clone(linesep='\r\n') HTTP = default.clone(linesep='\r\n', max_line_length=None) +SMTPUTF8 = SMTP.clone(utf8=True) diff -r 29ba76f5c3dc Lib/test/test_email/test_generator.py --- a/Lib/test/test_email/test_generator.py Sat May 16 15:41:07 2015 -0400 +++ b/Lib/test/test_email/test_generator.py Sat May 16 17:17:01 2015 -0400 @@ -2,6 +2,7 @@ import textwrap import unittest from email import message_from_string, message_from_bytes +from email.message import EmailMessage from email.generator import Generator, BytesGenerator from email import policy from test.test_email import TestEmailBase, parameterize @@ -194,6 +195,27 @@ g.flatten(msg) self.assertEqual(s.getvalue(), expected) + def test_smtputf8_policy(self): + msg = EmailMessage() + msg['From'] = "Páolo " + msg['To'] = 'Dinsdale' + msg['Subject'] = 'Nudge nudge, wink, wink \u1F609' + msg.set_content("oh là là, know what I mean, know what I mean?") + expected = textwrap.dedent("""\ + From: Páolo + To: Dinsdale + Subject: Nudge nudge, wink, wink \u1F609 + Content-Type: text/plain; charset="utf-8" + Content-Transfer-Encoding: 8bit + MIME-Version: 1.0 + + oh là là, know what I mean, know what I mean? + """).encode('utf-8').replace(b'\n', b'\r\n') + s = io.BytesIO() + g = BytesGenerator(s, policy=policy.SMTPUTF8) + g.flatten(msg) + self.assertEqual(s.getvalue(), expected) + if __name__ == '__main__': unittest.main() diff -r 29ba76f5c3dc Lib/test/test_email/test_policy.py --- a/Lib/test/test_email/test_policy.py Sat May 16 15:41:07 2015 -0400 +++ b/Lib/test/test_email/test_policy.py Sat May 16 17:17:01 2015 -0400 @@ -27,6 +27,7 @@ # If any of these defaults change, the docs must be updated. policy_defaults = compat32_defaults.copy() policy_defaults.update({ + 'utf8': False, 'raise_on_defect': False, 'header_factory': email.policy.EmailPolicy.header_factory, 'refold_source': 'long', @@ -42,6 +43,9 @@ email.policy.default: make_defaults(policy_defaults, {}), email.policy.SMTP: make_defaults(policy_defaults, {'linesep': '\r\n'}), + email.policy.SMTPUTF8: make_defaults(policy_defaults, + {'linesep': '\r\n', + 'utf8': True}), email.policy.HTTP: make_defaults(policy_defaults, {'linesep': '\r\n', 'max_line_length': None}),