# Patch for Lib/email/header.py and its tests: whitespace handling around
# RFC 2047 encoded words.
#
# * decode_header(): the encoded-word regexp no longer requires trailing
#   whitespace (or end of string) after "?=", so encoded words embedded in
#   other text are recognised.  Unencoded chunks keep their surrounding
#   whitespace (only the first chunk of each line is left-stripped), and a
#   whitespace-only chunk sitting between two encoded words is dropped.
# * When chunks are joined into a string, and in encode(), a separating
#   space / formatter transition at a charset <-> ASCII boundary is only
#   inserted when the ASCII chunk does not already supply a character for
#   which the new _nonctext() helper returns True (whitespace, '(', ')'
#   or a backslash).
# * Tests: expectations are updated for the preserved whitespace, the
#   "whitespace_eater" tests are renamed to "whitespace_keeper", and the
#   RFC 2047 example tests are no longer marked as expected failures.
diff -r 3609d32cec46 Lib/email/header.py
--- a/Lib/email/header.py	Tue Jan 03 17:48:19 2012 +0100
+++ b/Lib/email/header.py	Tue Jan 03 22:19:30 2012 +0100
@@ -40,7 +40,6 @@
   \?                    # literal ?
   (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
   \?=                   # literal ?=
-  (?=[ \t]|$)           # whitespace or the end of the string
   ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)
 
 # Field name regexp, including trailing colon, but not separating whitespace,
@@ -86,8 +85,12 @@
     words = []
     for line in header.splitlines():
         parts = ecre.split(line)
+        first = True
         while parts:
-            unencoded = parts.pop(0).strip()
+            unencoded = parts.pop(0)
+            if first:
+                unencoded = unencoded.lstrip()
+                first = False
             if unencoded:
                 words.append((unencoded, None, None))
             if parts:
@@ -95,6 +98,15 @@
                 encoding = parts.pop(0).lower()
                 encoded = parts.pop(0)
                 words.append((encoded, encoding, charset))
+    # Now loop over words and remove words that consist of whitespace
+    # between two encoded strings.
+    droplist = []
+    for n, w in enumerate(words):
+        if n>1 and w[1] and words[n-2][1] and words[n-1][0].isspace():
+            droplist.append(n-1)
+    for d in reversed(droplist):
+        del words[d]
+
     # The next step is to decode each encoded word by applying the reverse
     # base64 or quopri transformation.  decoded_words is now a list of the
     # form (decoded_word, charset).
@@ -217,22 +229,27 @@
         self._normalize()
         uchunks = []
         lastcs = None
+        lastspace = None
         for string, charset in self._chunks:
             # We must preserve spaces between encoded and non-encoded word
             # boundaries, which means for us we need to add a space when we go
             # from a charset to None/us-ascii, or from None/us-ascii to a
             # charset.  Only do this for the second and subsequent chunks.
+            # Don't add a space if the None/us-ascii string already has
+            # a space (trailing or leading depending on transition)
             nextcs = charset
             if nextcs == _charset.UNKNOWN8BIT:
                 original_bytes = string.encode('ascii', 'surrogateescape')
                 string = original_bytes.decode('ascii', 'replace')
             if uchunks:
+                hasspace = string and self._nonctext(string[0])
                 if lastcs not in (None, 'us-ascii'):
-                    if nextcs in (None, 'us-ascii'):
+                    if nextcs in (None, 'us-ascii') and not hasspace:
                         uchunks.append(SPACE)
                         nextcs = None
-                elif nextcs not in (None, 'us-ascii'):
+                elif nextcs not in (None, 'us-ascii') and not lastspace:
                     uchunks.append(SPACE)
+            lastspace = string and self._nonctext(string[-1])
             lastcs = nextcs
             uchunks.append(string)
         return EMPTYSTRING.join(uchunks)
@@ -286,6 +303,11 @@
             s.encode(output_charset, errors)
         self._chunks.append((s, charset))
 
+    def _nonctext(self, s):
+        """True if string s is not a ctext character of RFC822.
+        """
+        return s.isspace() or s in ('(', ')', '\\')
+
     def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
         r"""Encode a message header into an RFC-compliant format.
 
@@ -329,7 +351,19 @@
             maxlinelen = 1000000
         formatter = _ValueFormatter(self._headerlen, maxlinelen,
                                     self._continuation_ws, splitchars)
+        lastcs = None
+        hasspace = lastspace = None
         for string, charset in self._chunks:
+            if hasspace is not None:
+                hasspace = string and self._nonctext(string[0])
+                if lastcs not in (None, 'us-ascii'):
+                    if not hasspace or charset not in (None, 'us-ascii'):
+                        formatter.add_transition()
+                elif charset not in (None, 'us-ascii') and not lastspace:
+                    formatter.add_transition()
+            lastspace = string and self._nonctext(string[-1])
+            lastcs = charset
+            hasspace = False
             lines = string.splitlines()
             if lines:
                 formatter.feed('', lines[0], charset)
@@ -346,6 +380,7 @@
                     formatter.feed(fws, sline, charset)
             if len(lines) > 1:
                 formatter.newline()
+        if self._chunks:
             formatter.add_transition()
         value = formatter._str(linesep)
         if _embeded_header.search(value):
diff -r 3609d32cec46 Lib/test/test_email/test_asian_codecs.py
--- a/Lib/test/test_email/test_asian_codecs.py	Tue Jan 03 17:48:19 2012 +0100
+++ b/Lib/test/test_email/test_asian_codecs.py	Tue Jan 03 22:19:30 2012 +0100
@@ -41,7 +41,7 @@
 Hello World! =?iso-2022-jp?b?GyRCJU8lbSE8JW8hPCVrJUkhKhsoQg==?=
  =?iso-8859-1?q?Gr=FC=DF_Gott!?=""")
         eq(decode_header(h.encode()),
-           [(b'Hello World!', None),
+           [(b'Hello World! ', None),
             (b'\x1b$B%O%m!<%o!<%k%I!*\x1b(B', 'iso-2022-jp'),
             (b'Gr\xfc\xdf Gott!', gcode)])
         subject_bytes = (b'test-ja \xa4\xd8\xc5\xea\xb9\xc6\xa4\xb5'
diff -r 3609d32cec46 Lib/test/test_email/test_email.py
--- a/Lib/test/test_email/test_email.py	Tue Jan 03 17:48:19 2012 +0100
+++ b/Lib/test/test_email/test_email.py	Tue Jan 03 22:19:30 2012 +0100
@@ -1999,9 +1999,9 @@
  foo bar =?mac-iceland?q?r=8Aksm=9Arg=8Cs?="""
         dh = decode_header(s)
         eq(dh, [
-            (b'Re:', None),
+            (b'Re: ', None),
             (b'r\x8aksm\x9arg\x8cs', 'mac-iceland'),
-            (b'baz foo bar', None),
+            (b' baz foo bar ', None),
             (b'r\x8aksm\x9arg\x8cs', 'mac-iceland')])
         header = make_header(dh)
         eq(str(header),
@@ -2010,36 +2010,38 @@
 Re: =?mac-iceland?q?r=8Aksm=9Arg=8Cs?= baz foo bar =?mac-iceland?q?r=8Aksm?=
  =?mac-iceland?q?=9Arg=8Cs?=""")
 
-    def test_whitespace_eater_unicode(self):
+    def test_whitespace_keeper_unicode(self):
         eq = self.assertEqual
         s = '=?ISO-8859-1?Q?Andr=E9?= Pirard <pirard@dom.ain>'
         dh = decode_header(s)
         eq(dh, [(b'Andr\xe9', 'iso-8859-1'),
-                (b'Pirard <pirard@dom.ain>', None)])
+                (b' Pirard <pirard@dom.ain>', None)])
         header = str(make_header(dh))
         eq(header, 'Andr\xe9 Pirard <pirard@dom.ain>')
 
-    def test_whitespace_eater_unicode_2(self):
+    def test_whitespace_keeper_unicode_2(self):
         eq = self.assertEqual
         s = 'The =?iso-8859-1?b?cXVpY2sgYnJvd24gZm94?= jumped over the =?iso-8859-1?b?bGF6eSBkb2c=?='
         dh = decode_header(s)
-        eq(dh, [(b'The', None), (b'quick brown fox', 'iso-8859-1'),
-                (b'jumped over the', None), (b'lazy dog', 'iso-8859-1')])
+        eq(dh, [(b'The ', None), (b'quick brown fox', 'iso-8859-1'),
+                (b' jumped over the ', None), (b'lazy dog', 'iso-8859-1')])
         hu = str(make_header(dh))
         eq(hu, 'The quick brown fox jumped over the lazy dog')
 
     def test_rfc2047_missing_whitespace(self):
         s = 'Sm=?ISO-8859-1?B?9g==?=rg=?ISO-8859-1?B?5Q==?=sbord'
         dh = decode_header(s)
-        self.assertEqual(dh, [(s, None)])
-
-    def test_rfc2047_with_whitespace(self):
-        s = 'Sm =?ISO-8859-1?B?9g==?= rg =?ISO-8859-1?B?5Q==?= sbord'
-        dh = decode_header(s)
         self.assertEqual(dh, [(b'Sm', None), (b'\xf6', 'iso-8859-1'),
                               (b'rg', None), (b'\xe5', 'iso-8859-1'),
                               (b'sbord', None)])
 
+    def test_rfc2047_with_whitespace(self):
+        s = 'Sm =?ISO-8859-1?B?9g==?= rg =?ISO-8859-1?B?5Q==?= sbord'
+        dh = decode_header(s)
+        self.assertEqual(dh, [(b'Sm ', None), (b'\xf6', 'iso-8859-1'),
+                              (b' rg ', None), (b'\xe5', 'iso-8859-1'),
+                              (b' sbord', None)])
+
     def test_rfc2047_B_bad_padding(self):
         s = '=?iso-8859-1?B?%s?='
         data = [                                # only test complete bytes
@@ -2056,62 +2058,56 @@
         self.assertEqual(decode_header(s),
                         [(b'andr\xe9=zz', 'iso-8659-1')])
 
-    @unittest.expectedFailure
     def test_rfc2047_rfc2047_1(self):
         # 1st testcase at end of rfc2047
         s = '(=?ISO-8859-1?Q?a?=)'
         self.assertEqual(decode_header(s),
             [(b'(', None), (b'a', 'iso-8859-1'), (b')', None)])
 
-    @unittest.expectedFailure
     def test_rfc2047_rfc2047_2(self):
         # 2nd testcase at end of rfc2047
         s = '(=?ISO-8859-1?Q?a?= b)'
         self.assertEqual(decode_header(s),
             [(b'(', None), (b'a', 'iso-8859-1'), (b' b)', None)])
 
-    @unittest.expectedFailure
     def test_rfc2047_rfc2047_3(self):
         # 3rd testcase at end of rfc2047
         s = '(=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=)'
         self.assertEqual(decode_header(s),
             [(b'(', None), (b'ab', 'iso-8859-1'), (b')', None)])
 
-    @unittest.expectedFailure
     def test_rfc2047_rfc2047_4(self):
         # 4th testcase at end of rfc2047
         s = '(=?ISO-8859-1?Q?a?=  =?ISO-8859-1?Q?b?=)'
         self.assertEqual(decode_header(s),
             [(b'(', None), (b'ab', 'iso-8859-1'), (b')', None)])
 
-    @unittest.expectedFailure
     def test_rfc2047_rfc2047_5a(self):
         # 5th testcase at end of rfc2047 newline is \r\n
         s = '(=?ISO-8859-1?Q?a?=\r\n    =?ISO-8859-1?Q?b?=)'
         self.assertEqual(decode_header(s),
             [(b'(', None), (b'ab', 'iso-8859-1'), (b')', None)])
 
-    @unittest.expectedFailure
     def test_rfc2047_rfc2047_5b(self):
         # 5th testcase at end of rfc2047 newline is \n
         s = '(=?ISO-8859-1?Q?a?=\n    =?ISO-8859-1?Q?b?=)'
         self.assertEqual(decode_header(s),
             [(b'(', None), (b'ab', 'iso-8859-1'), (b')', None)])
 
-    @unittest.expectedFailure
     def test_rfc2047_rfc2047_6(self):
         # 6th testcase at end of rfc2047
         s = '(=?ISO-8859-1?Q?a_b?=)'
         self.assertEqual(decode_header(s),
             [(b'(', None), (b'a b', 'iso-8859-1'), (b')', None)])
 
-    @unittest.expectedFailure
     def test_rfc2047_rfc2047_7(self):
         # 7th testcase at end of rfc2047
         s = '(=?ISO-8859-1?Q?a?= =?ISO-8859-2?Q?_b?=)'
         self.assertEqual(decode_header(s),
             [(b'(', None), (b'a', 'iso-8859-1'), (b' b', 'iso-8859-2'),
              (b')', None)])
+        self.assertEqual(make_header(decode_header(s)).encode(), s.lower())
+        self.assertEqual(str(make_header(decode_header(s))), '(a b)')
 
 
 # Test the MIMEMessage class
@@ -4463,11 +4459,11 @@
         h = make_header(decode_header(s))
         eq(h.encode(), s)
 
-    def test_whitespace_eater(self):
+    def test_whitespace_keeper(self):
         eq = self.assertEqual
         s = 'Subject: =?koi8-r?b?8NLP18XSy8EgzsEgxsnOwczYztk=?= =?koi8-r?q?=CA?= zz.'
         parts = decode_header(s)
-        eq(parts, [(b'Subject:', None), (b'\xf0\xd2\xcf\xd7\xc5\xd2\xcb\xc1 \xce\xc1 \xc6\xc9\xce\xc1\xcc\xd8\xce\xd9\xca', 'koi8-r'), (b'zz.', None)])
+        eq(parts, [(b'Subject: ', None), (b'\xf0\xd2\xcf\xd7\xc5\xd2\xcb\xc1 \xce\xc1 \xc6\xc9\xce\xc1\xcc\xd8\xce\xd9\xca', 'koi8-r'), (b' zz.', None)])
         hdr = make_header(parts)
         eq(hdr.encode(),
            'Subject: =?koi8-r?b?8NLP18XSy8EgzsEgxsnOwczYztnK?= zz.')