diff -r 096dfac57e44 Lib/email/feedparser.py --- a/Lib/email/feedparser.py Wed Sep 07 09:00:48 2016 -0700 +++ b/Lib/email/feedparser.py Wed Sep 07 13:05:25 2016 -0400 @@ -27,6 +27,7 @@ from email import message from email._policybase import compat32 from collections import deque +from io import StringIO NLCRE = re.compile('\r\n|\r|\n') NLCRE_bol = re.compile('(\r\n|\r|\n)') @@ -51,8 +52,8 @@ simple abstraction -- it parses until EOF closes the current message. """ def __init__(self): - # Chunks of the last partial line pushed into this object. - self._partial = [] + # Text stream of the last partial line pushed into this object. + self._partial = StringIO(newline='') # A deque of full, pushed lines self._lines = deque() # The stack of false-EOF checking predicates. @@ -68,8 +69,10 @@ def close(self): # Don't forget any trailing partial line. - self.pushlines(''.join(self._partial).splitlines(True)) - self._partial = [] + self._partial.seek(0) + self.pushlines(self._partial.readlines()) + self._partial.seek(0) + self._partial.truncate() self._closed = True def readline(self): @@ -97,26 +100,24 @@ def push(self, data): """Push some new data into this object.""" - # Crack into lines, but preserve the linesep characters on the end of each - parts = data.splitlines(True) - - if not parts or not parts[0].endswith(('\n', '\r')): + self._partial.write(data) + if '\n' not in data and '\r' not in data: # No new complete lines, so just accumulate partials - self._partial += parts + # This avoids quadratic copying when building a very long line return - if self._partial: - # If there are previous leftovers, complete them now - self._partial.append(parts[0]) - parts[0:1] = ''.join(self._partial).splitlines(True) - del self._partial[:] + # Crack into lines, but preserve the linesep characters on the end of each + self._partial.seek(0) + parts = self._partial.readlines() + self._partial.seek(0) + self._partial.truncate() # If the last element of the list does not end in a newline, then treat # it as a partial line. We only check for '\n' here because a line # ending with '\r' might be a line that was split in the middle of a # '\r\n' sequence (see bugs 1555570 and 1721862). if not parts[-1].endswith('\n'): - self._partial = [parts.pop()] + self._partial.write(parts.pop()) self.pushlines(parts) def pushlines(self, lines): diff -r 096dfac57e44 Lib/email/policy.py --- a/Lib/email/policy.py Wed Sep 07 09:00:48 2016 -0700 +++ b/Lib/email/policy.py Wed Sep 07 13:05:25 2016 -0400 @@ -2,6 +2,7 @@ code that adds all the email6 features. """ +import re from email._policybase import Policy, Compat32, compat32, _extend_docstrings from email.utils import _has_surrogates from email.headerregistry import HeaderRegistry as HeaderRegistry @@ -18,6 +19,8 @@ 'HTTP', ] +linesep_splitter = re.compile(r'\n|\r') + @_extend_docstrings class EmailPolicy(Policy): @@ -135,6 +138,8 @@ if hasattr(value, 'name') and value.name.lower() == name.lower(): return (name, value) if isinstance(value, str) and len(value.splitlines())>1: + # XXX this error message isn't quite right when we use splitlines + # (see issue 22233), but I'm not sure what should happen here. raise ValueError("Header values may not contain linefeed " "or carriage return characters") return (name, self.header_factory(name, value)) @@ -150,7 +155,9 @@ """ if hasattr(value, 'name'): return value - return self.header_factory(name, ''.join(value.splitlines())) + # We can't use splitlines here because it splits on more than + value = ''.join(linesep_splitter.split(value)) + return self.header_factory(name, value) def fold(self, name, value): """+ diff -r 096dfac57e44 Lib/test/test_email/test_email.py --- a/Lib/test/test_email/test_email.py Wed Sep 07 09:00:48 2016 -0700 +++ b/Lib/test/test_email/test_email.py Wed Sep 07 13:05:25 2016 -0400 @@ -3444,10 +3444,12 @@ self.assertEqual(m.keys(), ['a', 'b']) m = self.parse(['a:\r', '\nb:\n']) self.assertEqual(m.keys(), ['a', 'b']) + + # Only CR and LF should break header fields m = self.parse(['a:\x85b:\u2028c:\n']) - self.assertEqual(m.items(), [('a', '\x85'), ('b', '\u2028'), ('c', '')]) + self.assertEqual(m.items(), [('a', '\x85b:\u2028c:')]) m = self.parse(['a:\r', 'b:\x85', 'c:\n']) - self.assertEqual(m.items(), [('a', ''), ('b', '\x85'), ('c', '')]) + self.assertEqual(m.items(), [('a', ''), ('b', '\x85c:')]) def test_long_lines(self): # Expected peak memory use on 32-bit platform: 6*N*M bytes. diff -r 096dfac57e44 Lib/test/test_email/test_parser.py --- a/Lib/test/test_email/test_parser.py Wed Sep 07 09:00:48 2016 -0700 +++ b/Lib/test/test_email/test_parser.py Wed Sep 07 13:05:25 2016 -0400 @@ -2,6 +2,7 @@ import email import unittest from email.message import Message +from email.policy import default from test.test_email import TestEmailBase @@ -32,5 +33,45 @@ # XXX add tests for other functions that take Message arg. +class TestParserBase: + + def test_only_split_on_cr_lf(self): + # The unicode line splitter splits on unicode linebreaks, which are + # more numerous than allowed by the email RFCs; make sure we are only + # splitting on those two. + msg = self.parser( + "Next-Line: not\x85broken\r\n" + "Null: not\x00broken\r\n" + "Vertical-Tab: not\vbroken\r\n" + "Form-Feed: not\fbroken\r\n" + "File-Separator: not\x1Cbroken\r\n" + "Group-Separator: not\x1Dbroken\r\n" + "Record-Separator: not\x1Ebroken\r\n" + "Line-Separator: not\u2028broken\r\n" + "Paragraph-Separator: not\u2029broken\r\n" + "\r\n", + policy=default, + ) + self.assertEqual(msg.items(), [ + ("Next-Line", "not\x85broken"), + ("Null", "not\x00broken"), + ("Vertical-Tab", "not\vbroken"), + ("Form-Feed", "not\fbroken"), + ("File-Separator", "not\x1Cbroken"), + ("Group-Separator", "not\x1Dbroken"), + ("Record-Separator", "not\x1Ebroken"), + ("Line-Separator", "not\u2028broken"), + ("Paragraph-Separator", "not\u2029broken"), + ]) + self.assertEqual(msg.get_payload(), "") + +class TestParser(TestParserBase, TestEmailBase): + parser = staticmethod(email.message_from_string) + +class TestBytesParser(TestParserBase, TestEmailBase): + def parser(self, s, *args, **kw): + return email.message_from_bytes(s.encode(), *args, **kw) + + if __name__ == '__main__': unittest.main() diff -r 096dfac57e44 Lib/test/test_email/test_policy.py --- a/Lib/test/test_email/test_policy.py Wed Sep 07 09:00:48 2016 -0700 +++ b/Lib/test/test_email/test_policy.py Wed Sep 07 13:05:25 2016 -0400 @@ -334,5 +334,6 @@ 'From', 'spam\negg@foo.py') + if __name__ == '__main__': unittest.main() diff -r 096dfac57e44 Lib/test/test_httplib.py --- a/Lib/test/test_httplib.py Wed Sep 07 09:00:48 2016 -0700 +++ b/Lib/test/test_httplib.py Wed Sep 07 13:05:25 2016 -0400 @@ -283,6 +283,36 @@ self.assertEqual(resp.getheader('First'), 'val') self.assertEqual(resp.getheader('Second'), 'val') + def test_parse_all_octets(self): + # Ensure no valid header field octet breaks the parser + body = ( + b'HTTP/1.1 200 OK\r\n' + b"!#$%&'*+-.^_`|~: value\r\n" # Special token characters + b'VCHAR: ' + bytes(range(0x21, 0x7E + 1)) + b'\r\n' + b'obs-text: ' + bytes(range(0x80, 0xFF + 1)) + b'\r\n' + b'obs-fold: text\r\n' + b' folded with space\r\n' + b'\tfolded with tab\r\n' + b'Content-Length: 0\r\n' + b'\r\n' + ) + sock = FakeSocket(body) + resp = client.HTTPResponse(sock) + resp.begin() + self.assertEqual(resp.getheader('Content-Length'), '0') + self.assertEqual(resp.msg['Content-Length'], '0') + self.assertEqual(resp.getheader("!#$%&'*+-.^_`|~"), 'value') + self.assertEqual(resp.msg["!#$%&'*+-.^_`|~"], 'value') + vchar = ''.join(map(chr, range(0x21, 0x7E + 1))) + self.assertEqual(resp.getheader('VCHAR'), vchar) + self.assertEqual(resp.msg['VCHAR'], vchar) + self.assertIsNotNone(resp.getheader('obs-text')) + self.assertIn('obs-text', resp.msg) + for folded in (resp.getheader('obs-fold'), resp.msg['obs-fold']): + self.assertTrue(folded.startswith('text')) + self.assertIn(' folded with space', folded) + self.assertTrue(folded.endswith('folded with tab')) + def test_invalid_headers(self): conn = client.HTTPConnection('example.com') conn.sock = FakeSocket('') diff -r 096dfac57e44 Misc/NEWS --- a/Misc/NEWS Wed Sep 07 09:00:48 2016 -0700 +++ b/Misc/NEWS Wed Sep 07 13:05:25 2016 -0400 @@ -794,6 +794,11 @@ - Issue #25718: Fixed copying object with state with boolean value is false. +- Issue #22233: Change the email package's parser to only accept CR, LF, and + CRLF sequences as breaking header fields, rather than anything accepted by + str.splitlines(). This allows HTTP headers to be parsed when they include + the byte 0x85. + - Issue #10131: Fixed deep copying of minidom documents. Based on patch by Marian Ganisin.