# HG changeset patch # Parent 60085c8f01fe4eb19a1c38a2d27fd77698b5a5ec Issue #24363: Add policy flag to avoid parsing HTTP header as email body diff -r 60085c8f01fe Doc/library/email.errors.rst --- a/Doc/library/email.errors.rst Thu Sep 08 22:37:34 2016 -0400 +++ b/Doc/library/email.errors.rst Mon Jan 23 23:39:53 2017 +0000 @@ -85,17 +85,15 @@ middle of a header block. * :class:`MissingHeaderBodySeparatorDefect` - A line was found while parsing - headers that had no leading white space but contained no ':'. Parsing - continues assuming that the line represents the first line of the body. + headers that had no leading white space but contained no ':'. If parsing + continues assuming that the line represents the first line of the body, + this defect is used. .. versionadded:: 3.3 * :class:`MalformedHeaderDefect` -- A header was found that was missing a colon, or was otherwise malformed. - .. deprecated:: 3.3 - This defect has not been used for several Python versions. - * :class:`MultipartInvariantViolationDefect` -- A message claimed to be a :mimetype:`multipart`, but no subparts were found. Note that when a message has this defect, its :meth:`~email.message.Message.is_multipart` method may diff -r 60085c8f01fe Lib/email/errors.py --- a/Lib/email/errors.py Thu Sep 08 22:37:34 2016 -0400 +++ b/Lib/email/errors.py Mon Jan 23 23:39:53 2017 +0000 @@ -55,8 +55,9 @@ class MissingHeaderBodySeparatorDefect(MessageDefect): """Found line with no leading whitespace and no colon before blank line.""" -# XXX: backward compatibility, just in case (it was never emitted). -MalformedHeaderDefect = MissingHeaderBodySeparatorDefect + +class MalformedHeaderDefect(MessageDefect): + """An ordinary header line did not match the expected format.""" class MultipartInvariantViolationDefect(MessageDefect): """A message claimed to be a multipart but no subparts were found.""" diff -r 60085c8f01fe Lib/email/feedparser.py --- a/Lib/email/feedparser.py Thu Sep 08 22:37:34 2016 -0400 +++ b/Lib/email/feedparser.py Mon Jan 23 23:39:53 2017 +0000 @@ -169,6 +169,9 @@ self._last = None self._headersonly = False + # True to parse a HTTP header section that is detached from any body + self.__body_detached = getattr(policy, "_py_body_detached", False) + # Non-public interface for supporting Parser's headersonly flag def _set_headersonly(self): self._headersonly = True @@ -233,9 +236,18 @@ # (i.e. newline), just throw it away. Otherwise the line is # part of the body so push it back. if not NLCRE.match(line): - defect = errors.MissingHeaderBodySeparatorDefect() + if self.__body_detached: + defect = "Invalid header line: " + repr(line) + defect = errors.MalformedHeaderDefect(defect) + else: + defect = errors.MissingHeaderBodySeparatorDefect() + self._input.unreadline(line) self.policy.handle_defect(self._cur, defect) - self._input.unreadline(line) + if self.__body_detached: + # Even in the case of a blank line, this could be "\r", + # which the HTTP parser does not consider to be the last + # line, so ignore it in case other header fields follow + continue break headers.append(line) # Done with the headers, so parse them and figure out what we're @@ -322,7 +334,7 @@ lines.append(line) self._cur.set_payload(EMPTYSTRING.join(lines)) return - # Make sure a valid content type was specified per RFC 2045:6.4. + # Make sure a valid encoding was specified per RFC 2045:6.4. if (self._cur.get('content-transfer-encoding', '8bit').lower() not in ('7bit', '8bit', 'binary')): defect = errors.InvalidMultipartContentTransferEncodingDefect() @@ -499,7 +511,7 @@ line = line[:-len(mo.group(0))] self._cur.set_unixfrom(line) continue - elif lineno == len(lines) - 1: + elif not self.__body_detached and lineno == len(lines) - 1: # Something looking like a unix-from at the end - it's # probably the first line of the body, so push back the # line and stop. @@ -511,10 +523,13 @@ defect = errors.MisplacedEnvelopeHeaderDefect(line) self._cur.defects.append(defect) continue + # Split the line on the colon separating field name from value. - # There will always be a colon, because if there wasn't the part of - # the parser that calls us would have started parsing the body. - i = line.find(':') + # There will always be a colon, because if there wasn't, + # it would have been picked up by the part of + # the parser that calls us, or the continuation or envelope + # checks above. + i = line.index(':') # If the colon is on the start of the line the header is clearly # malformed, but we might be able to salvage the rest of the @@ -524,7 +539,6 @@ self._cur.defects.append(defect) continue - assert i>0, "_parse_headers fed line with no : and no leading WS" lastheader = line[:i] lastvalue = [line] # Done with all the lines, so handle the last header. diff -r 60085c8f01fe Lib/http/client.py --- a/Lib/http/client.py Thu Sep 08 22:37:34 2016 -0400 +++ b/Lib/http/client.py Mon Jan 23 23:39:53 2017 +0000 @@ -69,6 +69,7 @@ """ import email.parser +import email.policy import email.message import http import io @@ -191,14 +192,16 @@ lst.append(line) return lst +class _Policy(email.policy.Compat32): + _py_body_detached = True + def parse_headers(fp, _class=HTTPMessage): """Parses only RFC2822 headers from a file pointer. - email Parser wants to see strings rather than bytes. - But a TextIOWrapper around self.rfile would buffer too many bytes - from the stream, bytes which we later need to read as bytes. - So we read the correct bytes here, as bytes, for email Parser - to parse. + The parser works with text strings rather than bytes. + But a TextIOWrapper may internally buffer too many bytes from the stream, + bytes which we later need to read. So we read the correct number of + bytes from the stream before decoding them to text to be parsed. """ headers = [] @@ -212,7 +215,8 @@ if line in (b'\r\n', b'\n', b''): break hstring = b''.join(headers).decode('iso-8859-1') - return email.parser.Parser(_class=_class).parsestr(hstring) + parser = email.parser.Parser(_class=_class, policy=_Policy()) + return parser.parsestr(hstring) class HTTPResponse(io.BufferedIOBase): diff -r 60085c8f01fe Lib/test/test_httplib.py --- a/Lib/test/test_httplib.py Thu Sep 08 22:37:34 2016 -0400 +++ b/Lib/test/test_httplib.py Mon Jan 23 23:39:53 2017 +0000 @@ -1,3 +1,4 @@ +import email.errors import errno from http import client import io @@ -283,6 +284,106 @@ self.assertEqual(resp.getheader('First'), 'val') self.assertEqual(resp.getheader('Second'), 'val') + def test_malformed_truncation(self): + # Other malformed header lines, especially without colons, used to + # cause the rest of the header section to be truncated + resp = ( + b'HTTP/1.1 200 OK\r\n' + b'Public-Key-Pins: \n' + b'pin-sha256="xxx=";\n' + b'report-uri="https://..."\r\n' + b'Transfer-Encoding: chunked\r\n' + b'\r\n' + b'4\r\nbody\r\n0\r\n\r\n' + ) + resp = client.HTTPResponse(FakeSocket(resp)) + resp.begin() + self.assertIsNotNone(resp.getheader('Public-Key-Pins')) + self.assertEqual(resp.getheader('Transfer-Encoding'), 'chunked') + for defect in resp.msg.defects: + self.assertIsInstance(defect, email.errors.MalformedHeaderDefect) + self.assertEqual(resp.msg.get_payload(), '') + self.assertEqual(resp.read(), b'body') + + def test_blank_line_forms(self): + # There are two independent bits of logic that detect the blank line + # at the end, so test they agree + for blank in (b'\r\n', b'\n'): + resp = b'HTTP/1.1 200 OK\r\n' b'Transfer-Encoding: chunked\r\n' + resp += blank + resp += b'4\r\nbody\r\n0\r\n\r\n' + resp = client.HTTPResponse(FakeSocket(resp)) + resp.begin() + self.assertEqual(resp.getheader('Transfer-Encoding'), 'chunked') + self.assertSequenceEqual(resp.msg.defects, ()) + self.assertEqual(resp.msg.get_payload(), '') + self.assertEqual(resp.read(), b'body') + + resp = b'HTTP/1.0 200 OK\r\n' + blank + b'body' + resp = client.HTTPResponse(FakeSocket(resp)) + resp.begin() + self.assertSequenceEqual(resp.msg.defects, ()) + self.assertEqual(resp.msg.get_payload(), '') + self.assertEqual(resp.read(), b'body') + + # A blank line ending in CR is not treated as the end of the HTTP + # header section, therefore any header fields following it should not + # be parsed as an email body + resp = ( + b'HTTP/1.1 200 OK\r\n' + b'\r' + b'Transfer-Encoding: chunked\r\n' + b'\r\n' + b'4\r\nbody\r\n0\r\n\r\n' + ) + resp = client.HTTPResponse(FakeSocket(resp)) + resp.begin() + self.assertEqual(resp.getheader('Transfer-Encoding'), 'chunked') + self.assertEqual(resp.msg.get_payload(), '') + self.assertEqual(resp.read(), b'body') + + # No header fields nor blank line + resp = b'HTTP/1.0 200 OK\r\n' + resp = client.HTTPResponse(FakeSocket(resp)) + resp.begin() + self.assertEqual(resp.msg.get_payload(), '') + self.assertEqual(resp.read(), b'') + + def test_from_line(self): + # The parser handles "From" lines specially, so test this does not + # affect parsing the rest of the header section + resp = ( + b'HTTP/1.1 200 OK\r\n' + b'From start\r\n' + b' continued\r\n' + b'Name: value\r\n' + b'From middle\r\n' + b' continued\r\n' + b'Transfer-Encoding: chunked\r\n' + b'From end\r\n' + b'\r\n' + b'4\r\nbody\r\n0\r\n\r\n' + ) + resp = client.HTTPResponse(FakeSocket(resp)) + resp.begin() + self.assertIsNotNone(resp.getheader('Name')) + self.assertEqual(resp.getheader('Transfer-Encoding'), 'chunked') + defects = tuple(map(type, resp.msg.defects)) + self.assertIn(email.errors.MisplacedEnvelopeHeaderDefect, defects) + self.assertEqual(resp.msg.get_payload(), '') + self.assertEqual(resp.read(), b'body') + + resp = ( + b'HTTP/1.0 200 OK\r\n' + b'From alone\r\n' + b'\r\n' + b'body' + ) + resp = client.HTTPResponse(FakeSocket(resp)) + resp.begin() + self.assertEqual(resp.msg.get_payload(), '') + self.assertEqual(resp.read(), b'body') + def test_parse_all_octets(self): # Ensure no valid header field octet breaks the parser body = ( diff -r 60085c8f01fe Misc/NEWS --- a/Misc/NEWS Thu Sep 08 22:37:34 2016 -0400 +++ b/Misc/NEWS Mon Jan 23 23:39:53 2017 +0000 @@ -129,6 +129,12 @@ characters, not on arbitrary unicode line breaks. This also fixes a bug in HTTP header parsing. +- Issue #24363: When parsing HTTP header fields, if an invalid line is + encountered, record it as a MalformedHeaderDefect and continue parsing. + Previously, no more header fields were parsed, which could lead to fields + for HTTP framing like Content-Length and Transfer-Encoding being + overlooked. + - Issue 27331: The email.mime classes now all accept an optional policy keyword. - Issue 27988: Fix email iter_attachments incorrect mutation of payload list.