# HG changeset patch
# Parent  60085c8f01fe4eb19a1c38a2d27fd77698b5a5ec
Issue #24363: Bypass parser to avoid parsing HTTP headers as email body

Now http.client.parse_headers() calls the internal FeedParser._parse_
headers() method directly, bypassing the possibility of the main parsing
logic interpreting invalid HTTP header fields as an email body.

diff -r 60085c8f01fe Doc/library/email.errors.rst
--- a/Doc/library/email.errors.rst	Thu Sep 08 22:37:34 2016 -0400
+++ b/Doc/library/email.errors.rst	Fri Sep 09 03:43:01 2016 +0000
@@ -85,17 +85,15 @@
   middle of a header block.
 
 * :class:`MissingHeaderBodySeparatorDefect` - A line was found while parsing
-  headers that had no leading white space but contained no ':'.  Parsing
-  continues assuming that the line represents the first line of the body.
+  headers that had no leading white space but contained no ':'.  If parsing
+  continues assuming that the line represents the first line of the body,
+  this defect is used.
 
   .. versionadded:: 3.3
 
 * :class:`MalformedHeaderDefect` -- A header was found that was missing a colon,
   or was otherwise malformed.
 
-  .. deprecated:: 3.3
-     This defect has not been used for several Python versions.
-
 * :class:`MultipartInvariantViolationDefect` -- A message claimed to be a
   :mimetype:`multipart`, but no subparts were found.  Note that when a message
   has this defect, its :meth:`~email.message.Message.is_multipart` method may
diff -r 60085c8f01fe Lib/email/errors.py
--- a/Lib/email/errors.py	Thu Sep 08 22:37:34 2016 -0400
+++ b/Lib/email/errors.py	Fri Sep 09 03:43:01 2016 +0000
@@ -55,8 +55,9 @@
 
 class MissingHeaderBodySeparatorDefect(MessageDefect):
     """Found line with no leading whitespace and no colon before blank line."""
-# XXX: backward compatibility, just in case (it was never emitted).
-MalformedHeaderDefect = MissingHeaderBodySeparatorDefect
+
+class MalformedHeaderDefect(MessageDefect):
+    """An ordinary header line did not match the expected format."""
 
 class MultipartInvariantViolationDefect(MessageDefect):
     """A message claimed to be a multipart but no subparts were found."""
diff -r 60085c8f01fe Lib/email/feedparser.py
--- a/Lib/email/feedparser.py	Thu Sep 08 22:37:34 2016 -0400
+++ b/Lib/email/feedparser.py	Fri Sep 09 03:43:01 2016 +0000
@@ -238,6 +238,10 @@
                     self._input.unreadline(line)
                 break
             headers.append(line)
+        if len(headers) > 1 and headers[-1].startswith('From '):
+            # Something looking like a unix-from at the end - it's
+            # probably the first line of the body, so push the line back.
+            self._input.unreadline(headers.pop())
         # Done with the headers, so parse them and figure out what we're
         # supposed to see in the body of the message.
         self._parse_headers(headers)
@@ -322,11 +326,7 @@
                     lines.append(line)
                 self._cur.set_payload(EMPTYSTRING.join(lines))
                 return
-            # Make sure a valid content type was specified per RFC 2045:6.4.
-            if (self._cur.get('content-transfer-encoding', '8bit').lower()
-                    not in ('7bit', '8bit', 'binary')):
-                defect = errors.InvalidMultipartContentTransferEncodingDefect()
-                self.policy.handle_defect(self._cur, defect)
+            self._validate_composite_cte()
             # Create a line match predicate which matches the inter-part
             # boundary as well as the end-of-multipart boundary.  Don't push
             # this onto the input stream until we've scanned past the
@@ -473,6 +473,7 @@
 
     def _parse_headers(self, lines):
         # Passed a list of lines that make up the headers for the current msg
+        # This method is also called directly by http.client.parse_headers()
         lastheader = ''
         lastvalue = []
         for lineno, line in enumerate(lines):
@@ -498,23 +499,18 @@
                     if mo:
                         line = line[:-len(mo.group(0))]
                     self._cur.set_unixfrom(line)
-                    continue
-                elif lineno == len(lines) - 1:
-                    # Something looking like a unix-from at the end - it's
-                    # probably the first line of the body, so push back the
-                    # line and stop.
-                    self._input.unreadline(line)
-                    return
                 else:
                     # Weirdly placed unix-from line.  Note this as a defect
                     # and ignore it.
                     defect = errors.MisplacedEnvelopeHeaderDefect(line)
                     self._cur.defects.append(defect)
-                    continue
+                continue
+
             # Split the line on the colon separating field name from value.
-            # There will always be a colon, because if there wasn't the part of
-            # the parser that calls us would have started parsing the body.
-            i = line.find(':')
+            # There will always be a colon, because if there wasn't, it would
+            # have been picked up by a headerRE check in the caller, or the
+            # continuation or envelope checks above.
+            i = line.index(':')
 
             # If the colon is on the start of the line the header is clearly
             # malformed, but we might be able to salvage the rest of the
@@ -524,13 +520,20 @@
                 self._cur.defects.append(defect)
                 continue
 
-            assert i>0, "_parse_headers fed line with no : and no leading WS"
             lastheader = line[:i]
             lastvalue = [line]
         # Done with all the lines, so handle the last header.
         if lastheader:
             self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
 
+    def _validate_composite_cte(self):
+        # Make sure a valid Content-Transfer-Encoding was specified per
+        # RFC 2045:6.4.
+        if (self._cur.get('content-transfer-encoding', '8bit').lower()
+                not in ('7bit', '8bit', 'binary')):
+            defect = errors.InvalidMultipartContentTransferEncodingDefect()
+            self.policy.handle_defect(self._cur, defect)
+
 
 class BytesFeedParser(FeedParser):
     """Like FeedParser, but feed accepts bytes."""
diff -r 60085c8f01fe Lib/http/client.py
--- a/Lib/http/client.py	Thu Sep 08 22:37:34 2016 -0400
+++ b/Lib/http/client.py	Fri Sep 09 03:43:01 2016 +0000
@@ -68,7 +68,8 @@
 Req-sent-unread-response       _CS_REQ_SENT       <response_class>
 """
 
-import email.parser
+import email.errors
+import email.feedparser
 import email.message
 import http
 import io
@@ -194,25 +195,49 @@
 def parse_headers(fp, _class=HTTPMessage):
     """Parses only RFC2822 headers from a file pointer.
 
-    email Parser wants to see strings rather than bytes.
-    But a TextIOWrapper around self.rfile would buffer too many bytes
-    from the stream, bytes which we later need to read as bytes.
-    So we read the correct bytes here, as bytes, for email Parser
-    to parse.
+    The FeedParser class works with text strings rather than bytes.
+    But a TextIOWrapper may internally buffer too many bytes from the stream,
+    bytes which we later need to read.  So we read the correct number of
+    bytes from the stream before decoding them to text to be parsed.
 
     """
+    parser = email.feedparser.FeedParser()
+    parser._cur = _class()
     headers = []
+    count = 0
     while True:
         line = fp.readline(_MAXLINE + 1)
         if len(line) > _MAXLINE:
             raise LineTooLong("header line")
-        headers.append(line)
-        if len(headers) > _MAXHEADERS:
+        count += 1
+        if count > _MAXHEADERS:
             raise HTTPException("got more than %d headers" % _MAXHEADERS)
         if line in (b'\r\n', b'\n', b''):
             break
-    hstring = b''.join(headers).decode('iso-8859-1')
-    return email.parser.Parser(_class=_class).parsestr(hstring)
+        line = line.decode('iso-8859-1')
+        if not email.feedparser.headerRE.match(line):
+            defect = "Invalid header line: " + repr(line)
+            defect = email.errors.MalformedHeaderDefect(defect)
+            parser.policy.handle_defect(parser._cur, defect)
+            continue
+        headers.append(line)
+    parser._parse_headers(headers)
+
+    # Imitate normal email parser behaviour, but set an empty payload
+    type = parser._cur.get_content_maintype()
+    if type == "message":
+        # Payload must be a list, and for types other than message/delivery-
+        # status, it must have at least one submessage.
+        payload = _class()
+        payload.set_payload("")
+        parser._cur.set_payload([payload])
+    else:
+        if type == "multipart":
+            parser._validate_composite_cte()
+            parser._cur.epilogue = ""
+        parser._cur.set_payload("")
+
+    return parser._cur
 
 
 class HTTPResponse(io.BufferedIOBase):
diff -r 60085c8f01fe Lib/test/test_httplib.py
--- a/Lib/test/test_httplib.py	Thu Sep 08 22:37:34 2016 -0400
+++ b/Lib/test/test_httplib.py	Fri Sep 09 03:43:01 2016 +0000
@@ -1,3 +1,4 @@
+import email.errors
 import errno
 from http import client
 import io
@@ -283,6 +284,30 @@
         self.assertEqual(resp.getheader('First'), 'val')
         self.assertEqual(resp.getheader('Second'), 'val')
 
+    def test_malformed_truncation(self):
+        # Other malformed header lines, especially without colons, used to
+        # cause the rest of the header section to be truncated
+        resp = (
+            b'HTTP/1.1 200 OK\r\n'
+            b'Public-Key-Pins: \n'
+            b'pin-sha256="xxx=";\n'
+            b'report-uri="https://..."\r\n'
+            b'Transfer-Encoding: chunked\r\n'
+            b'\r\n'
+            b'4\r\n'
+            b'body\r\n'
+            b'0\r\n'
+            b'\r\n'
+        )
+        sock = FakeSocket(resp)
+        resp = client.HTTPResponse(sock)
+        resp.begin()
+        self.assertIsNotNone(resp.getheader('Public-Key-Pins'))
+        self.assertEqual(resp.getheader('Transfer-Encoding'), 'chunked')
+        for defect in resp.msg.defects:
+            self.assertIsInstance(defect, email.errors.MalformedHeaderDefect)
+        self.assertEqual(resp.read(), b'body')
+
     def test_parse_all_octets(self):
         # Ensure no valid header field octet breaks the parser
         body = (
diff -r 60085c8f01fe Misc/NEWS
--- a/Misc/NEWS	Thu Sep 08 22:37:34 2016 -0400
+++ b/Misc/NEWS	Fri Sep 09 03:43:01 2016 +0000
@@ -129,6 +129,12 @@
   characters, not on arbitrary unicode line breaks.  This also fixes a bug in
   HTTP header parsing.
 
+- Issue #24363: When parsing HTTP header fields, if an invalid line is
+  encountered, record it as a MalformedHeaderDefect and continue parsing.
+  Previously, no more header fields were parsed, which could lead to fields
+  for HTTP framing like Content-Length and Transfer-Encoding being
+  overlooked.
+
 - Issue 27331: The email.mime classes now all accept an optional policy keyword.
 
 - Issue 27988: Fix email iter_attachments incorrect mutation of payload list.