# HG changeset patch # Parent 304c61263ae6025379eb0f55c0bbca45e851be94 Issue #26686: Stop parsing defective header lines as the start of the body diff -r 304c61263ae6 Doc/library/email.errors.rst --- a/Doc/library/email.errors.rst Mon Jun 13 09:24:11 2016 +0300 +++ b/Doc/library/email.errors.rst Mon Jun 13 12:52:07 2016 +0000 @@ -90,18 +90,16 @@ * :class:`MisplacedEnvelopeHeaderDefect` - A "Unix From" header was found in the middle of a header block. -* :class:`MissingHeaderBodySeparatorDefect` - A line was found while parsing - headers that had no leading white space but contained no ':'. Parsing - continues assuming that the line represents the first line of the body. +* :class:`MissingHeaderBodySeparatorDefect` - Indicates that the body was not + separated from the header section by an empty line. However this defect + cannot occur, because a body is currently only recognized when it is + non-empty and is introduced by an empty line. .. versionadded:: 3.3 * :class:`MalformedHeaderDefect` -- A header was found that was missing a colon, or was otherwise malformed. - .. deprecated:: 3.3 - This defect has not been used for several Python versions. - * :class:`MultipartInvariantViolationDefect` -- A message claimed to be a :mimetype:`multipart`, but no subparts were found. Note that when a message has this defect, its :meth:`~email.message.Message.is_multipart` method may diff -r 304c61263ae6 Lib/email/errors.py --- a/Lib/email/errors.py Mon Jun 13 09:24:11 2016 +0300 +++ b/Lib/email/errors.py Mon Jun 13 12:52:07 2016 +0000 @@ -53,10 +53,12 @@ class MisplacedEnvelopeHeaderDefect(MessageDefect): """A 'Unix-from' header was found in the middle of a header block.""" +# Kept for backward compatibility class MissingHeaderBodySeparatorDefect(MessageDefect): - """Found line with no leading whitespace and no colon before blank line.""" -# XXX: backward compatibility, just in case (it was never emitted). 
-MalformedHeaderDefect = MissingHeaderBodySeparatorDefect + """Body not separated from header section by empty line.""" + +class MalformedHeaderDefect(MessageDefect): + """An ordinary header line did not match the expected format.""" class MultipartInvariantViolationDefect(MessageDefect): """A message claimed to be a multipart but no subparts were found.""" diff -r 304c61263ae6 Lib/email/feedparser.py --- a/Lib/email/feedparser.py Mon Jun 13 09:24:11 2016 +0300 +++ b/Lib/email/feedparser.py Mon Jun 13 12:52:07 2016 +0000 @@ -34,7 +34,7 @@ NLCRE_crack = re.compile('(\r\n|\r|\n)') # RFC 2822 $3.6.8 Optional fields. ftext is %d33-57 / %d59-126, Any character # except controls, SP, and ":". -headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])') +headerRE = re.compile(r'^[\041-\071\073-\176]*:') EMPTYSTRING = '' NL = '\n' @@ -224,14 +224,9 @@ if line is NeedMoreData: yield NeedMoreData continue - if not headerRE.match(line): + if NLCRE.match(line): # If we saw the RFC defined header/body separator - # (i.e. newline), just throw it away. Otherwise the line is - # part of the body so push it back. - if not NLCRE.match(line): - defect = errors.MissingHeaderBodySeparatorDefect() - self.policy.handle_defect(self._cur, defect) - self._input.unreadline(line) + # (i.e. newline), just throw it away. break headers.append(line) # Done with the headers, so parse them and figure out what we're @@ -494,23 +489,23 @@ if mo: line = line[:-len(mo.group(0))] self._cur.set_unixfrom(line) - continue - elif lineno == len(lines) - 1: - # Something looking like a unix-from at the end - it's - # probably the first line of the body, so push back the - # line and stop. - self._input.unreadline(line) - return else: # Weirdly placed unix-from line. Note this as a defect # and ignore it. 
defect = errors.MisplacedEnvelopeHeaderDefect(line) self._cur.defects.append(defect) - continue + continue + + if not headerRE.match(line): + defect = "Invalid header line: " + repr(line) + defect = errors.MalformedHeaderDefect(defect) + self.policy.handle_defect(self._cur, defect) + continue + # Split the line on the colon separating field name from value. - # There will always be a colon, because if there wasn't the part of - # the parser that calls us would have started parsing the body. - i = line.find(':') + # There will always be a colon, because if there wasn't, it would + # have been picked up by the headerRE test above. + i = line.index(':') # If the colon is on the start of the line the header is clearly # malformed, but we might be able to salvage the rest of the @@ -519,8 +514,6 @@ defect = errors.InvalidHeaderDefect("Missing header name.") self._cur.defects.append(defect) continue - - assert i>0, "_parse_headers fed line with no : and no leading WS" lastheader = line[:i] lastvalue = [line] # Done with all the lines, so handle the last header. diff -r 304c61263ae6 Lib/test/test_email/test_defect_handling.py --- a/Lib/test/test_email/test_defect_handling.py Mon Jun 13 09:24:11 2016 +0300 +++ b/Lib/test/test_email/test_defect_handling.py Mon Jun 13 12:52:07 2016 +0000 @@ -207,18 +207,15 @@ [errors.FirstHeaderLineIsContinuationDefect]) self.assertEqual(self.get_defects(msg)[0].line, ' Line 1\n') - def test_missing_header_body_separator(self): - # Our heuristic if we see a line that doesn't look like a header (no - # leading whitespace but no ':') is to assume that the blank line that - # separates the header from the body is missing, and to stop parsing - # headers and start parsing the body. 
- with self._raise_point(errors.MissingHeaderBodySeparatorDefect): + def test_malformed_header_line(self): + # Similar to test_email.TestNonConformant.test_malformed_header_line + with self._raise_point(errors.MalformedHeaderDefect): msg = self._str_msg('Subject: test\nnot a header\nTo: abc\n\nb\n') if self.raise_expected: return - self.assertEqual(msg.keys(), ['Subject']) - self.assertEqual(msg.get_payload(), 'not a header\nTo: abc\n\nb\n') + self.assertEqual(msg.keys(), ['Subject', 'To']) + self.assertEqual(msg.get_payload(), 'b\n') self.assertDefectsEqual(self.get_defects(msg), - [errors.MissingHeaderBodySeparatorDefect]) + [errors.MalformedHeaderDefect]) def test_bad_padding_in_base64_payload(self): source = textwrap.dedent("""\ diff -r 304c61263ae6 Lib/test/test_email/test_email.py --- a/Lib/test/test_email/test_email.py Mon Jun 13 09:24:11 2016 +0300 +++ b/Lib/test/test_email/test_email.py Mon Jun 13 12:52:07 2016 +0000 @@ -2180,12 +2180,12 @@ def test_no_separating_blank_line(self): eq = self.ndiffAssertEqual msg = self._msgobj('msg_35.txt') + self.assertDefectsEqual(msg.defects, [errors.MalformedHeaderDefect]) eq(msg.as_string(), """\ From: aperson@dom.ain To: bperson@dom.ain Subject: here's something interesting -counter to RFC 2822, there's no separating newline here """) # test_defect_handling @@ -2226,17 +2226,22 @@ [errors.FirstHeaderLineIsContinuationDefect]) eq(msg.defects[0].line, ' Line 1\n') + def test_missing_header_body_separator(self): + self.assertTrue(issubclass(errors.MissingHeaderBodySeparatorDefect, + errors.MessageDefect)) + # test_defect_handling - def test_missing_header_body_separator(self): - # Our heuristic if we see a line that doesn't look like a header (no - # leading whitespace but no ':') is to assume that the blank line that - # separates the header from the body is missing, and to stop parsing - # headers and start parsing the body. 
- msg = self._str_msg('Subject: test\nnot a header\nTo: abc\n\nb\n') - self.assertEqual(msg.keys(), ['Subject']) - self.assertEqual(msg.get_payload(), 'not a header\nTo: abc\n\nb\n') + def test_malformed_header_line(self): + # An ordinary header line that is missing a colon (:) is a defect + input = 'Subject: test\nnot a header\nTo: abc\n\nb\n' + msg = self._str_msg(input) + self.assertEqual(msg.keys(), ['Subject', 'To']) + self.assertEqual(msg.get_payload(), 'b\n') self.assertDefectsEqual(msg.defects, - [errors.MissingHeaderBodySeparatorDefect]) + [errors.MalformedHeaderDefect]) + policy = email.policy.Compat32(raise_on_defect=True) + with self.assertRaises(errors.MalformedHeaderDefect): + self._str_msg(input, policy=policy) # Test RFC 2047 header encoding and decoding @@ -3635,7 +3640,9 @@ eq = self.assertEqual m = '>From foo@example.com 11:25:53\nFrom: bar\n!"#QUX;~: zoo\n\nbody' msg = email.message_from_string(m) - eq(len(msg.keys()), 0) + self.assertDefectsEqual(msg.defects, [errors.MalformedHeaderDefect]) + self.assertSequenceEqual(msg.keys(), ['From', '!"#QUX;~']) + eq(msg.get_payload(), 'body') def test_rfc2822_one_character_header(self): eq = self.assertEqual @@ -3772,7 +3779,7 @@ ('From: göst', ('From', '=?unknown-8bit?b?Z8O2c3Q=?=')), ) headertest_msg = ('\n'.join([src for (src, _) in headertest_headers]) + - '\nYes, they are flying.\n').encode('utf-8') + '\n\nYes, they are flying.\n').encode('utf-8') def test_get_8bit_header(self): msg = email.message_from_bytes(self.headertest_msg) diff -r 304c61263ae6 Lib/test/test_httplib.py --- a/Lib/test/test_httplib.py Mon Jun 13 09:24:11 2016 +0300 +++ b/Lib/test/test_httplib.py Mon Jun 13 12:52:07 2016 +0000 @@ -1,3 +1,4 @@ +import email.errors import errno from http import client import io @@ -283,6 +284,30 @@ self.assertEqual(resp.getheader('First'), 'val') self.assertEqual(resp.getheader('Second'), 'val') + def test_malformed_truncation(self): + # Other malformed header lines, especially without colons, used 
to + # cause the rest of the header section to be truncated + resp = ( + b'HTTP/1.1 200 OK\r\n' + b'Public-Key-Pins: \n' + b'pin-sha256="xxx=";\n' + b'report-uri="https://..."\r\n' + b'Transfer-Encoding: chunked\r\n' + b'\r\n' + b'4\r\n' + b'body\r\n' + b'0\r\n' + b'\r\n' + ) + sock = FakeSocket(resp) + resp = client.HTTPResponse(sock) + resp.begin() + self.assertIsNotNone(resp.getheader('Public-Key-Pins')) + self.assertEqual(resp.getheader('Transfer-Encoding'), 'chunked') + for defect in resp.msg.defects: + self.assertIsInstance(defect, email.errors.MalformedHeaderDefect) + self.assertEqual(resp.read(), b'body') + def test_invalid_headers(self): conn = client.HTTPConnection('example.com') conn.sock = FakeSocket('') diff -r 304c61263ae6 Misc/NEWS --- a/Misc/NEWS Mon Jun 13 09:24:11 2016 +0300 +++ b/Misc/NEWS Mon Jun 13 12:52:07 2016 +0000 @@ -50,6 +50,11 @@ Library ------- +- Issue #26686: In the email.parser module, only transition from the header + section to the body via a blank line, rather than any invalid header line. + This also affects HTTP header parsing, which would previously ignore + subsequent header fields. + - Issue #25455: Fixed crashes in repr of recursive ElementTree.Element and functools.partial objects.