Index: Lib/email/test/test_email.py
===================================================================
--- Lib/email/test/test_email.py	(revision 74225)
+++ Lib/email/test/test_email.py	(working copy)
@@ -18,6 +18,7 @@
 from email.charset import Charset
 from email.header import Header, decode_header, make_header
 from email.parser import Parser, HeaderParser
+from email.feedparser import BufferedSubFile, NeedMoreData
 from email.generator import Generator, DecodedGenerator
 from email.message import Message
 from email.mime.application import MIMEApplication
@@ -2510,6 +2511,37 @@
         eq(msg.get('Header'), value1)
         eq(msg.get('Next-Header'), value2)
 
+    def test_detect_microsoft_linebreak_at_end_of_chunk(self):
+        # Addresses: [ 1721862 ] email.FeedParser.BufferedSubFile
+        #  improperly handles "\r\n"
+        # Test code based on example provided by Sye van der Veen.
+        # First line contains previously buggy data; second and third must
+        # continue to pass, post-alteration.
+        text = "1\r\n10\r\n100\r\n1000\r\n10000\r\n" + \
+            "1\r10\r100\r1000\r10000\r" + \
+            "1\n10\n100\n1000\n10000\n"
+        data = StringIO(text)
+        results = []
+        buffered_sub_file = BufferedSubFile()
+        while True:
+            sample = data.read(3)  # Read the next three characters.
+            if not sample:
+                break
+
+            buffered_sub_file.push(sample)
+            for line in buffered_sub_file:
+                if line is NeedMoreData:
+                    break
+                else:
+                    results.append(line)
+        buffered_sub_file.close()
+        data.close()
+
+        targets = ['1' + target for target in text.split('1')[1:]]
+        self.assertEqual(len(results), len(targets))
+        for (result, target) in zip(results, targets):
+            self.assertEqual(result, target)
+
     def test_rfc2822_header_syntax(self):
         eq = self.assertEqual
         m = '>From: foo\nFrom: bar\n!"#QUX;~: zoo\n\nbody'
Index: Lib/email/feedparser.py
===================================================================
--- Lib/email/feedparser.py	(revision 74225)
+++ Lib/email/feedparser.py	(working copy)
@@ -26,6 +26,8 @@
 from email import errors
 from email import message
 
+NLC_mac = '\r'
+NLC_unix = '\n'
 NLCRE = re.compile('\r\n|\r|\n')
 NLCRE_bol = re.compile('(\r\n|\r|\n)')
 NLCRE_eol = re.compile('(\r\n|\r|\n)$')
@@ -98,12 +100,30 @@
         # Handle any previous leftovers
         data, self._partial = self._partial + data, ''
         # Crack into lines, but preserve the newlines on the end of each
+        # First, the possibility that the data was split in the middle of a
+        # Microsoft linebreak must be addressed.
+        if data.endswith(NLC_mac):
+            split_point = max(
+                data[:-len(NLC_mac)].rfind(NLC_mac),
+                data.rfind(NLC_unix)
+            )
+            if split_point > -1:
+                # Treat everything after the last sure split-point as partial.
+                self._partial = data[split_point + 1:]
+                data = data[:split_point + 1]
+            else:
+                # Treat the entire line as partial data and abort.
+                self._partial = data
+                return
         parts = NLCRE_crack.split(data)
         # The *ahem* interesting behaviour of re.split when supplied grouping
         # parentheses is that the last element of the resulting list is the
         # data after the final RE.  In the case of a NL/CR terminated string,
         # this is the empty string.
-        self._partial = parts.pop()
+        # Always pop, so parts keeps strictly alternating content/eol pairs.
+        trailing = parts.pop()
+        if not self._partial:  # Partial data may have been set earlier.
+            self._partial = trailing
         # parts is a list of strings, alternating between the line contents
         # and the eol character(s).  Gather up a list of lines after
         # re-attaching the newlines.