Index: Lib/io.py
===================================================================
--- Lib/io.py (revision 58888)
+++ Lib/io.py (working copy)
@@ -1037,6 +1037,35 @@
         return None
 
 
+class IncrementalNewlineDecoder(codecs.BufferedIncrementalDecoder):
+    def __init__(self, decoder, errors='strict'):
+        codecs.BufferedIncrementalDecoder.__init__(self, errors=errors)
+        self.decoder = decoder
+
+    def _buffer_decode(self, input, errors, final):
+        output = self.decoder.decode(input, final=final)
+        consumed = len(input)
+        if output.endswith("\r") and not final:
+            output = output[:-1]
+            consumed -= 1
+
+        output = output.replace("\r\n", "\n")
+        output = output.replace("\r", "\n")
+        return output, consumed
+
+    def getstate(self):
+        buf, flag = self.decoder.getstate()
+        return buf + self.buffer, flag
+
+    def setstate(self, state):
+        buf, flag = state
+        if buf.endswith(b'\r'):
+            self.buffer = b'\r'
+            buf = buf[:-1]
+        else:
+            self.buffer = b''
+        self.decoder.setstate((buf, flag))
+
 class TextIOWrapper(TextIOBase):
 
     """Buffered text stream.
@@ -1133,7 +1160,10 @@
         if make_decoder is None:
             raise IOError("Can't find an incremental decoder for encoding %s" %
                           self._encoding)
-        decoder = self._decoder = make_decoder()  # XXX: errors
+        decoder = make_decoder()  # XXX: errors
+        if self._readtranslate:
+            decoder = IncrementalNewlineDecoder(decoder)
+        self._decoder = decoder
         return decoder
 
     def _read_chunk(self):
@@ -1376,10 +1406,6 @@
             lf = data.count('\n') - crlf
             self._seennl |= (lf and self._LF) | (cr and self._CR) \
                          | (crlf and self._CRLF)
-            if crlf:
-                data = data.replace("\r\n", "\n")
-            if cr:
-                data = data.replace("\r", "\n")
         elif self._readnl == '\n':
             # Only need to detect if \n was seen.
             if data.count('\n'):
Index: Lib/test/test_io.py
===================================================================
--- Lib/test/test_io.py (revision 58888)
+++ Lib/test/test_io.py (working copy)
@@ -485,6 +485,10 @@
 
 class TextIOWrapperTest(unittest.TestCase):
 
+    def setUp(self):
+        self.testdata = b"AAA\r\nBBB\rCCC\r\nDDD\nEEE\r\n"
+        self.normalized = b"AAA\nBBB\nCCC\nDDD\nEEE\n".decode("ASCII")
+
     def tearDown(self):
         test_support.unlink(test_support.TESTFN)
 
@@ -741,6 +745,110 @@
             print("Reading using readline(): %6.3f seconds" % (t3-t2))
             print("Using readline()+tell(): %6.3f seconds" % (t4-t3))
 
+    def testReadOneByOne(self):
+        txt = io.TextIOWrapper(io.BytesIO(b"AA\r\nBB"))
+        reads = ""
+        while True:
+            c = txt.read(1)
+            if not c:
+                break
+            reads += c
+        self.assertEquals(reads, "AA\nBB")
+
+    # read in amounts equal to TextIOWrapper._CHUNK_SIZE which is 128.
+    def testReadByChunk(self):
+        # make sure "\r\n" straddles 128 char boundary.
+        txt = io.TextIOWrapper(io.BytesIO(b"A" * 127 + b"\r\nB"))
+        reads = ""
+        while True:
+            c = txt.read(128)
+            if not c:
+                break
+            reads += c
+        self.assertEquals(reads, "A"*127+"\nB")
+
+    def test_issue1395_1(self):
+        txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ASCII")
+
+        # read one char at a time
+        reads = ""
+        while True:
+            c = txt.read(1)
+            if not c:
+                break
+            reads += c
+        self.assertEquals(reads, self.normalized)
+
+    def test_issue1395_2(self):
+        txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ASCII")
+        txt._CHUNK_SIZE = 4
+
+        reads = ""
+        while True:
+            c = txt.read(4)
+            if not c:
+                break
+            reads += c
+        self.assertEquals(reads, self.normalized)
+
+    def test_issue1395_3(self):
+        txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ASCII")
+        txt._CHUNK_SIZE = 4
+
+        reads = txt.read(4)
+        reads += txt.read(4)
+        reads += txt.readline()
+        reads += txt.readline()
+        reads += txt.readline()
+        self.assertEquals(reads, self.normalized)
+
+    def test_issue1395_4(self):
+        txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ASCII")
+        txt._CHUNK_SIZE = 4
+
+        reads = txt.read(4)
+        reads += txt.read()
+        self.assertEquals(reads, self.normalized)
+
+    def test_issue1395_5(self):
+        txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ASCII")
+        txt._CHUNK_SIZE = 4
+
+        reads = txt.read(4)
+        pos = txt.tell()
+        txt.seek(0)
+        txt.seek(pos)
+        self.assertEquals(txt.read(4), "BBB\n")
+
+    def test_newline_decoder(self):
+        import codecs
+        decoder = codecs.getincrementaldecoder("utf-8")()
+        decoder = io.IncrementalNewlineDecoder(decoder)
+
+        self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), "\u8888")
+
+        self.assertEquals(decoder.decode(b'\xe8'), "")
+        self.assertEquals(decoder.decode(b'\xa2'), "")
+        self.assertEquals(decoder.decode(b'\x88'), "\u8888")
+
+        self.assertEquals(decoder.decode(b'\xe8'), "")
+        self.assertRaises(UnicodeDecodeError, decoder.decode, b'', final=True)
+
+        decoder.reset()
+        self.assertEquals(decoder.decode(b'\n'), "\n")
+        self.assertEquals(decoder.decode(b'\r'), "")
+        self.assertEquals(decoder.decode(b'', final=True), "\n")
+
+        self.assertEquals(decoder.decode(b'\r\r\n'), "\n\n")
+        self.assertEquals(decoder.decode(b'\r'), "")
+        self.assertEquals(decoder.decode(b'\r'), "\n")
+        self.assertEquals(decoder.decode(b'\n'), "\n")
+
+        self.assertEquals(decoder.decode(b'\xe8\xa2\x88\r\n'), "\u8888\n")
+        self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), "\u8888")
+        self.assertEquals(decoder.decode(b'\n'), "\n")
+        self.assertEquals(decoder.decode(b'\xe8\xa2\x88\r'), "\u8888")
+        self.assertEquals(decoder.decode(b'\n'), "\n")
 
 # XXX Tests for open()
 