Index: Lib/io.py =================================================================== --- Lib/io.py (revision 58888) +++ Lib/io.py (working copy) @@ -1036,6 +1036,73 @@ """ return None +_NEWLINE_LF = 1 +_NEWLINE_CR = 2 +_NEWLINE_CRLF = 4 +def _format_newlines(seennl): + return (None, + "\n", + "\r", + ("\r", "\n"), + "\r\n", + ("\n", "\r\n"), + ("\r", "\r\n"), + ("\r", "\n", "\r\n") + )[seennl] + + +class IncrementalNewlineDecoder(codecs.IncrementalDecoder): + def __init__(self, decoder, translate, errors='strict'): + codecs.IncrementalDecoder.__init__(self, errors=errors) + self.buffer = b'' + self.translate = translate + self.decoder = decoder + self.seennl = 0 + + def decode(self, input, final=False): + # decode input (taking the buffer into account) + data = self.buffer + input + + output = self.decoder.decode(data, final=final) + consumed = len(data) + if self.translate and output.endswith("\r") and not final: + output = output[:-1] + consumed -= 1 + + # Record which newlines are read + crlf = output.count('\r\n') + cr = output.count('\r') - crlf + lf = output.count('\n') - crlf + self.seennl |= (lf and _NEWLINE_LF) | (cr and _NEWLINE_CR) \ + | (crlf and _NEWLINE_CRLF) + + if self.translate: + if crlf: + output = output.replace("\r\n", "\n") + if cr: + output = output.replace("\r", "\n") + + # keep undecoded input until the next call + self.buffer = data[consumed:] + return output + + def getstate(self): + buf, flag = self.decoder.getstate() + return buf + self.buffer, flag + + def setstate(self, state): + buf, flag = state + if buf.endswith(b'\r'): + self.buffer = b'\r' + buf = buf[:-1] + else: + self.buffer = b'' + self.decoder.setstate((buf, flag)) + + def reset(self): + self.buffer = b'' + self.decoder.reset() + class TextIOWrapper(TextIOBase): @@ -1117,6 +1184,7 @@ if not isinstance(s, str): raise TypeError("can't write %s to text stream" % s.__class__.__name__) + length = len(s) haslf = "\n" in s if haslf and self._writetranslate and self._writenl != "\n": s = s.replace("\n", self._writenl) @@ -1125,15 +1193,20 @@ self.buffer.write(b) if haslf and self.isatty(): self.flush() - self._snapshot = self._decoder = None - return len(s) + self._snapshot = None + if self._decoder: + self._decoder.reset() + return length def _get_decoder(self): make_decoder = codecs.getincrementaldecoder(self._encoding) if make_decoder is None: raise IOError("Can't find an incremental decoder for encoding %s" % self._encoding) - decoder = self._decoder = make_decoder() # XXX: errors + decoder = make_decoder() # XXX: errors + if self._readuniversal: + decoder = IncrementalNewlineDecoder(decoder, self._readtranslate) + self._decoder = decoder return decoder def _read_chunk(self): @@ -1193,7 +1266,7 @@ decoder_buffer, decoder_state = decoder.getstate() return self._encode_decoder_state( decoder_state, - position + (i+1) - len(decoder_buffer)) + position + (i+1) - len(decoder_buffer) - (n-needed)) raise IOError("Can't reconstruct logical file position") finally: decoder.setstate(saved_state) @@ -1213,7 +1286,8 @@ pos = self.buffer.seek(0, 2) self._snapshot = None self._pending = "" - self._decoder = None + if self._decoder: + self._decoder.reset() return pos if whence != 0: raise ValueError("Invalid whence (%r, should be 0, 1 or 2)" % @@ -1227,7 +1301,8 @@ self.buffer.seek(pos) self._snapshot = None self._pending = "" - self._decoder = None + if self._decoder: + self._decoder.reset() return pos decoder = self._decoder or self._get_decoder() decoder.set_state(("", ds)) @@ -1246,7 +1321,9 @@ res += decoder.decode(self.buffer.read(), True) self._pending = "" self._snapshot = None - return self._replacenl(res) + if self._readuniversal: + self._seennl |= decoder.seennl + return res else: while len(res) < n: readahead, pending = self._read_chunk() @@ -1254,7 +1331,9 @@ if not readahead: break self._pending = res[n:] - return self._replacenl(res[:n]) + if self._readuniversal: + self._seennl |= decoder.seennl + return res[:n] def __next__(self): self._telling = False @@ -1297,7 +1376,7 @@ # Found \n pos = nlpos endpos = pos + 1 - ending = self._LF + ending = _NEWLINE_LF break elif nlpos == -1: if crpos == len(line) - 1: @@ -1306,7 +1385,7 @@ cr_eof = True else: # Found lone \r - ending = self._CR + ending = _NEWLINE_CR pos = crpos endpos = pos + 1 break @@ -1314,11 +1393,11 @@ # Found \n pos = nlpos endpos = pos + 1 - ending = self._LF + ending = _NEWLINE_LF break elif nlpos == crpos + 1: # Found \r\n - ending = self._CRLF + ending = _NEWLINE_CRLF pos = crpos endpos = pos + 2 break @@ -1326,7 +1405,7 @@ # Found \r pos = crpos endpos = pos + 1 - ending = self._CR + ending = _NEWLINE_CR break else: # non-universal @@ -1340,6 +1419,8 @@ more_line = '' while True: readahead, pending = self._read_chunk() + if self._readuniversal: + self._seennl |= self._decoder.seennl more_line = pending if more_line or not readahead: break @@ -1350,61 +1431,22 @@ self._pending = '' self._snapshot = None if cr_eof: - self._seennl |= self._CR return line[:-1] + '\n' else: return line self._pending = line[endpos:] if self._readtranslate: - self._seennl |= ending - if ending != self._LF: + if ending != _NEWLINE_LF: return line[:pos] + '\n' else: return line[:endpos] else: return line[:endpos] - def _replacenl(self, data): - # Replace newlines in data as needed and record that they have - # been seen. - if not self._readtranslate: - return data - if self._readuniversal: - crlf = data.count('\r\n') - cr = data.count('\r') - crlf - lf = data.count('\n') - crlf - self._seennl |= (lf and self._LF) | (cr and self._CR) \ - | (crlf and self._CRLF) - if crlf: - data = data.replace("\r\n", "\n") - if cr: - data = data.replace("\r", "\n") - elif self._readnl == '\n': - # Only need to detect if \n was seen. - if data.count('\n'): - self._seennl |= self._LF - else: - newdata = data.replace(self._readnl, '\n') - if newdata is not data: - self._seennl |= self._nlflag(self._readnl) - data = newdata - return data - - _LF = 1 - _CR = 2 - _CRLF = 4 @property def newlines(self): - return (None, - "\n", - "\r", - ("\r", "\n"), - "\r\n", - ("\n", "\r\n"), - ("\r", "\r\n"), - ("\r", "\n", "\r\n") - )[self._seennl] + return _format_newlines(self._seennl) def _nlflag(self, nlstr): return [None, "\n", "\r", None, "\r\n"].index(nlstr) Index: Lib/test/test_io.py =================================================================== --- Lib/test/test_io.py (revision 58888) +++ Lib/test/test_io.py (working copy) @@ -485,6 +485,10 @@ class TextIOWrapperTest(unittest.TestCase): + def setUp(self): + self.testdata = b"AAA\r\nBBB\rCCC\r\nDDD\nEEE\r\n" + self.normalized = b"AAA\nBBB\nCCC\nDDD\nEEE\n".decode("ASCII") + def tearDown(self): test_support.unlink(test_support.TESTFN) @@ -741,6 +745,114 @@ print("Reading using readline(): %6.3f seconds" % (t3-t2)) print("Using readline()+tell(): %6.3f seconds" % (t4-t3)) + def testReadOneByOne(self): + txt = io.TextIOWrapper(io.BytesIO(b"AA\r\nBB")) + reads = "" + while True: + c = txt.read(1) + if not c: + break + reads += c + self.assertEquals(reads, "AA\nBB") + + # read in amounts equal to TextIOWrapper._CHUNK_SIZE which is 128. + def testReadByChunk(self): + # make sure "\r\n" straddles 128 char boundary. + txt = io.TextIOWrapper(io.BytesIO(b"A" * 127 + b"\r\nB")) + reads = "" + while True: + c = txt.read(128) + if not c: + break + reads += c + self.assertEquals(reads, "A"*127+"\nB") + + def test_issue1395_1(self): + txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ASCII") + + # read one char at a time + reads = "" + while True: + c = txt.read(1) + if not c: + break + reads += c + self.assertEquals(reads, self.normalized) + + def test_issue1395_2(self): + txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ASCII") + txt._CHUNK_SIZE = 4 + + reads = "" + while True: + c = txt.read(4) + if not c: + break + reads += c + self.assertEquals(reads, self.normalized) + + def test_issue1395_3(self): + txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ASCII") + txt._CHUNK_SIZE = 4 + + reads = txt.read(4) + reads += txt.read(4) + reads += txt.readline() + reads += txt.readline() + reads += txt.readline() + self.assertEquals(reads, self.normalized) + + def test_issue1395_4(self): + txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ASCII") + txt._CHUNK_SIZE = 4 + + reads = txt.read(4) + reads += txt.read() + self.assertEquals(reads, self.normalized) + + def test_issue1395_5(self): + txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ASCII") + txt._CHUNK_SIZE = 4 + + reads = txt.read(4) + pos = txt.tell() + txt.seek(0) + txt.seek(pos) + self.assertEquals(txt.read(4), "BBB\n") + + def test_newline_decoder(self): + import codecs + decoder = codecs.getincrementaldecoder("utf-8")() + decoder = io.IncrementalNewlineDecoder(decoder, translate=True) + + self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), "\u8888") + + self.assertEquals(decoder.decode(b'\xe8'), "") + self.assertEquals(decoder.decode(b'\xa2'), "") + self.assertEquals(decoder.decode(b'\x88'), "\u8888") + + self.assertEquals(decoder.decode(b'\xe8'), "") + self.assertRaises(UnicodeDecodeError, decoder.decode, b'', final=True) + + decoder.setstate((b'', 0)) + self.assertEquals(decoder.decode(b'\n'), "\n") + self.assertEquals(decoder.decode(b'\r'), "") + self.assertEquals(decoder.decode(b'', final=True), "\n") + self.assertEquals(decoder.decode(b'\r', final=True), "\n") + + self.assertEquals(decoder.decode(b'\r'), "") + self.assertEquals(decoder.decode(b'a'), "\na") + + self.assertEquals(decoder.decode(b'\r\r\n'), "\n\n") + self.assertEquals(decoder.decode(b'\r'), "") + self.assertEquals(decoder.decode(b'\r'), "\n") + self.assertEquals(decoder.decode(b'\na'), "\na") + + self.assertEquals(decoder.decode(b'\xe8\xa2\x88\r\n'), "\u8888\n") + self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), "\u8888") + self.assertEquals(decoder.decode(b'\n'), "\n") + self.assertEquals(decoder.decode(b'\xe8\xa2\x88\r'), "\u8888") + self.assertEquals(decoder.decode(b'\n'), "\n") # XXX Tests for open()