Index: Lib/io.py
===================================================================
--- Lib/io.py (revision 58916)
+++ Lib/io.py (working copy)
@@ -1039,6 +1039,94 @@
         return None
 
 
+class IncrementalNewlineDecoder(codecs.IncrementalDecoder):
+    """Incremental decoder wrapper that records and translates newlines.
+
+    A carriage return ending a decoded chunk is held back until the
+    next call, so a CR LF pair is never returned in two pieces.  If
+    translate is true, CR LF and lone CR are returned as LF.
+    """
+
+    def __init__(self, decoder, translate, errors='strict'):
+        codecs.IncrementalDecoder.__init__(self, errors=errors)
+        # b'\r' while a trailing CR is held back, b'' otherwise; this is
+        # only a marker and is never fed back to the wrapped decoder
+        self.buffer = b''
+        self.translate = translate
+        self.decoder = decoder
+        self.seennl = 0
+
+    def decode(self, input, final=False):
+        """Decode input; return text with the pending CR re-attached."""
+        output = self.decoder.decode(input, final=final)
+
+        # Re-attach the \r held back by the previous pass.  It is added
+        # as text: pushing b'\r' through the wrapped decoder would put it
+        # behind any incomplete multi-byte sequence the decoder holds.
+        if self.buffer and (output or final):
+            output = "\r" + output
+            self.buffer = b''
+
+        # retain last \r even when not translating data:
+        # then readline() is sure to get \r\n in one pass
+        if output.endswith("\r") and not final:
+            output = output[:-1]
+            self.buffer = b'\r'
+
+        # Record which newlines are read
+        crlf = output.count('\r\n')
+        cr = output.count('\r') - crlf
+        lf = output.count('\n') - crlf
+        self.seennl |= (lf and self._LF) | (cr and self._CR) \
+                    | (crlf and self._CRLF)
+
+        if self.translate:
+            if crlf:
+                output = output.replace("\r\n", "\n")
+            if cr:
+                output = output.replace("\r", "\n")
+
+        return output
+
+    def getstate(self):
+        """Return the wrapped decoder's state, plus the held-back CR."""
+        buf, flag = self.decoder.getstate()
+        return buf + self.buffer, flag
+
+    def setstate(self, state):
+        """Restore a state tuple as returned by getstate()."""
+        buf, flag = state
+        if buf.endswith(b'\r'):
+            self.buffer = b'\r'
+            buf = buf[:-1]
+        else:
+            self.buffer = b''
+        self.decoder.setstate((buf, flag))
+
+    def reset(self):
+        """Drop any held-back CR and reset the wrapped decoder."""
+        self.buffer = b''
+        self.decoder.reset()
+
+    # bit flags combined in self.seennl
+    _LF = 1
+    _CR = 2
+    _CRLF = 4
+
+    @property
+    def newlines(self):
+        """Line endings seen so far: None, a string, or a tuple."""
+        return (None,
+                "\n",
+                "\r",
+                ("\r", "\n"),
+                "\r\n",
+                ("\n", "\r\n"),
+                ("\r", "\r\n"),
+                ("\r", "\n", "\r\n")
+               )[self.seennl]
+
+
 class TextIOWrapper(TextIOBase):
 
     """Buffered text stream.
@@ -1075,7 +1147,6 @@ self._readnl = newline self._writetranslate = newline != '' self._writenl = newline or os.linesep - self._seennl = 0 self._decoder = None self._pending = "" self._snapshot = None @@ -1122,6 +1193,7 @@ if not isinstance(s, str): raise TypeError("can't write %s to text stream" % s.__class__.__name__) + length = len(s) haslf = "\n" in s if haslf and self._writetranslate and self._writenl != "\n": s = s.replace("\n", self._writenl) @@ -1130,15 +1202,20 @@ self.buffer.write(b) if haslf and self.isatty(): self.flush() - self._snapshot = self._decoder = None - return len(s) + self._snapshot = None + if self._decoder: + self._decoder.reset() + return length def _get_decoder(self): make_decoder = codecs.getincrementaldecoder(self._encoding) if make_decoder is None: raise IOError("Can't find an incremental decoder for encoding %s" % self._encoding) - decoder = self._decoder = make_decoder() # XXX: errors + decoder = make_decoder() # XXX: errors + if self._readuniversal: + decoder = IncrementalNewlineDecoder(decoder, self._readtranslate) + self._decoder = decoder return decoder def _read_chunk(self): @@ -1198,7 +1275,7 @@ decoder_buffer, decoder_state = decoder.getstate() return self._encode_decoder_state( decoder_state, - position + (i+1) - len(decoder_buffer)) + position + (i+1) - len(decoder_buffer) - (n-needed)) raise IOError("Can't reconstruct logical file position") finally: decoder.setstate(saved_state) @@ -1218,7 +1295,8 @@ pos = self.buffer.seek(0, 2) self._snapshot = None self._pending = "" - self._decoder = None + if self._decoder: + self._decoder.reset() return pos if whence != 0: raise ValueError("Invalid whence (%r, should be 0, 1 or 2)" % @@ -1232,7 +1310,8 @@ self.buffer.seek(pos) self._snapshot = None self._pending = "" - self._decoder = None + if self._decoder: + self._decoder.reset() return pos decoder = self._decoder or self._get_decoder() decoder.set_state(("", ds)) @@ -1251,7 +1330,7 @@ res += decoder.decode(self.buffer.read(), 
True) self._pending = "" self._snapshot = None - return self._replacenl(res) + return res else: while len(res) < n: readahead, pending = self._read_chunk() @@ -1259,7 +1338,7 @@ if not readahead: break self._pending = res[n:] - return self._replacenl(res[:n]) + return res[:n] def __next__(self): self._telling = False @@ -1283,62 +1362,55 @@ line = self._pending start = 0 - cr_eof = False decoder = self._decoder or self._get_decoder() pos = endpos = None - ending = None while True: - if self._readuniversal: + if self._readtranslate: + # Newlines are already translated, only search for \n + pos = line.find('\n', start) + if pos >= 0: + endpos = pos + 1 + break + else: + start = len(line) + + elif self._readuniversal: # Universal newline search. Find any of \r, \r\n, \n + # The decoder ensures that \r\n are not split in two pieces # In C we'd look for these in parallel of course. nlpos = line.find("\n", start) crpos = line.find("\r", start) if crpos == -1: if nlpos == -1: + # Nothing found start = len(line) else: # Found \n - pos = nlpos - endpos = pos + 1 - ending = self._LF + endpos = nlpos + 1 break elif nlpos == -1: - if crpos == len(line) - 1: - # Found \r at end of buffer, must keep reading - start = crpos - cr_eof = True - else: - # Found lone \r - ending = self._CR - pos = crpos - endpos = pos + 1 - break + # Found lone \r + endpos = crpos + 1 + break elif nlpos < crpos: # Found \n - pos = nlpos - endpos = pos + 1 - ending = self._LF + endpos = nlpos + 1 break elif nlpos == crpos + 1: # Found \r\n - ending = self._CRLF - pos = crpos - endpos = pos + 2 + endpos = crpos + 2 break else: # Found \r - pos = crpos - endpos = pos + 1 - ending = self._CR + endpos = crpos + 1 break else: # non-universal pos = line.find(self._readnl) if pos >= 0: - endpos = pos+len(self._readnl) - ending = self._nlflag(self._readnl) + endpos = pos + len(self._readnl) break # No line ending seen yet - get more data @@ -1354,66 +1426,15 @@ # end of file self._pending = '' self._snapshot = 
None - if cr_eof: - self._seennl |= self._CR - return line[:-1] + '\n' - else: - return line + return line self._pending = line[endpos:] - if self._readtranslate: - self._seennl |= ending - if ending != self._LF: - return line[:pos] + '\n' - else: - return line[:endpos] - else: - return line[:endpos] + return line[:endpos] - def _replacenl(self, data): - # Replace newlines in data as needed and record that they have - # been seen. - if not self._readtranslate: - return data - if self._readuniversal: - crlf = data.count('\r\n') - cr = data.count('\r') - crlf - lf = data.count('\n') - crlf - self._seennl |= (lf and self._LF) | (cr and self._CR) \ - | (crlf and self._CRLF) - if crlf: - data = data.replace("\r\n", "\n") - if cr: - data = data.replace("\r", "\n") - elif self._readnl == '\n': - # Only need to detect if \n was seen. - if data.count('\n'): - self._seennl |= self._LF - else: - newdata = data.replace(self._readnl, '\n') - if newdata is not data: - self._seennl |= self._nlflag(self._readnl) - data = newdata - return data - - _LF = 1 - _CR = 2 - _CRLF = 4 @property def newlines(self): - return (None, - "\n", - "\r", - ("\r", "\n"), - "\r\n", - ("\n", "\r\n"), - ("\r", "\r\n"), - ("\r", "\n", "\r\n") - )[self._seennl] + return self._decoder.newlines if self._decoder else None - def _nlflag(self, nlstr): - return [None, "\n", "\r", None, "\r\n"].index(nlstr) - class StringIO(TextIOWrapper): # XXX This is really slow, but fully functional Index: Lib/test/test_io.py =================================================================== --- Lib/test/test_io.py (revision 58916) +++ Lib/test/test_io.py (working copy) @@ -489,6 +489,10 @@ class TextIOWrapperTest(unittest.TestCase): + def setUp(self): + self.testdata = b"AAA\r\nBBB\rCCC\r\nDDD\nEEE\r\n" + self.normalized = b"AAA\nBBB\nCCC\nDDD\nEEE\n".decode("ASCII") + def tearDown(self): test_support.unlink(test_support.TESTFN) @@ -745,7 +749,115 @@ print("Reading using readline(): %6.3f seconds" % (t3-t2)) print("Using 
readline()+tell(): %6.3f seconds" % (t4-t3)) + def testReadOneByOne(self): + txt = io.TextIOWrapper(io.BytesIO(b"AA\r\nBB")) + reads = "" + while True: + c = txt.read(1) + if not c: + break + reads += c + self.assertEquals(reads, "AA\nBB") + # read in amounts equal to TextIOWrapper._CHUNK_SIZE which is 128. + def testReadByChunk(self): + # make sure "\r\n" straddles 128 char boundary. + txt = io.TextIOWrapper(io.BytesIO(b"A" * 127 + b"\r\nB")) + reads = "" + while True: + c = txt.read(128) + if not c: + break + reads += c + self.assertEquals(reads, "A"*127+"\nB") + + def test_issue1395_1(self): + txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ASCII") + + # read one char at a time + reads = "" + while True: + c = txt.read(1) + if not c: + break + reads += c + self.assertEquals(reads, self.normalized) + + def test_issue1395_2(self): + txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ASCII") + txt._CHUNK_SIZE = 4 + + reads = "" + while True: + c = txt.read(4) + if not c: + break + reads += c + self.assertEquals(reads, self.normalized) + + def test_issue1395_3(self): + txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ASCII") + txt._CHUNK_SIZE = 4 + + reads = txt.read(4) + reads += txt.read(4) + reads += txt.readline() + reads += txt.readline() + reads += txt.readline() + self.assertEquals(reads, self.normalized) + + def test_issue1395_4(self): + txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ASCII") + txt._CHUNK_SIZE = 4 + + reads = txt.read(4) + reads += txt.read() + self.assertEquals(reads, self.normalized) + + def test_issue1395_5(self): + txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ASCII") + txt._CHUNK_SIZE = 4 + + reads = txt.read(4) + pos = txt.tell() + txt.seek(0) + txt.seek(pos) + self.assertEquals(txt.read(4), "BBB\n") + + def test_newline_decoder(self): + import codecs + decoder = codecs.getincrementaldecoder("utf-8")() + decoder = io.IncrementalNewlineDecoder(decoder, translate=True) + + 
self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), "\u8888") + + self.assertEquals(decoder.decode(b'\xe8'), "") + self.assertEquals(decoder.decode(b'\xa2'), "") + self.assertEquals(decoder.decode(b'\x88'), "\u8888") + + self.assertEquals(decoder.decode(b'\xe8'), "") + self.assertRaises(UnicodeDecodeError, decoder.decode, b'', final=True) + + decoder.setstate((b'', 0)) + self.assertEquals(decoder.decode(b'\n'), "\n") + self.assertEquals(decoder.decode(b'\r'), "") + self.assertEquals(decoder.decode(b'', final=True), "\n") + self.assertEquals(decoder.decode(b'\r', final=True), "\n") + + self.assertEquals(decoder.decode(b'\r'), "") + self.assertEquals(decoder.decode(b'a'), "\na") + + self.assertEquals(decoder.decode(b'\r\r\n'), "\n\n") + self.assertEquals(decoder.decode(b'\r'), "") + self.assertEquals(decoder.decode(b'\r'), "\n") + self.assertEquals(decoder.decode(b'\na'), "\na") + + self.assertEquals(decoder.decode(b'\xe8\xa2\x88\r\n'), "\u8888\n") + self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), "\u8888") + self.assertEquals(decoder.decode(b'\n'), "\n") + self.assertEquals(decoder.decode(b'\xe8\xa2\x88\r'), "\u8888") + self.assertEquals(decoder.decode(b'\n'), "\n") + # XXX Tests for open() class MiscIOTest(unittest.TestCase):