Index: Doc/library/stdtypes.rst =================================================================== --- Doc/library/stdtypes.rst (revision 59453) +++ Doc/library/stdtypes.rst (working copy) @@ -1937,7 +1937,16 @@ right. However, using :meth:`seek` to reposition the file to an absolute position will flush the read-ahead buffer. +.. method:: file.__reversed__() + Return a new iterator that yields the lines of the file in reverse order + without reading the entire file into memory first, which makes it useful + for scanning backwards through large files. Normally called via the + :func:`reversed` builtin, as in ``for line in reversed(f): print(line)``. + Note that this changes the current position of the underlying file + object, so you should not interleave reverse and forward iteration over + the same file object. + .. method:: file.read([size]) Read at most *size* bytes from the file (less if the read hits EOF before Index: Lib/io.py =================================================================== --- Lib/io.py (revision 59453) +++ Lib/io.py (working copy) @@ -1136,6 +1136,126 @@ )[self.seennl] +class TextIOReverseIterator: + """Line-based reverse iterator wrapper for IOBase objects. + + This class is used to implement TextIOWrapper.__reversed__(). + It searches backwards for the encoded line terminator, which works for + UTF-8 but not for encodings where the encoding of one character can + appear as a substring of the encoding of another, longer one. + """ + + # XXX Should we check for encodings that are known to work? Currently + # we would return incorrect results for a codec where, say, the encoding + # of newline could appear as a substring of the encoding for some other + # character or where the codec can have a non-default state at the start + # of a line (do such encodings exist?).
+    def __init__(self, buffer, encoding, newline=None,
+                 buffer_size=DEFAULT_BUFFER_SIZE, wrapped_file=None):
+        if not isinstance(encoding, str):
+            raise ValueError("invalid encoding: %r" % encoding)
+        buffer.seek(0, 2)  # start at EOF; lines are produced back to front
+        self.buffer = buffer
+        self._wrapped_file = wrapped_file  # Keep ref to avoid premature close
+        self._bufsize = buffer_size  # block size used for backward reads
+        self._encoding = encoding
+        self._translate_newlines = newline is None
+        if newline:  # fixed terminator given: universal search is not used
+            self._enc_cr = self._enc_lf = None
+        else:
+            self._enc_cr = '\r'.encode(encoding)
+            self._enc_lf = '\n'.encode(encoding)
+            if self._enc_cr + self._enc_lf != '\r\n'.encode(encoding):
+                raise ValueError('unsupported encoding: %r' % encoding)
+        self._newline = newline.encode(encoding) if newline else None
+        self._limpos = buffer.tell()  # file offset just past the next line
+        self._bufpos = self._limpos  # file offset of self._pending[0]
+        self._pending = b''  # file bytes read so far, starting at _bufpos
+
+    def _extend_buffer_backwards(self):  # prepend the preceding file block
+        (bufpos, limpos, bufsize) = (self._bufpos, self._limpos, self._bufsize)
+        newpos = (bufpos // bufsize) * bufsize  # round down to a block start
+        if newpos == bufpos:  # already on a boundary: take one whole block
+            newpos -= bufsize
+        assert newpos >= 0
+        nbytes = bufpos - newpos
+        assert nbytes != 0
+
+        self.buffer.seek(newpos, 0)
+        seekpos = self.buffer.tell()  # bound so the message below can use it
+        assert seekpos == newpos, \
+            'seek() arrived at %r (expected %r)' % (seekpos, newpos)
+        newbuf = self.buffer.read(nbytes)
+        assert len(newbuf) == nbytes, 'Unexpected EOF'
+
+        if limpos > bufpos:  # keep only the old bytes that precede limpos
+            newbuf += self._pending[:limpos - bufpos]
+        (self._pending, self._bufpos) = (newbuf, newpos)
+
+    __iter__ = lambda self: self
+
+    # Look backwards for the first occurrence of \r, \n or \r\n.
+    # Return (offset, terminator) or (-1, None) if we need to read more.
+    def _find_universal_endline(self, limpos):  # limpos is a buffer offset
+        enc_cr, enc_lf = self._enc_cr, self._enc_lf
+        cr_pos = self._pending.rfind(enc_cr, 0, limpos)
+        lf_pos = self._pending.rfind(enc_lf, 0, limpos)
+        res = -1, None
+        if lf_pos != -1 and lf_pos > cr_pos:  # an LF is the closest match
+            if lf_pos > len(enc_cr) or self._bufpos == 0:  # room to see a CR
+                if cr_pos != -1 and cr_pos == lf_pos - len(enc_cr):
+                    res = cr_pos, enc_cr + enc_lf
+                else:
+                    res = lf_pos, enc_lf
+        elif cr_pos != -1:
+            res = cr_pos, enc_cr
+        return res
+
+    def _getbytes(self):
+        is_firstline, limpos = self._pending == b'', self._limpos
+        newline, line_end = self._newline, getattr(self, '_line_end', None)
+
+        if limpos is None:
+            raise StopIteration
+        assert limpos >= 0
+
+        # limpos is the file offset one past the end of the line we're about
+        # to return: EOF on the first call, afterwards the start of the
+        # terminator found by the previous call (remembered in line_end).
+        while True:
+            lim_offset = limpos - self._bufpos  # file offset -> buf offset
+            if newline is None:
+                offset, ending = self._find_universal_endline(lim_offset)
+            else:
+                offset = self._pending.rfind(newline, 0, lim_offset)
+                ending = newline
+
+            if offset != -1:
+                self._limpos, self._line_end = self._bufpos + offset, ending
+                line_offset = offset + len(ending)
+                break
+
+            if self._bufpos > 0:
+                self._extend_buffer_backwards()
+            else:
+                self._limpos = None  # start of file reached: this is the end
+                line_offset = 0
+                break
+
+        # We treat the first returned line specially, as it may be missing
+        # the endline terminator. Also we avoid returning an initial empty
+        # line for files with a normal terminating endline. Later lines end
+        # with '\n' when translating, else with the terminator after them.
+        if is_firstline:
+            return self._pending[line_offset:] or self._getbytes()
+        else:
+            tail = self._enc_lf if self._translate_newlines else line_end
+            return self._pending[line_offset:lim_offset] + tail
+
+    def __next__(self):
+        return self._getbytes().decode(self._encoding)
+
 class TextIOWrapper(TextIOBase):
     """Buffered text stream.
 
@@ -1382,6 +1502,10 @@
         self._pending = res[n:]
         return res[:n]
 
+    def __reversed__(self):
+        return TextIOReverseIterator(self.buffer, self._encoding, self._readnl,
+                                     wrapped_file=self)  # iterator refs us
+
     def __next__(self):
         self._telling = False
         line = self.readline()
Index: Lib/test/test_io.py
===================================================================
--- Lib/test/test_io.py (revision 59453)
+++ Lib/test/test_io.py (working copy)
@@ -621,6 +621,88 @@
             self.assertEquals(got_line, exp_line)
         self.assertEquals(len(got_lines), len(exp_lines))
 
+    def testReversedLines(self):  # reversed(f) must mirror forward iteration
+        texts = [
+            "a\nbb\nccc\n\neee\n"
+            "AAA\nBBB\nCCC\rDDD\rEEE\r\nFFF\r\nGGG",
+            "",
+            "foo",
+            "\nfoo",
+            "\rbar",
+            "\r\nbaz",
+            "foo\n",
+            "\n\n",
+            ("\0\x0f\xff\u0fff\uffff\U000fffff\U0010ffff"*3 + "\n") * 3 + "\n"
+        ]
+
+        # Check the line splitting logic against plain forward iteration
+        encodings = [ "utf-8", "latin-1" ]
+        newlines = [ None, "\n", "\r", "\r\n" ]
+        for text in texts:
+            for encoding in encodings:
+                for newline in newlines:
+                    for bufsize in None, 1, 2, 3, 5, 10:  # None => reversed()
+                        def make_textio():
+                            bufio = io.BytesIO(text.encode(encoding))
+                            return io.TextIOWrapper(bufio, encoding=encoding,
+                                                    newline=newline)
+                        try:
+                            textio = make_textio()
+                        except UnicodeEncodeError:
+                            # Skip non-ascii tests for latin-1
+                            continue
+                        if bufsize is None:
+                            revio = reversed(textio)
+                        else:
+                            revio = io.TextIOReverseIterator(
+                                textio.buffer, encoding, newline, bufsize)
+                        params = dict(text=text, enc=encoding,
+                                      nl=newline, bufsize=bufsize)
+                        got = list(revio)
+                        exp = list(reversed(list(make_textio())))
+                        self.assertEquals((got, params), (exp, params))
+
+        # Check the normal i/o path (a real file opened through io.open)
+        path = test_support.TESTFN
+        f = io.open(path, "w+")
+        f.write(texts[0])
+        f.seek(0, 0)
+        lines = f.readlines()
+        f.close()
+        revlines = list(reversed(io.open(path)))  # not closed explicitly
+        self.assertEquals(revlines, list(reversed(lines)))
+
+    def testReversedLinesOpcount(self):  # checks the number of raw reads
+        import math
+
+        class LoggingRaw (io.RawIOBase):  # BytesIO-backed raw; counts calls
+            def __init__(self, data):
+                self._bytes = io.BytesIO(data)
+                self._nseeks = self._nreads = 0
+
+            def readinto(self, b):
+                res = self._bytes.readinto(b)
+                # Every call is tallied; the test asserts on this counter.
+                self._nreads += 1
+                return res
+
+            def seek(self, pos, whence):
+                res = self._bytes.seek(pos, whence)
+                # Seeks are tallied too, but the test does not assert on them.
+                self._nseeks += 1
+                return res
+
+            readable = lambda self: True
+
+        lines = [ "x" * 80 + "\n" ] * 1000 + [ "l" * 1000 ]
+        encoding = "ascii"
+        raw = LoggingRaw(b"".join(line.encode(encoding) for line in lines))
+        textio = io.TextIOWrapper(io.BufferedReader(raw), encoding)
+        self.assertEqual(list(reversed(textio)), list(reversed(lines)))
+        exp_nreads = math.ceil(sum(map(len, lines)) / io.DEFAULT_BUFFER_SIZE)
+        self.assertEqual(raw._nreads, exp_nreads)
+        # i.e. exactly one raw read per DEFAULT_BUFFER_SIZE block of data.
+
     def testNewlinesInput(self):
         testdata = b"AAA\nBBB\nCCC\rDDD\rEEE\r\nFFF\r\nGGG"
         normalized = testdata.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
@@ -792,7 +874,11 @@
         while f.readline():
             f.tell()
         t4 = timer()
+        for line in reversed(f):
+            pass
+        t5 = timer()
         f.close()
+
         if test_support.verbose:
             print("\nTiming test: %d lines of %d characters (%d bytes)" %
                   (nlines, nchars, nbytes))
@@ -801,6 +887,7 @@
             print("Reading using iteration: %6.3f seconds" % (t2-t1))
             print("Reading using readline(): %6.3f seconds" % (t3-t2))
             print("Using readline()+tell(): %6.3f seconds" % (t4-t3))
+            print("Using reversed(): %6.3f seconds" % (t5-t4))
 
     def testReadOneByOne(self):
         txt = io.TextIOWrapper(io.BytesIO(b"AA\r\nBB"))
Index: Lib/test/output/test_profile
===================================================================
--- Lib/test/output/test_profile (revision 59453)
+++ Lib/test/output/test_profile (working copy)
@@ -10,7 +10,7 @@
 12 0.000 0.000 0.012 0.001 :0(hasattr)
 1 0.000 0.000 0.000 0.000 :0(setprofile)
 1 0.000 0.000 1.000 1.000 :1()
- 2 0.000 0.000 0.000 0.000 io.py:1211(flush)
+ 2 0.000 0.000 0.000 0.000 io.py:1331(flush)
 1 0.000 0.000 0.000 0.000 io.py:269(flush)
 1 0.000 0.000 0.000
0.000 io.py:656(closed) 1 0.000 0.000 0.000 0.000 io.py:874(flush) @@ -33,11 +33,11 @@ :0(append) -> :0(exc_info) -> :0(exec) -> :1()(1) 1.000 - io.py:1211(flush)(2) 0.000 + io.py:1331(flush)(2) 0.000 :0(hasattr) -> test_profile.py:115(__getattr__)(12) 0.028 :0(setprofile) -> :1() -> test_profile.py:30(testfunc)(1) 1.000 -io.py:1211(flush) -> io.py:269(flush)(1) 0.000 +io.py:1331(flush) -> io.py:269(flush)(1) 0.000 io.py:874(flush)(1) 0.000 io.py:269(flush) -> io.py:656(closed) -> @@ -74,10 +74,10 @@ test_profile.py:93(helper2)(8) 0.400 :0(setprofile) <- profile:0(testfunc())(1) 1.000 :1() <- :0(exec)(1) 1.000 -io.py:1211(flush) <- :0(exec)(2) 1.000 -io.py:269(flush) <- io.py:1211(flush)(1) 0.000 +io.py:1331(flush) <- :0(exec)(2) 1.000 +io.py:269(flush) <- io.py:1331(flush)(1) 0.000 io.py:656(closed) <- io.py:874(flush)(1) 0.000 -io.py:874(flush) <- io.py:1211(flush)(1) 0.000 +io.py:874(flush) <- io.py:1331(flush)(1) 0.000 profile:0(profiler) <- profile:0(testfunc()) <- profile:0(profiler)(1) 0.000 test_profile.py:103(subhelper) <- test_profile.py:93(helper2)(8) 0.400 Index: Lib/test/output/test_cProfile =================================================================== --- Lib/test/output/test_cProfile (revision 59453) +++ Lib/test/output/test_cProfile (working copy) @@ -5,7 +5,7 @@ ncalls tottime percall cumtime percall filename:lineno(function) 1 0.000 0.000 1.000 1.000 :1() - 2 0.000 0.000 0.000 0.000 io.py:1211(flush) + 2 0.000 0.000 0.000 0.000 io.py:1331(flush) 1 0.000 0.000 0.000 0.000 io.py:269(flush) 1 0.000 0.000 0.000 0.000 io.py:656(closed) 1 0.000 0.000 0.000 0.000 io.py:874(flush) @@ -30,7 +30,7 @@ Function called... 
ncalls tottime cumtime :1() -> 1 0.270 1.000 test_cProfile.py:30(testfunc) -io.py:1211(flush) -> 1 0.000 0.000 io.py:269(flush) +io.py:1331(flush) -> 1 0.000 0.000 io.py:269(flush) 1 0.000 0.000 io.py:874(flush) io.py:269(flush) -> io.py:656(closed) -> @@ -53,7 +53,7 @@ test_cProfile.py:93(helper2) -> 8 0.064 0.080 test_cProfile.py:103(subhelper) 8 0.000 0.008 {hasattr} {exec} -> 1 0.000 1.000 :1() - 2 0.000 0.000 io.py:1211(flush) + 2 0.000 0.000 io.py:1331(flush) {hasattr} -> 12 0.012 0.012 test_cProfile.py:115(__getattr__) {method 'append' of 'list' objects} -> {method 'disable' of '_lsprof.Profiler' objects} -> @@ -65,10 +65,10 @@ Function was called by... ncalls tottime cumtime :1() <- 1 0.000 1.000 {exec} -io.py:1211(flush) <- 2 0.000 0.000 {exec} -io.py:269(flush) <- 1 0.000 0.000 io.py:1211(flush) +io.py:1331(flush) <- 2 0.000 0.000 {exec} +io.py:269(flush) <- 1 0.000 0.000 io.py:1331(flush) io.py:656(closed) <- 1 0.000 0.000 io.py:874(flush) -io.py:874(flush) <- 1 0.000 0.000 io.py:1211(flush) +io.py:874(flush) <- 1 0.000 0.000 io.py:1331(flush) test_cProfile.py:103(subhelper) <- 8 0.064 0.080 test_cProfile.py:93(helper2) test_cProfile.py:115(__getattr__) <- 16 0.016 0.016 test_cProfile.py:103(subhelper) 12 0.012 0.012 {hasattr}