Index: Doc/reference/datamodel.rst =================================================================== --- Doc/reference/datamodel.rst (revision 59439) +++ Doc/reference/datamodel.rst (working copy) @@ -1662,12 +1662,27 @@ Iterator objects also need to implement this method; they are required to return themselves. For more information on iterator objects, see :ref:`typeiter`. +.. method:: object.__reversed__(self) + + Called (if present) by the :func:`reversed()` builtin to implement + reverse iteration. It should return a new iterator object that iterates + over all the objects in the container in reverse order. + + If the :meth:`__reversed__()` method is not provided, the + :func:`reversed()` builtin will fall back to using the sequence protocol + (:meth:`__len__()` and :meth:`__getitem__()`). Objects should normally + only provide :meth:`__reversed__()` if they do not support the sequence + protocol and an efficient implementation of reverse iteration is possible. + The membership test operators (:keyword:`in` and :keyword:`not in`) are normally implemented as an iteration through a sequence. However, container objects can supply the following special method with a more efficient implementation, which also does not require the object be a sequence. + + + .. method:: object.__contains__(self, item) Called to implement membership test operators. Should return true if *item* is Index: Doc/library/stdtypes.rst =================================================================== --- Doc/library/stdtypes.rst (revision 59439) +++ Doc/library/stdtypes.rst (working copy) @@ -1937,7 +1937,16 @@ right. However, using :meth:`seek` to reposition the file to an absolute position will flush the read-ahead buffer. +.. method:: file.__reversed__() + Return a new iterator that returns lines in reverse order (but without + reading the entire file into memory first). Normally called via the + :func:`reversed()` builtin, as in ``for line in reversed(f): print(line)``. + Useful for scanning backwards through large files without reading the + entire file first. Note that this changes the current position of the + underlying file object, so you should not interleave use of reverse and + forward iteration over the same file object. + .. method:: file.read([size]) Read at most *size* bytes from the file (less if the read hits EOF before Index: Doc/library/functions.rst =================================================================== --- Doc/library/functions.rst (revision 59439) +++ Doc/library/functions.rst (working copy) @@ -869,8 +869,9 @@ .. function:: reversed(seq) - Return a reverse :term:`iterator`. *seq* must be an object which supports - the sequence protocol (the :meth:`__len__` method and the :meth:`__getitem__` + Return a reverse :term:`iterator`. *seq* must be an object which has + a :meth:`__reversed__` method [#]_ or supports the sequence protocol + (the :meth:`__len__` method and the :meth:`__getitem__` method with integer arguments starting at ``0``). @@ -1099,6 +1100,8 @@ any I/O has been performed, and there's no reliable way to determine whether this is the case. +.. [#] See :ref:`sequence-types` + .. [#] In the current implementation, local variable bindings cannot normally be affected this way, but variables retrieved from other scopes (such as modules) can be. This may change. Index: Lib/io.py =================================================================== --- Lib/io.py (revision 59439) +++ Lib/io.py (working copy) @@ -1136,6 +1136,125 @@ )[self.seennl] +class TextIOReverseIterator: + """Line-based reverse iterator wrapper for IOBase objects. + + This class is used to implement TextIOWrapper.__reversed__(). + It searches backwards for encoded line terminator, which + works for UTF-8 but not for encodings where one character encoding + can be a substring of another longer one. + """ + + # XXX Should we check for encodings that are known to work? Currently + # we would return incorrect results for a codec where, say, the encoding + # of newline could appear as a substring of the encoding for some other + # character or where the codec can have a non-default state at the start + # of a line (do such encodings exist?). + + def __init__(self, buffer, encoding, newline=None, + buffer_size=DEFAULT_BUFFER_SIZE): + if not isinstance(encoding, str): + raise ValueError("invalid encoding: %r" % encoding) + buffer.seek(0, 2) + self.buffer = buffer + self._bufsize = buffer_size + self._encoding = encoding + self._translate_newlines = newline is None + if newline: + self._enc_cr = self._enc_lf = None + else: + self._enc_cr = '\r'.encode(encoding) + self._enc_lf = '\n'.encode(encoding) + if self._enc_cr + self._enc_lf != '\r\n'.encode(encoding): + raise ValueError('unsupported encoding: %r' % encoding) + self._newline = newline.encode(encoding) if newline else None + self._limpos = buffer.tell() + self._bufpos = self._limpos + self._pending = b'' + + def _extend_buffer_backwards(self): + (bufpos, limpos, bufsize) = (self._bufpos, self._limpos, self._bufsize) + + newpos = (bufpos // bufsize) * bufsize + if newpos == bufpos: + newpos -= bufsize + assert newpos >= 0 + nbytes = bufpos - newpos + assert nbytes != 0 + + self.buffer.seek(newpos, 0) + assert self.buffer.tell() == newpos, \ + 'seek() arrived at %r (expected %r)' % (seekpos, newpos) + newbuf = self.buffer.read(nbytes) + assert len(newbuf) == nbytes, 'Unexpected EOF' + + if limpos > bufpos: + newbuf += self._pending[:limpos - bufpos] + (self._pending, self._bufpos) = (newbuf, newpos) + + __iter__ = lambda self: self + + # Look backwards for the first occurrence of \r, \n or \r\n. + # Return (offset, terminator) or (-1, None) if we need to read more. + def _find_universal_endline(self, limpos): + enc_cr, enc_lf = self._enc_cr, self._enc_lf + cr_pos = self._pending.rfind(enc_cr, 0, limpos) + lf_pos = self._pending.rfind(enc_lf, 0, limpos) + res = -1, None + if lf_pos != -1 and lf_pos > cr_pos: + if lf_pos > len(enc_cr) or self._bufpos == 0: + if cr_pos != -1 and cr_pos == lf_pos - len(enc_lf): + res = cr_pos, enc_cr + enc_lf + else: + res = lf_pos, enc_lf + elif cr_pos != -1: + res = cr_pos, enc_cr + return res + + def _getbytes(self): + is_firstline = self._pending == b'' + limpos, newline = self._limpos, self._newline + + if limpos is None: + raise StopIteration + assert limpos >= 0 + + # limpos points one character past the end of the line we're about to + # return - e.g "abc\ndef" + # ^ + while True: + lim_offset = limpos - self._bufpos # file offset -> buf offset + if newline is None: + offset, ending = self._find_universal_endline(lim_offset) + else: + offset = self._pending.rfind(newline, 0, lim_offset) + ending = newline + + if offset != -1: + self._limpos = self._bufpos + offset + line_offset = offset + len(ending) + break + + if self._bufpos > 0: + self._extend_buffer_backwards() + else: + self._limpos = None + line_offset = 0 + break + + # We treat the first returned line specially, as it may be missing + # the endline terminator. Also we avoid returning an initial empty + # line for files with a normal terminating endline. + # + if is_firstline: + return self._pending[line_offset:] or self._getbytes() + else: + ending_to_add = self._enc_lf if self._translate_newlines else ending + return self._pending[line_offset:lim_offset] + ending_to_add + + def __next__(self): + return self._getbytes().decode(self._encoding) + class TextIOWrapper(TextIOBase): """Buffered text stream. @@ -1382,6 +1501,9 @@ self._pending = res[n:] return res[:n] + def __reversed__(self): + return TextIOReverseIterator(self.buffer, self._encoding, self._readnl) + def __next__(self): self._telling = False line = self.readline() Index: Lib/test/test_io.py =================================================================== --- Lib/test/test_io.py (revision 59439) +++ Lib/test/test_io.py (working copy) @@ -621,6 +621,76 @@ self.assertEquals(got_line, exp_line) self.assertEquals(len(got_lines), len(exp_lines)) + def testReversedLines(self): + texts = [ + "a\nbb\nccc\n\neee\n" + "AAA\nBBB\nCCC\rDDD\rEEE\r\nFFF\r\nGGG", + "", + "foo", + "\nfoo", + "\rbar", + "\r\nbaz", + "foo\n", + "\n\n", + ("\0\x0f\xff\u0fff\uffff\U000fffff\U0010ffff"*3 + "\n") * 3 + "\n" + ] + encodings = [ "utf-8", "latin-1" ] + newlines = [ None, "\n", "\r", "\r\n" ] + for text in texts: + for encoding in encodings: + for newline in newlines: + for bufsize in None, 1, 2, 3, 5, 10: + def make_textio(): + bufio = io.BytesIO(text.encode(encoding)) + return io.TextIOWrapper(bufio, encoding=encoding, + newline=newline) + try: + textio = make_textio() + except UnicodeEncodeError: + # Skip non-ascii tests for latin-1 + continue + if bufsize is None: + revio = reversed(textio) + else: + revio = io.TextIOReverseIterator( + textio.buffer, encoding, newline, bufsize) + params = dict(text=text, enc=encoding, + nl=newline, bufsize=bufsize) + got = list(revio) + exp = list(reversed(list(make_textio()))) + self.assertEquals((got, params), (exp, params)) + + def testReversedLinesOpcount(self): + import math + + class LoggingRaw (io.RawIOBase): + def __init__(self, data): + self._bytes = io.BytesIO(data) + self._nseeks = self._nreads = 0 + + def readinto(self, b): + res = self._bytes.readinto(b) + #print("readinto => %r" % (res,)) + self._nreads += 1 + return res + + def seek(self, pos, whence): + res = self._bytes.seek(pos, whence) + #print("seek(%r, %r) => %r" % (pos, whence, res)) + self._nseeks += 1 + return res + + readable = lambda self: True + + lines = [ "x" * 80 + "\n" ] * 1000 + [ "l" * 1000 ] + encoding = "ascii" + raw = LoggingRaw(b"".join(line.encode(encoding) for line in lines)) + textio = io.TextIOWrapper(io.BufferedReader(raw), encoding) + self.assertEqual(list(reversed(textio)), list(reversed(lines))) + exp_nreads = math.ceil(sum(map(len, lines)) / io.DEFAULT_BUFFER_SIZE) + self.assertEqual(raw._nreads, exp_nreads) + #print("nseeks=%d nreads=%d" % (raw._nseeks, raw._nreads)) + def testNewlinesInput(self): testdata = b"AAA\nBBB\nCCC\rDDD\rEEE\r\nFFF\r\nGGG" normalized = testdata.replace(b"\r\n", b"\n").replace(b"\r", b"\n") @@ -792,7 +862,11 @@ while f.readline(): f.tell() t4 = timer() + for line in reversed(f): + pass + t5 = timer() f.close() + if test_support.verbose: print("\nTiming test: %d lines of %d characters (%d bytes)" % (nlines, nchars, nbytes)) @@ -801,6 +875,7 @@ print("Reading using iteration: %6.3f seconds" % (t2-t1)) print("Reading using readline(): %6.3f seconds" % (t3-t2)) print("Using readline()+tell(): %6.3f seconds" % (t4-t3)) + print("Using reversed(): %6.3f seconds" % (t5-t4)) def testReadOneByOne(self): txt = io.TextIOWrapper(io.BytesIO(b"AA\r\nBB")) Index: Lib/test/output/test_profile =================================================================== --- Lib/test/output/test_profile (revision 59439) +++ Lib/test/output/test_profile (working copy) @@ -10,10 +10,10 @@ 12 0.000 0.000 0.012 0.001 :0(hasattr) 1 0.000 0.000 0.000 0.000 :0(setprofile) 1 0.000 0.000 1.000 1.000 :1() - 2 0.000 0.000 0.000 0.000 io.py:1201(flush) - 1 0.000 0.000 0.000 0.000 io.py:259(flush) - 1 0.000 0.000 0.000 0.000 io.py:646(closed) - 1 0.000 0.000 0.000 0.000 io.py:864(flush) + 2 0.000 0.000 0.000 0.000 io.py:1330(flush) + 1 0.000 0.000 0.000 0.000 io.py:269(flush) + 1 0.000 0.000 0.000 0.000 io.py:656(closed) + 1 0.000 0.000 0.000 0.000 io.py:874(flush) 0 0.000 0.000 profile:0(profiler) 1 0.000 0.000 1.000 1.000 profile:0(testfunc()) 8 0.064 0.008 0.080 0.010 test_profile.py:103(subhelper) @@ -33,15 +33,15 @@ :0(append) -> :0(exc_info) -> :0(exec) -> :1()(1) 1.000 - io.py:1201(flush)(2) 0.000 + io.py:1330(flush)(2) 0.000 :0(hasattr) -> test_profile.py:115(__getattr__)(12) 0.028 :0(setprofile) -> :1() -> test_profile.py:30(testfunc)(1) 1.000 -io.py:1201(flush) -> io.py:259(flush)(1) 0.000 - io.py:864(flush)(1) 0.000 -io.py:259(flush) -> -io.py:646(closed) -> -io.py:864(flush) -> io.py:646(closed)(1) 0.000 +io.py:1330(flush) -> io.py:269(flush)(1) 0.000 + io.py:874(flush)(1) 0.000 +io.py:269(flush) -> +io.py:656(closed) -> +io.py:874(flush) -> io.py:656(closed)(1) 0.000 profile:0(profiler) -> profile:0(testfunc())(1) 1.000 profile:0(testfunc()) -> :0(exec)(1) 1.000 :0(setprofile)(1) 0.000 @@ -74,10 +74,10 @@ test_profile.py:93(helper2)(8) 0.400 :0(setprofile) <- profile:0(testfunc())(1) 1.000 :1() <- :0(exec)(1) 1.000 -io.py:1201(flush) <- :0(exec)(2) 1.000 -io.py:259(flush) <- io.py:1201(flush)(1) 0.000 -io.py:646(closed) <- io.py:864(flush)(1) 0.000 -io.py:864(flush) <- io.py:1201(flush)(1) 0.000 +io.py:1330(flush) <- :0(exec)(2) 1.000 +io.py:269(flush) <- io.py:1330(flush)(1) 0.000 +io.py:656(closed) <- io.py:874(flush)(1) 0.000 +io.py:874(flush) <- io.py:1330(flush)(1) 0.000 profile:0(profiler) <- profile:0(testfunc()) <- profile:0(profiler)(1) 0.000 test_profile.py:103(subhelper) <- test_profile.py:93(helper2)(8) 0.400 Index: Lib/test/output/test_cProfile =================================================================== --- Lib/test/output/test_cProfile (revision 59439) +++ Lib/test/output/test_cProfile (working copy) @@ -5,10 +5,10 @@ ncalls tottime percall cumtime percall filename:lineno(function) 1 0.000 0.000 1.000 1.000 :1() - 2 0.000 0.000 0.000 0.000 io.py:1201(flush) - 1 0.000 0.000 0.000 0.000 io.py:259(flush) - 1 0.000 0.000 0.000 0.000 io.py:646(closed) - 1 0.000 0.000 0.000 0.000 io.py:864(flush) + 2 0.000 0.000 0.000 0.000 io.py:1330(flush) + 1 0.000 0.000 0.000 0.000 io.py:269(flush) + 1 0.000 0.000 0.000 0.000 io.py:656(closed) + 1 0.000 0.000 0.000 0.000 io.py:874(flush) 8 0.064 0.008 0.080 0.010 test_cProfile.py:103(subhelper) 28 0.028 0.001 0.028 0.001 test_cProfile.py:115(__getattr__) 1 0.270 0.270 1.000 1.000 test_cProfile.py:30(testfunc) @@ -30,11 +30,11 @@ Function called... ncalls tottime cumtime :1() -> 1 0.270 1.000 test_cProfile.py:30(testfunc) -io.py:1201(flush) -> 1 0.000 0.000 io.py:259(flush) - 1 0.000 0.000 io.py:864(flush) -io.py:259(flush) -> -io.py:646(closed) -> -io.py:864(flush) -> 1 0.000 0.000 io.py:646(closed) +io.py:1330(flush) -> 1 0.000 0.000 io.py:269(flush) + 1 0.000 0.000 io.py:874(flush) +io.py:269(flush) -> +io.py:656(closed) -> +io.py:874(flush) -> 1 0.000 0.000 io.py:656(closed) test_cProfile.py:103(subhelper) -> 16 0.016 0.016 test_cProfile.py:115(__getattr__) test_cProfile.py:115(__getattr__) -> test_cProfile.py:30(testfunc) -> 1 0.014 0.130 test_cProfile.py:40(factorial) @@ -53,7 +53,7 @@ test_cProfile.py:93(helper2) -> 8 0.064 0.080 test_cProfile.py:103(subhelper) 8 0.000 0.008 {hasattr} {exec} -> 1 0.000 1.000 :1() - 2 0.000 0.000 io.py:1201(flush) + 2 0.000 0.000 io.py:1330(flush) {hasattr} -> 12 0.012 0.012 test_cProfile.py:115(__getattr__) {method 'append' of 'list' objects} -> {method 'disable' of '_lsprof.Profiler' objects} -> @@ -65,10 +65,10 @@ Function was called by... ncalls tottime cumtime :1() <- 1 0.000 1.000 {exec} -io.py:1201(flush) <- 2 0.000 0.000 {exec} -io.py:259(flush) <- 1 0.000 0.000 io.py:1201(flush) -io.py:646(closed) <- 1 0.000 0.000 io.py:864(flush) -io.py:864(flush) <- 1 0.000 0.000 io.py:1201(flush) +io.py:1330(flush) <- 2 0.000 0.000 {exec} +io.py:269(flush) <- 1 0.000 0.000 io.py:1330(flush) +io.py:656(closed) <- 1 0.000 0.000 io.py:874(flush) +io.py:874(flush) <- 1 0.000 0.000 io.py:1330(flush) test_cProfile.py:103(subhelper) <- 8 0.064 0.080 test_cProfile.py:93(helper2) test_cProfile.py:115(__getattr__) <- 16 0.016 0.016 test_cProfile.py:103(subhelper) 12 0.012 0.012 {hasattr}