diff -r 6a35865eded4 Lib/fileinput.py --- a/Lib/fileinput.py Tue Dec 01 00:32:49 2015 +0200 +++ b/Lib/fileinput.py Thu Dec 03 13:00:10 2015 +0200 @@ -64,13 +64,6 @@ deleted when the output file is closed. disabled when standard input is read. XXX The current implementation does not work for MS-DOS 8+3 filesystems. -Performance: this module is unfortunately one of the slower ways of -processing large numbers of input lines. Nevertheless, a significant -speed-up has been obtained by using readlines(bufsize) instead of -readline(). A new keyword argument, bufsize=N, is present on the -input() function and the FileInput() class to override the default -buffer size. - XXX Possible additions: - optional getopt argument processing @@ -207,17 +200,19 @@ class FileInput: self._files = files self._inplace = inplace self._backup = backup - self._bufsize = bufsize or DEFAULT_BUFSIZE + if bufsize: + import warnings + warnings.warn('The bufsize option is ignored now', + PendingDeprecationWarning, stacklevel=2) self._savestdout = None self._output = None self._filename = None - self._lineno = 0 + self._startlineno = 0 self._filelineno = 0 self._file = None + self._readline = self._start_readline self._isstdin = False self._backupfilename = None - self._buffer = [] - self._bufindex = 0 # restrict mode argument to reading modes if mode not in ('r', 'rU', 'U', 'rb'): raise ValueError("FileInput opening mode must be one of " @@ -242,22 +237,18 @@ class FileInput: return self def next(self): - try: - line = self._buffer[self._bufindex] - except IndexError: - pass - else: - self._bufindex += 1 - self._lineno += 1 + line = self._readline() + if line: self._filelineno += 1 return line - line = self.readline() - if not line: + if not self._file: raise StopIteration - return line + self.nextfile() + # Recursive call + return self.next() def __getitem__(self, i): - if i != self._lineno: + if i != self.lineno(): raise RuntimeError, "accessing lines out of order" try: return self.next() @@ -277,7 +268,8 @@ class FileInput: output.close() finally: file = self._file - self._file = 0 + self._file = None + self._readline = self._start_readline try: if file and not self._isstdin: file.close() @@ -289,75 +281,72 @@ class FileInput: except OSError: pass self._isstdin = False - self._buffer = [] - self._bufindex = 0 def readline(self): - try: - line = self._buffer[self._bufindex] - except IndexError: - pass - else: - self._bufindex += 1 - self._lineno += 1 + line = self._readline() + if line: self._filelineno += 1 return line if not self._file: - if not self._files: - return "" - self._filename = self._files[0] - self._files = self._files[1:] - self._filelineno = 0 - self._file = None - self._isstdin = False - self._backupfilename = 0 - if self._filename == '-': - self._filename = '' - self._file = sys.stdin - self._isstdin = True - else: - if self._inplace: - self._backupfilename = ( - self._filename + (self._backup or os.extsep+"bak")) - try: os.unlink(self._backupfilename) - except os.error: pass - # The next few lines may raise IOError - os.rename(self._filename, self._backupfilename) - self._file = open(self._backupfilename, self._mode) - try: - perm = os.fstat(self._file.fileno()).st_mode - except OSError: - self._output = open(self._filename, "w") - else: - fd = os.open(self._filename, - os.O_CREAT | os.O_WRONLY | os.O_TRUNC, - perm) - self._output = os.fdopen(fd, "w") - try: - if hasattr(os, 'chmod'): - os.chmod(self._filename, perm) - except OSError: - pass - self._savestdout = sys.stdout - sys.stdout = self._output - else: - # This may raise IOError - if self._openhook: - self._file = self._openhook(self._filename, self._mode) - else: - self._file = open(self._filename, self._mode) - self._buffer = self._file.readlines(self._bufsize) - self._bufindex = 0 - if not self._buffer: - self.nextfile() + return line + self.nextfile() # Recursive call return self.readline() + def _start_readline(self): + if not self._files: + return "" + self._filename = self._files[0] + self._files = self._files[1:] + self._startlineno = self.lineno() + self._filelineno = 0 + self._file = None + self._isstdin = False + self._backupfilename = 0 + if self._filename == '-': + self._filename = '' + self._file = sys.stdin + self._isstdin = True + else: + if self._inplace: + self._backupfilename = ( + self._filename + (self._backup or os.extsep+"bak")) + try: os.unlink(self._backupfilename) + except os.error: pass + # The next few lines may raise IOError + os.rename(self._filename, self._backupfilename) + self._file = open(self._backupfilename, self._mode) + try: + perm = os.fstat(self._file.fileno()).st_mode + except OSError: + self._output = open(self._filename, "w") + else: + fd = os.open(self._filename, + os.O_CREAT | os.O_WRONLY | os.O_TRUNC, + perm) + self._output = os.fdopen(fd, "w") + try: + if hasattr(os, 'chmod'): + os.chmod(self._filename, perm) + except OSError: + pass + self._savestdout = sys.stdout + sys.stdout = self._output + else: + # This may raise IOError + if self._openhook: + self._file = self._openhook(self._filename, self._mode) + else: + self._file = open(self._filename, self._mode) + + self._readline = self._file.readline + return self._readline() + def filename(self): return self._filename def lineno(self): - return self._lineno + return self._startlineno + self._filelineno def filelineno(self): return self._filelineno diff -r 6a35865eded4 Lib/test/test_fileinput.py --- a/Lib/test/test_fileinput.py Tue Dec 01 00:32:49 2015 +0200 +++ b/Lib/test/test_fileinput.py Thu Dec 03 13:00:10 2015 +0200 @@ -5,7 +5,7 @@ Nick Mathewson import unittest from test.test_support import verbose, TESTFN, run_unittest -from test.test_support import unlink as safe_unlink +from test.test_support import unlink as safe_unlink, check_warnings import sys, re from StringIO import StringIO from fileinput import FileInput, hook_encoded @@ -28,6 +28,45 @@ def remove_tempfiles(*names): for name in names: safe_unlink(name) +class LineReader: + + def __init__(self): + self._linesread = [] + + @property + def linesread(self): + try: + return self._linesread[:] + finally: + self._linesread = [] + + def openhook(self, filename, mode): + self.it = iter(filename.splitlines(True)) + return self + + def readline(self, size=None): + try: + line = next(self.it) + except StopIteration: + line = '' + self._linesread.append(line) + return line + + def readlines(self, hint=-1): + lines = [] + size = 0 + while True: + line = self.readline() + if not line: + return lines + lines.append(line) + size += len(line) + if size >= hint: + return lines + + def close(self): + pass + class BufferSizesTests(unittest.TestCase): def test_buffer_sizes(self): # First, run the tests with default and teeny buffer size. @@ -37,7 +76,11 @@ class BufferSizesTests(unittest.TestCase t2 = writeTmp(2, ["Line %s of file 2\n" % (i+1) for i in range(10)]) t3 = writeTmp(3, ["Line %s of file 3\n" % (i+1) for i in range(5)]) t4 = writeTmp(4, ["Line %s of file 4\n" % (i+1) for i in range(1)]) - self.buffer_size_test(t1, t2, t3, t4, bs, round) + if bs: + with check_warnings(('', PendingDeprecationWarning)): + self.buffer_size_test(t1, t2, t3, t4, bs, round) + else: + self.buffer_size_test(t1, t2, t3, t4, bs, round) finally: remove_tempfiles(t1, t2, t3, t4) @@ -228,7 +271,7 @@ class FileInputTests(unittest.TestCase): f.write('\x80') self.addCleanup(safe_unlink, TESTFN) - fi = FileInput(files=TESTFN, openhook=hook_encoded('ascii'), bufsize=8) + fi = FileInput(files=TESTFN, openhook=hook_encoded('ascii')) # The most likely failure is a UnicodeDecodeError due to the entire # file being read when it shouldn't have been. self.assertEqual(fi.readline(), u'A\n') @@ -239,6 +282,38 @@ class FileInputTests(unittest.TestCase): list(fi) fi.close() + def test_readline_buffering(self): + src = LineReader() + fi = FileInput(files=['line1\nline2', 'line3\n'], openhook=src.openhook) + self.assertEqual(src.linesread, []) + self.assertEqual(fi.readline(), 'line1\n') + self.assertEqual(src.linesread, ['line1\n']) + self.assertEqual(fi.readline(), 'line2') + self.assertEqual(src.linesread, ['line2']) + self.assertEqual(fi.readline(), 'line3\n') + self.assertEqual(src.linesread, ['', 'line3\n']) + self.assertEqual(fi.readline(), '') + self.assertEqual(src.linesread, ['']) + self.assertEqual(fi.readline(), '') + self.assertEqual(src.linesread, []) + fi.close() + + def test_iteration_buffering(self): + src = LineReader() + fi = FileInput(files=['line1\nline2', 'line3\n'], openhook=src.openhook) + self.assertEqual(src.linesread, []) + self.assertEqual(next(fi), 'line1\n') + self.assertEqual(src.linesread, ['line1\n']) + self.assertEqual(next(fi), 'line2') + self.assertEqual(src.linesread, ['line2']) + self.assertEqual(next(fi), 'line3\n') + self.assertEqual(src.linesread, ['', 'line3\n']) + self.assertRaises(StopIteration, next, fi) + self.assertEqual(src.linesread, ['']) + self.assertRaises(StopIteration, next, fi) + self.assertEqual(src.linesread, []) + fi.close() + class Test_hook_encoded(unittest.TestCase): """Unit tests for fileinput.hook_encoded()"""