Index: Lib/zipfile.py =================================================================== --- Lib/zipfile.py (revision 76872) +++ Lib/zipfile.py (working copy) @@ -3,6 +3,8 @@ """ import struct, os, time, sys, shutil import binascii, cStringIO, stat +import io +import re try: import zlib # We may need its compression method @@ -451,7 +453,7 @@ self._UpdateKeys(c) return c -class ZipExtFile: +class ZipExtFile(io.BufferedIOBase): """File-like object for reading an archive member. Is returned by ZipFile.open(). """ @@ -462,117 +464,88 @@ self.bytes_read = 0L self.rawbuffer = '' self.readbuffer = '' - self.linebuffer = '' + self.offset = 0 self.eof = False + self.univ_newlines = False - self.nlSeps = ("\n", ) - self.lastdiscard = '' + self.newlines = None self.compress_type = zipinfo.compress_type self.compress_size = zipinfo.compress_size - self.closed = False self.mode = "r" self.name = zipinfo.filename - # read from compressed files in 64k blocks - self.compreadsize = 64*1024 + # read from compressed files in 4k blocks + self.compreadsize = 4*1024 if self.compress_type == ZIP_DEFLATED: self.dc = zlib.decompressobj(-15) def set_univ_newlines(self, univ_newlines): self.univ_newlines = univ_newlines - # pick line separator char(s) based on universal newlines flag - self.nlSeps = ("\n", ) - if self.univ_newlines: - self.nlSeps = ("\r\n", "\r", "\n") + # Search for universal newlines or line chunks. + _pattern = re.compile(r'^(?P<chunk>[^\r\n]+)|(?P<newline>\n|\r\n?)') - def __iter__(self): - return self + def readline(self, limit=-1): + """Read and return a line from the stream. - def next(self): - nextline = self.readline() - if not nextline: - raise StopIteration() + If limit is specified, at most limit bytes will be read. + """ - return nextline + if not self.univ_newlines and limit < 0: + # Shortcut common case - newline found in buffer. 
+ i = self.readbuffer.find('\n', self.offset) + 1 + if i > 0: + line = self.readbuffer[self.offset: i] + self.offset = i + return line - def close(self): - self.closed = True + if not self.univ_newlines: + return io.BufferedIOBase.readline(self, limit) - def _checkfornewline(self): - nl, nllen = -1, -1 - if self.linebuffer: - # ugly check for cases where half of an \r\n pair was - # read on the last pass, and the \r was discarded. In this - # case we just throw away the \n at the start of the buffer. - if (self.lastdiscard, self.linebuffer[0]) == ('\r','\n'): - self.linebuffer = self.linebuffer[1:] + line = '' + while limit < 0 or len(line) < limit: + readahead = self.peek(2) + if readahead == '': + return line - for sep in self.nlSeps: - nl = self.linebuffer.find(sep) - if nl >= 0: - nllen = len(sep) - return nl, nllen + # Search for universal newlines or line chunks. + match = self._pattern.search(readahead) + newline = match.group('newline') + if newline is not None: + if self.newlines is None: + self.newlines = [] + if newline not in self.newlines: + self.newlines.append(newline) + self.offset += match.span()[1] + return line + '\n' - return nl, nllen + chunk = match.group('chunk') + if limit >= 0 and len(chunk) > limit - len(line): + chunk = chunk[: limit - len(line)] - def readline(self, size = -1): - """Read a line with approx. size. If size is negative, - read a whole line. 
- """ - if size < 0: - size = sys.maxint - elif size == 0: - return '' + self.offset += len(chunk) + line += chunk - # check for a newline already in buffer - nl, nllen = self._checkfornewline() + return line - if nl >= 0: - # the next line was already in the buffer - nl = min(nl, size) - else: - # no line break in buffer - try to read more - size -= len(self.linebuffer) - while nl < 0 and size > 0: - buf = self.read(min(size, 100)) - if not buf: - break - self.linebuffer += buf - size -= len(buf) + def peek(self, n): + """Returns buffered bytes without advancing the position.""" + if n > len(self.readbuffer) - self.offset: + chunk = self.read(n) + self.offset -= len(chunk) + + # Return up to 512 bytes to reduce allocation overhead for tight loops. + return self.readbuffer[self.offset: self.offset + 512] - # check for a newline in buffer - nl, nllen = self._checkfornewline() + def read1(self, n): + """Read up to n bytes with at most one read() system call.""" + return self.read(n) - # we either ran out of bytes in the file, or - # met the specified size limit without finding a newline, - # so return current buffer - if nl < 0: - s = self.linebuffer - self.linebuffer = '' - return s + def readable(self): + return True - buf = self.linebuffer[:nl] - self.lastdiscard = self.linebuffer[nl:nl + nllen] - self.linebuffer = self.linebuffer[nl + nllen:] - - # line is always returned with \n as newline char (except possibly - # for a final incomplete line in the file, which is handled above). - return buf + "\n" - - def readlines(self, sizehint = -1): - """Return a list with all (following) lines. The sizehint parameter - is ignored in this implementation. 
- """ - result = [] - while True: - line = self.readline() - if not line: break - result.append(line) - return result - def read(self, size = None): # act like file() obj and return empty string if size is 0 if size == 0: @@ -588,10 +561,10 @@ if size is not None and size >= 0: if self.compress_type == ZIP_STORED: - lr = len(self.readbuffer) + lr = len(self.readbuffer) - self.offset bytesToRead = min(bytesToRead, size - lr) elif self.compress_type == ZIP_DEFLATED: - if len(self.readbuffer) > size: + if len(self.readbuffer) - self.offset > size: # the user has requested fewer bytes than we've already # pulled through the decompressor; don't read any more bytesToRead = 0 @@ -631,16 +604,17 @@ # prevent decompressor from being used again self.dc = None - self.readbuffer += newdata + self.readbuffer = self.readbuffer[self.offset:] + newdata + self.offset = 0 # return what the user asked for - if size is None or len(self.readbuffer) <= size: - bytes = self.readbuffer - self.readbuffer = '' + if size is None or len(self.readbuffer) - self.offset <= size: + bytes = self.readbuffer[self.offset:] + self.offset = len(self.readbuffer) else: - bytes = self.readbuffer[:size] - self.readbuffer = self.readbuffer[size:] + bytes = self.readbuffer[self.offset: self.offset + size] + self.offset += size return bytes Index: Lib/test/test_zipfile.py =================================================================== --- Lib/test/test_zipfile.py (revision 76872) +++ Lib/test/test_zipfile.py (working copy) @@ -177,6 +177,28 @@ for f in (TESTFN2, TemporaryFile(), StringIO()): self.zip_random_open_test(f, zipfile.ZIP_STORED) + def zip_readline_read_test(self, f, compression): + self.make_test_archive(f, compression) + + # Read the ZIP archive + zipfp = zipfile.ZipFile(f, "r") + zipopen = zipfp.open(TESTFN) + + data = '' + while True: + read = zipopen.readline() + if not read: + break + data += read + + read = zipopen.read(100) + if not read: + break + data += read + + self.assertEqual(data, 
self.data) + zipfp.close() + def zip_readline_test(self, f, compression): self.make_test_archive(f, compression) @@ -210,6 +232,13 @@ zipfp.close() + def test_readline_read_stored(self): + """Test readlines interleaved with reads. + Bug 7610 - http://bugs.python.org/issue7610 + """ + for f in (TESTFN2, TemporaryFile(), StringIO()): + self.zip_readline_read_test(f, zipfile.ZIP_STORED) + def test_readline_stored(self): for f in (TESTFN2, TemporaryFile(), StringIO()): self.zip_readline_test(f, zipfile.ZIP_STORED) @@ -238,6 +267,14 @@ self.zip_random_open_test(f, zipfile.ZIP_DEFLATED) @skipUnless(zlib, "requires zlib") + def test_readline_read_deflated(self): + """Test readlines interleaved with reads. + Bug 7610 - http://bugs.python.org/issue7610 + """ + for f in (TESTFN2, TemporaryFile(), StringIO()): + self.zip_readline_read_test(f, zipfile.ZIP_DEFLATED) + + @skipUnless(zlib, "requires zlib") def test_readline_deflated(self): for f in (TESTFN2, TemporaryFile(), StringIO()): self.zip_readline_test(f, zipfile.ZIP_DEFLATED) @@ -1094,6 +1131,29 @@ zipfp.close() + def readline_read_test(self, f, compression): + self.make_test_archive(f, compression) + + # Read the ZIP archive + zipfp = zipfile.ZipFile(f, "r") + for sep, fn in self.arcfiles.items(): + zipopen = zipfp.open(fn, "rU") + data = '' + while True: + read = zipopen.readline() + if not read: + break + data += read + + read = zipopen.read(5) + if not read: + break + data += read + + self.assertEqual(data, self.arcdata['\n']) + + zipfp.close() + def readline_test(self, f, compression): self.make_test_archive(f, compression) @@ -1134,6 +1194,13 @@ for f in (TESTFN2, TemporaryFile(), StringIO()): self.read_test(f, zipfile.ZIP_STORED) + def test_readline_read_stored(self): + """Test readlines interleaved with reads. 
+ Bug 7610 - http://bugs.python.org/issue7610 + """ + for f in (TESTFN2, TemporaryFile(), StringIO()): + self.readline_read_test(f, zipfile.ZIP_STORED) + def test_readline_stored(self): for f in (TESTFN2, TemporaryFile(), StringIO()): self.readline_test(f, zipfile.ZIP_STORED) @@ -1152,6 +1219,14 @@ self.read_test(f, zipfile.ZIP_DEFLATED) @skipUnless(zlib, "requires zlib") + def test_readline_read_deflated(self): + """Test readlines interleaved with reads. + Bug 7610 - http://bugs.python.org/issue7610 + """ + for f in (TESTFN2, TemporaryFile(), StringIO()): + self.readline_read_test(f, zipfile.ZIP_DEFLATED) + + @skipUnless(zlib, "requires zlib") def test_readline_deflated(self): for f in (TESTFN2, TemporaryFile(), StringIO()): self.readline_test(f, zipfile.ZIP_DEFLATED)