From dc3aae1603fce49147b1c53be3e031877d1a4c7b Mon Sep 17 00:00:00 2001 From: Florian Festi Date: Wed, 22 Sep 2010 11:03:53 +0200 Subject: [PATCH] Avoid the need of seek()ing on the file read Wrapping .fileobj in a new "PaddedFile" class that returns the prepended data before calling read() of the underlying file object --- gzip.py | 86 +++++++++++++++++++++++++++++++++++++++++++++++++------------- 1 files changed, 68 insertions(+), 18 deletions(-) diff --git a/gzip.py b/gzip.py index 83311cc..5690b12 100644 --- a/gzip.py +++ b/gzip.py @@ -45,6 +45,57 @@ def open(filename, mode="rb", compresslevel=9): """ return GzipFile(filename, mode, compresslevel) +class PaddedFile: + """readonly file object that prepends string the content of f""" + def __init__(self, f, prepend=b''): + self._buffer = prepend + self._length = len(prepend) + self.file = f + self._read = 0 + + def read(self, size): + if self._read is None: + return self.file.read(size) + if self._read + size <= self._length: + read = self._read + self._read += size + return self._buffer[read:self._read] + else: + read = self._read + self._read = None + return self._buffer[read:] + \ + self.file.read(size-self._length+read) + + def prepend(self, prepend=b'', readprevious=False): + if self._read is None: + self._buffer = prepend + elif readprevious and len(prepend) <= self._read: + self._read -= len(prepend) + return + else: + self._buffer = self._buffer[read:] + prepend + self._length = len(self._buffer) + self._read = 0 + + def unused(self): + if self._read is None: + return b'' + return self._buffer[self._read:] + + def seek(self, offset, whence): + if whence == 1 and self._read is not None: + if 0 <= offset + self._read <= self._length: + self._read += offset + return + else: + offset += self._length - self._read + self._read = None + self._buffer = None + return self.file.seek(offset, whence) + + def __getattr__(self, name): + return getattr(name, self.file) + class GzipFile(io.BufferedIOBase): """The GzipFile class simulates most of the methods of a file object with the exception of the readinto() and truncate() methods. @@ -119,6 +170,7 @@ class GzipFile(io.BufferedIOBase): self.name = filename # Starts small, scales exponentially self.min_readsize = 100 + fileobj = PaddedFile(fileobj) elif mode[0:1] == 'w' or mode[0:1] == 'a': self.mode = WRITE @@ -188,6 +240,9 @@ class GzipFile(io.BufferedIOBase): def _read_gzip_header(self): magic = self.fileobj.read(2) + if magic == b'': + raise EOFError("Reached EOF") + if magic != b'\037\213': raise IOError('Not a gzipped file') method = ord( self.fileobj.read(1) ) @@ -219,6 +274,11 @@ class GzipFile(io.BufferedIOBase): if flag & FHCRC: self.fileobj.read(2) # Read & discard the 16-bit header CRC + unused = self.fileobj.unused() + if unused: + uncompress = self.decompress.decompress(unused) + self._add_read_data(uncompress) + def write(self,data): if self.mode != WRITE: import errno @@ -282,16 +342,6 @@ class GzipFile(io.BufferedIOBase): if self._new_member: # If the _new_member flag is set, we have to # jump to the next member, if there is one. - # - # First, check if we're at the end of the file; - # if so, it's time to stop; no more members to read. - pos = self.fileobj.tell() # Save current position - self.fileobj.seek(0, 2) # Seek to end of file - if pos == self.fileobj.tell(): - raise EOFError("Reached EOF") - else: - self.fileobj.seek( pos ) # Return to original position - self._init_read() self._read_gzip_header() self.decompress = zlib.decompressobj(-zlib.MAX_WBITS) @@ -305,6 +355,9 @@ class GzipFile(io.BufferedIOBase): if buf == b"": uncompress = self.decompress.flush() + # Prepend the already read bytes to the fileobj to they can be + # seen by _read_eof() + self.fileobj.prepend(self.decompress.unused_data, True) self._read_eof() self._add_read_data( uncompress ) raise EOFError('Reached EOF') @@ -316,10 +369,9 @@ class GzipFile(io.BufferedIOBase): # Ending case: we've come to the end of a member in the file, # so seek back to the start of the unused data, finish up # this member, and read a new gzip header. - # (The number of bytes to seek back is the length of the unused - # data, minus 8 because _read_eof() will rewind a further 8 bytes) - self.fileobj.seek( -len(self.decompress.unused_data)+8, 1) - + # Prepend the already read bytes to the fileobj to they can be + # seen by _read_eof() and _read_gzip_header() + self.fileobj.prepend(self.decompress.unused_data, True) # Check the CRC and file size, and set the flag so we read # a new member on the next call self._read_eof() @@ -334,12 +386,10 @@ class GzipFile(io.BufferedIOBase): self.size = self.size + len(data) def _read_eof(self): - # We've read to the end of the file, so we have to rewind in order - # to reread the 8 bytes containing the CRC and the file size. + # We've read to the end of the file # We check the that the computed CRC and size of the # uncompressed data matches the stored values. Note that the size # stored is the true file size mod 2**32. - self.fileobj.seek(-8, 1) crc32 = read32(self.fileobj) isize = read32(self.fileobj) # may exceed 2GB if crc32 != self.crc: @@ -355,7 +405,7 @@ class GzipFile(io.BufferedIOBase): while c == b"\x00": c = self.fileobj.read(1) if c: - self.fileobj.seek(-1, 1) + self.fileobj.prepend(c, True) @property def closed(self): -- 1.7.3