From 9b307684333fcae533e01785fceb059d82f1b107 Mon Sep 17 00:00:00 2001 From: Florian Festi Date: Wed, 22 Sep 2010 11:03:53 +0200 Subject: [PATCH] Avoid the need of seek()ing on the file read using a new "PaddedFile" class that returns the prepended data before calling read() of the underlying file object --- gzip.py | 73 +++++++++++++++++++++++++++++++++++++++++++++++--------------- 1 files changed, 55 insertions(+), 18 deletions(-) diff --git a/gzip.py b/gzip.py index 83311cc..0bc2c19 100644 --- a/gzip.py +++ b/gzip.py @@ -45,6 +45,43 @@ def open(filename, mode="rb", compresslevel=9): """ return GzipFile(filename, mode, compresslevel) +class PaddedFile: + """readonly file object that prepends string the content of f""" + def __init__(self, f, prepend=b''): + self._buffer = prepend + self._length = len(prepend) + self.file = f + self._read = 0 + + def read(self, size): + if self._read is None: + return self.file.read(size) + if self._read + size <= self._length: + read = self._read + self._read += size + return self._buffer[read:self._read] + else: + read = self._read + self._read = None + return self._buffer[read:] + \ + self.file.read(size-self._length+read) + + def prepend(self, prepend=b'', readprevious=False): + if self._read is None: + self._buffer = prepend + elif readprevious and len(prepend) <= self._read: + self._read -= len(prepend) + return + else: + self._buffer = self._buffer[read:] + prepend + self._length = len(self._buffer) + self._read = 0 + + def unused(self): + if self._read is None: + return b'' + return self._buffer[self._read:] + class GzipFile(io.BufferedIOBase): """The GzipFile class simulates most of the methods of a file object with the exception of the readinto() and truncate() methods. @@ -188,6 +225,9 @@ class GzipFile(io.BufferedIOBase): def _read_gzip_header(self): magic = self.fileobj.read(2) + if magic == b'': + raise EOFError("Reached EOF") + if magic != b'\037\213': raise IOError('Not a gzipped file') method = ord( self.fileobj.read(1) ) @@ -219,6 +259,13 @@ class GzipFile(io.BufferedIOBase): if flag & FHCRC: self.fileobj.read(2) # Read & discard the 16-bit header CRC + if isinstance(self.fileobj, PaddedFile): + unused = self.fileobj.unused() + if unused: + uncompress = self.decompress.decompress(unused) + self._add_read_data(uncompress) + self.fileobj = self.fileobj.file + def write(self,data): if self.mode != WRITE: import errno @@ -282,16 +329,6 @@ class GzipFile(io.BufferedIOBase): if self._new_member: # If the _new_member flag is set, we have to # jump to the next member, if there is one. - # - # First, check if we're at the end of the file; - # if so, it's time to stop; no more members to read. - pos = self.fileobj.tell() # Save current position - self.fileobj.seek(0, 2) # Seek to end of file - if pos == self.fileobj.tell(): - raise EOFError("Reached EOF") - else: - self.fileobj.seek( pos ) # Return to original position - self._init_read() self._read_gzip_header() self.decompress = zlib.decompressobj(-zlib.MAX_WBITS) @@ -305,6 +342,9 @@ class GzipFile(io.BufferedIOBase): if buf == b"": uncompress = self.decompress.flush() + # Prepend the already read bytes to the fileobj to they can be + # seen by _read_eof() + self.fileobj = PaddedFile(self.fileobj, self.decompress.unused_data) self._read_eof() self._add_read_data( uncompress ) raise EOFError('Reached EOF') @@ -316,10 +356,9 @@ class GzipFile(io.BufferedIOBase): # Ending case: we've come to the end of a member in the file, # so seek back to the start of the unused data, finish up # this member, and read a new gzip header. - # (The number of bytes to seek back is the length of the unused - # data, minus 8 because _read_eof() will rewind a further 8 bytes) - self.fileobj.seek( -len(self.decompress.unused_data)+8, 1) - + # Prepend the already read bytes to the fileobj to they can be + # seen by _read_eof() and _read_gzip_header() + self.fileobj = PaddedFile(self.fileobj, self.decompress.unused_data) # Check the CRC and file size, and set the flag so we read # a new member on the next call self._read_eof() @@ -334,12 +373,10 @@ class GzipFile(io.BufferedIOBase): self.size = self.size + len(data) def _read_eof(self): - # We've read to the end of the file, so we have to rewind in order - # to reread the 8 bytes containing the CRC and the file size. + # We've read to the end of the file # We check the that the computed CRC and size of the # uncompressed data matches the stored values. Note that the size # stored is the true file size mod 2**32. - self.fileobj.seek(-8, 1) crc32 = read32(self.fileobj) isize = read32(self.fileobj) # may exceed 2GB if crc32 != self.crc: @@ -355,7 +392,7 @@ class GzipFile(io.BufferedIOBase): while c == b"\x00": c = self.fileobj.read(1) if c: - self.fileobj.seek(-1, 1) + self.fileobj.prepend(c, True) @property def closed(self): -- 1.7.3