"""Benchmark gzip seek time as a function of the read chunk size.

Running this file as a script opens each archive named in ``files``, seeks to
each offset in ``offsets`` with each chunk-size multiple, and prints timings.
"""
import errno
import functools
import gzip
import time

# Duration (ms) of the most recent 1x (1024-byte chunk) call; calls with other
# chunk sizes are reported as a difference against this value.
original_timing = 0


def timing(func):
    """Decorate ``func`` so every call prints how long it took.

    The wrapper looks only at the *keyword* arguments ``offset`` and
    ``chunk_size`` of the call (positional values are not seen).  A call
    whose chunk size is 1024 bytes (1x) stores its duration in the
    module-level ``original_timing`` and prints it as-is; any other call
    prints its duration minus ``original_timing``.

    Returns whatever ``func`` returns.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        global original_timing
        t1 = time.time()
        ret = func(*args, **kwargs)
        delta = (time.time() - t1) * 1000.  # seconds -> milliseconds
        offset = kwargs.get('offset')
        # Default to 1024 so a call that omits chunk_size is treated as the
        # 1x baseline instead of failing on ``None / 1024``.
        chunk_size = kwargs.get('chunk_size', 1024) // 1024
        if chunk_size == 1:
            original_timing = delta
        else:
            # Report the difference against the 1x baseline.
            delta -= original_timing
        print('%0.3fms %sx %s offset' % (delta, chunk_size, offset))
        return ret
    return wrapper


class GzipFile(gzip.GzipFile):
    """Version of GzipFile with larger read sizes and thus faster seeks.

    ``chunk_size`` controls how many bytes ``_chunked_seek`` reads or writes
    per step and the initial size ``read`` passes to ``_read``.

    NOTE(review): ``read`` and ``_chunked_seek`` use base-class internals
    (``offset``, ``extrasize``, ``extrabuf``, ``extrastart``, ``_read``,
    ``max_read_chunk``, ``_check_closed``) and ``xrange``; this presumably
    targets the Python 2.7 gzip module -- confirm before running elsewhere.
    """

    chunk_size = 1024

    def _chunked_seek(self, offset, whence=0):
        """Move to ``offset`` by reading/writing ``self.chunk_size`` bytes at a time.

        ``whence`` 0 means ``offset`` is absolute, 1 means relative to the
        current position; any other value raises ``ValueError``.  In write
        mode the gap is filled with NUL bytes and a backwards seek raises
        ``IOError``.  In read mode a backwards seek rewinds first and then
        reads forward.  Returns the resulting ``self.offset``.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == gzip.WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for i in xrange(count // self.chunk_size):
                self.write(self.chunk_size * '\0')
            self.write((count % self.chunk_size) * '\0')
        elif self.mode == gzip.READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in xrange(count // self.chunk_size):
                self.read(self.chunk_size)
            self.read(count % self.chunk_size)

        return self.offset

    def read(self, size=-1):
        """Return up to ``size`` bytes, or everything remaining if ``size`` < 0.

        Identical in shape to a plain buffered read except that the first
        ``_read`` request is ``self.chunk_size`` bytes; the request size then
        doubles on each iteration, capped at ``self.max_read_chunk``.

        Raises ``IOError(errno.EBADF, ...)`` when the file is not in read mode.
        """
        self._check_closed()
        if self.mode != gzip.READ:
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        # Nothing buffered and no underlying file left: report end of data.
        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        readsize = self.chunk_size
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                # Fewer bytes available than requested: hand back what exists.
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    @timing
    def seek(self, offset, sample_size=100, chunk_size=1024):
        """Benchmark entry point: seek to ``offset`` ``sample_size`` times.

        Sets ``self.chunk_size`` to ``chunk_size``, then repeatedly performs
        a chunked seek to ``offset`` followed by a seek back to 0.  The
        ``timing`` decorator prints the elapsed time of the whole call; pass
        ``offset`` and ``chunk_size`` by keyword so it can report them.

        Returns the result of the last seek to ``offset`` (``None`` when
        ``sample_size`` is 0).
        """
        self.chunk_size = chunk_size
        ret = None
        for x in range(sample_size):
            # Use the chunk-size-aware seek; the inherited seek would ignore
            # self.chunk_size for its step size.
            ret = self._chunked_seek(offset)
            self._chunked_seek(0)  # Rewind cursor
        return ret


files = ['10K.gz', '1M.gz', '5M.gz', '100M.gz', '1000M.gz']
offsets = [1000, 10000000]
chunk_size_multiples = [1, 4, 8, 16, 32, 64]


def main():
    """Run the seek benchmark over every file, offset and chunk-size multiple."""
    for filename in files:
        fp = GzipFile(filename)
        try:
            print('--- %s ---' % filename)
            for offset in offsets:
                print('')
                for multiple in chunk_size_multiples:
                    fp.seek(offset=offset, chunk_size=1024 * multiple)
        finally:
            # Close the archive even if a seek fails part-way through.
            fp.close()


if __name__ == '__main__':
    main()