--- gzip.py.orig 2007-03-07 18:54:30.000000000 +0100 +++ gzip.py 2007-03-15 18:38:42.000000000 +0100 @@ -15,15 +15,6 @@ READ, WRITE = 1, 2 -def U32(i): - """Return i as an unsigned integer, assuming it fits in 32 bits. - - If it's >= 2GB when viewed as a 32-bit unsigned int, return a long. - """ - if i < 0: - i += 1L << 32 - return i - def LOWU32(i): """Return the low-order 32 bits of an int, as a non-negative int.""" return i & 0xFFFFFFFFL @@ -36,9 +27,6 @@ # or unsigned. output.write(struct.pack(" 8: + f = PaddedFile(unused[8:], self.fileobj) + else: + f = self.fileobj + try: + self._read_gzip_header(f) + except IOError: return '' + if len(unused) > 8: + return f.unused() + else: + return '' + - readsize = 1024 - if size < 0: # get the whole thing - try: - while True: - self._read(readsize) - readsize = min(self.max_read_chunk, readsize * 2) - except EOFError: - size = self.extrasize - else: # just get some more of it - try: - while size > self.extrasize: - self._read(readsize) - readsize = min(self.max_read_chunk, readsize * 2) - except EOFError: - if size > self.extrasize: - size = self.extrasize - - chunk = self.extrabuf[:size] - self.extrabuf = self.extrabuf[size:] - self.extrasize = self.extrasize - size - - self.offset += size - return chunk - - def _unread(self, buf): - self.extrabuf = buf + self.extrabuf - self.extrasize = len(buf) + self.extrasize - self.offset -= len(buf) - - def _read(self, size=1024): - if self.fileobj is None: - raise EOFError, "Reached EOF" + def _read(self, readsize): + data = self.fileobj.read(readsize) - if self._new_member: - # If the _new_member flag is set, we have to - # jump to the next member, if there is one. - # - # First, check if we're at the end of the file; - # if so, it's time to stop; no more members to read. - pos = self.fileobj.tell() # Save current position - self.fileobj.seek(0, 2) # Seek to end of file - if pos == self.fileobj.tell(): - raise EOFError, "Reached EOF" + while True: + if data == "": + decompdata = self.decompobj.flush() else: - self.fileobj.seek( pos ) # Return to original position + decompdata = self.decompobj.decompress(data) + decomplen = len(decompdata) + self.buffer.append(decompdata) + self.bufferlen += decomplen + self.size += decomplen + self.crcval = zlib.crc32(decompdata, self.crcval) + if self.decompobj.unused_data: + data = self._read_eof() + self.decompobj = zlib.decompressobj(-zlib.MAX_WBITS) + self.crcval = zlib.crc32("") + self.size = 0 + if data: + continue + break + return data=='' - self._init_read() - self._read_gzip_header() - self.decompress = zlib.decompressobj(-zlib.MAX_WBITS) - self._new_member = False + def read(self, size=-1): + """Decompress up to bytes bytes from input. - # Read a chunk of data from the file - buf = self.fileobj.read(size) + Raise IOError.""" - # If the EOF has been reached, flush the decompression object - # and mark this object as finished. + if self.mode != READ: + import errno + raise IOError(errno.EBADF, "read() on write-only GzipFile object") - if buf == "": - uncompress = self.decompress.flush() - self._read_eof() - self._add_read_data( uncompress ) - raise EOFError, 'Reached EOF' - - uncompress = self.decompress.decompress(buf) - self._add_read_data( uncompress ) - - if self.decompress.unused_data != "": - # Ending case: we've come to the end of a member in the file, - # so seek back to the start of the unused data, finish up - # this member, and read a new gzip header. - # (The number of bytes to seek back is the length of the unused - # data, minus 8 because _read_eof() will rewind a further 8 bytes) - self.fileobj.seek( -len(self.decompress.unused_data)+8, 1) - - # Check the CRC and file size, and set the flag so we read - # a new member on the next call - self._read_eof() - self._new_member = True + if self._new_member: + self._read_gzip_header(self.fileobj) + self._new_member = False - def _add_read_data(self, data): - self.crc = zlib.crc32(data, self.crc) - self.extrabuf = self.extrabuf + data - self.extrasize = self.extrasize + len(data) - self.size = self.size + len(data) + while size < 0 or self.bufferlen < size: + if size < 0: + readsize = 65536 - self.bufferlen + else: + readsize = size - self.bufferlen + + if readsize > 65536: + readsize = 32768 + elif readsize > 32768: + readsize = 16384 + elif readsize > 16384: + readsize = 8192 + elif readsize > 8192: + readsize = 4096 + elif readsize > 4096: + readsize = 2048 + else: + readsize = 1024 - def _read_eof(self): - # We've read to the end of the file, so we have to rewind in order - # to reread the 8 bytes containing the CRC and the file size. - # We check the that the computed CRC and size of the - # uncompressed data matches the stored values. Note that the size - # stored is the true file size mod 2**32. - self.fileobj.seek(-8, 1) - crc32 = read32(self.fileobj) - isize = U32(read32(self.fileobj)) # may exceed 2GB - if U32(crc32) != U32(self.crc): - raise IOError, "CRC check failed" - elif isize != LOWU32(self.size): - raise IOError, "Incorrect length of data produced" + eof = self._read(readsize) + if eof: + break + if size < 0: + size = self.bufferlen + retdata = "" + while size > 0 and self.buffer: + decompdata = self.buffer[0] + decomplen = len(decompdata) + if size+self.pos <= decomplen: + tmpdata = decompdata[self.pos:size+self.pos] + retdata += tmpdata + self.bufferlen -= size + self.pos += size + break + decomplen -= self.pos + size -= decomplen + self.bufferlen -= decomplen + if self.pos != 0: + retdata += decompdata[self.pos:] + else: + retdata += decompdata + self.pos = 0 + self.buffer.pop(0) + self.offset += len(retdata) + return retdata def close(self): if self.mode == WRITE: @@ -375,9 +395,13 @@ raise IOError("Can't rewind in write mode") self.fileobj.seek(0) self._new_member = True - self.extrabuf = "" - self.extrasize = 0 + self.decompobj = zlib.decompressobj(-zlib.MAX_WBITS) + self.crcval = zlib.crc32("") + self.buffer = [] # List of data blocks + self.bufferlen = 0 + self.pos = 0 # Offset of next data to read from buffer[0] self.offset = 0 + self.size = 0 def seek(self, offset, whence=0): if whence: @@ -402,35 +426,30 @@ self.read(count % 1024) def readline(self, size=-1): - if size < 0: - size = sys.maxint - readsize = self.min_readsize - else: - readsize = size - bufs = [] - while size != 0: - c = self.read(readsize) - i = c.find('\n') - - # We set i=size to break out of the loop under two - # conditions: 1) there's no newline, and the chunk is - # larger than size, or 2) there is a newline, but the - # resulting line would be longer than 'size'. - if (size <= i) or (i == -1 and len(c) > size): - i = size - 1 - - if i >= 0 or c == '': - bufs.append(c[:i + 1]) # Add portion of last chunk - self._unread(c[i + 1:]) # Push back rest of chunk - break + if self._new_member: + self._read_gzip_header(self.fileobj) + self._new_member = False - # Append chunk to list, decrease 'size', - bufs.append(c) - size = size - len(c) - readsize = min(size, readsize * 2) - if readsize > self.min_readsize: - self.min_readsize = min(readsize, self.min_readsize * 2, 512) - return ''.join(bufs) # Return resulting line + scansize = 0 + buffpos = 0 + while True: + for idx in range(buffpos, len(self.buffer)): + if idx == 0: + scansize -= self.pos + pos = self.buffer[idx].find('\n', self.pos) + else: + pos = self.buffer[idx].find('\n') + if pos != -1: + if size>=0 and scansize+pos+1>size: + return self.read(size) + return self.read(scansize+pos+1) + scansize += len(self.buffer[idx]) + if size>=0 and scansize>size: + return self.read(size) + buffpos = len(self.buffer) + eof = self._read(1024) + if eof: + return self.read(scansize) def readlines(self, sizehint=0): # Negative numbers result in reading all the lines