# HG changeset patch # Parent 8120043810aff20b2daceb319f927127e90cca0b Use the max_length parameter to protect against gzip decompression bombs I am assuming that Decompressor.flush() returns no more than a small amount of data, since it has no ”max_length” parameter. Also removed unused and inconsistent code paths in _PaddedFile methods. diff -r 8120043810af Lib/gzip.py --- a/Lib/gzip.py Tue Jan 06 00:45:52 2015 -0600 +++ b/Lib/gzip.py Thu Jan 08 14:30:13 2015 +0000 @@ -89,33 +89,19 @@ return self._buffer[read:] + \ self.file.read(size-self._length+read) - def prepend(self, prepend=b'', readprevious=False): + def prepend(self, prepend=b''): if self._read is None: self._buffer = prepend - elif readprevious and len(prepend) <= self._read: + else: self._read -= len(prepend) return - else: - self._buffer = self._buffer[self._read:] + prepend self._length = len(self._buffer) self._read = 0 - def unused(self): - if self._read is None: - return b'' - return self._buffer[self._read:] - - def seek(self, offset, whence=0): - # This is only ever called with offset=whence=0 - if whence == 1 and self._read is not None: - if 0 <= offset + self._read <= self._length: - self._read += offset - return - else: - offset += self._length - self._read + def rewind(self): self._read = None self._buffer = None - return self.file.seek(offset, whence) + return self.file.seek(0) def __getattr__(self, name): return getattr(self.file, name) @@ -190,14 +176,14 @@ self.mode = READ # Set flag indicating start of a new member self._new_member = True - # Buffer data read from gzip file. extrastart is offset in + # Buffer unread decompressed data. extrastart is offset in # stream where buffer starts. extrasize is number of # bytes remaining in buffer from current stream position. self.extrabuf = b"" self.extrasize = 0 self.extrastart = 0 self.name = filename - # Starts small, scales exponentially + # Starts small, scales exponentially, for readline() only self.min_readsize = 100 fileobj = _PaddedFile(fileobj) @@ -213,7 +199,7 @@ raise ValueError("Invalid mode: {!r}".format(mode)) self.fileobj = fileobj - self.offset = 0 + self.offset = 0 # Current file offset for seek(), tell(), etc self.mtime = mtime if self.mode == WRITE: @@ -318,11 +304,6 @@ break if flag & FHCRC: self._read_exact(2) # Read & discard the 16-bit header CRC - - unused = self.fileobj.unused() - if unused: - uncompress = self.decompress.decompress(unused) - self._add_read_data(uncompress) return True def write(self,data): @@ -352,7 +333,7 @@ import errno raise OSError(errno.EBADF, "read() on write-only GzipFile object") - if self.extrasize <= 0 and self.fileobj is None: + if not size: return b'' readsize = 1024 @@ -362,7 +343,7 @@ size = self.extrasize else: # just get some more of it while size > self.extrasize: - if not self._read(readsize): + if not self._read(readsize, limit=size): if size > self.extrasize: size = self.extrasize break @@ -381,12 +362,12 @@ import errno raise OSError(errno.EBADF, "read1() on write-only GzipFile object") - if self.extrasize <= 0 and self.fileobj is None: + if not size: return b'' # For certain input data, a single call to _read() may not return # any data. In this case, retry until we get some data or reach EOF. - while self.extrasize <= 0 and self._read(): + while self.extrasize <= 0 and self._read(limit=size): pass if size < 0 or size > self.extrasize: size = self.extrasize @@ -411,7 +392,7 @@ return b'' # Ensure that we don't return b"" if we haven't reached EOF. # 1024 is the same buffering heuristic used in read() - while self.extrasize == 0 and self._read(max(n, 1024)): + while self.extrasize == 0 and self._read(max(n, 1024), limit=n): pass offset = self.offset - self.extrastart remaining = self.extrasize @@ -422,10 +403,7 @@ self.extrasize = len(buf) + self.extrasize self.offset -= len(buf) - def _read(self, size=1024): - if self.fileobj is None: - return False - + def _read(self, size=1024, limit=-1): if self._new_member: # If the _new_member flag is set, we have to # jump to the next member, if there is one. @@ -438,28 +416,36 @@ # Read a chunk of data from the file buf = self.fileobj.read(size) - # If the EOF has been reached, flush the decompression object + if limit < 0: + max_length = () + else: + max_length = (limit,) + uncompress = self.decompress.decompress(buf, *max_length) + self._add_read_data( uncompress ) + + # If the ends of both the compressed and decompressed streams + # have been reached, flush the decompression object # and mark this object as finished. - if buf == b"": + if buf == b"" and uncompress == b"": + # Assuming flush() only returns a limited amount of data uncompress = self.decompress.flush() - # Prepend the already read bytes to the fileobj to they can be + # Prepend the already read bytes to the fileobj so they can be # seen by _read_eof() - self.fileobj.prepend(self.decompress.unused_data, True) + self.fileobj.prepend(self.decompress.unused_data) self._read_eof() self._add_read_data( uncompress ) return False - uncompress = self.decompress.decompress(buf) - self._add_read_data( uncompress ) - - if self.decompress.unused_data != b"": + if self.decompress.unconsumed_tail != b"": + self.fileobj.prepend(self.decompress.unconsumed_tail) + elif self.decompress.unused_data != b"": # Ending case: we've come to the end of a member in the file, # so seek back to the start of the unused data, finish up # this member, and read a new gzip header. # Prepend the already read bytes to the fileobj to they can be # seen by _read_eof() and _read_gzip_header() - self.fileobj.prepend(self.decompress.unused_data, True) + self.fileobj.prepend(self.decompress.unused_data) # Check the CRC and file size, and set the flag so we read # a new member on the next call self._read_eof() @@ -493,7 +479,7 @@ while c == b"\x00": c = self.fileobj.read(1) if c: - self.fileobj.prepend(c, True) + self.fileobj.prepend(c) @property def closed(self): @@ -534,7 +520,7 @@ beginning of the file''' if self.mode != READ: raise OSError("Can't rewind in write mode") - self.fileobj.seek(0) + self.fileobj.rewind() self._new_member = True self.extrabuf = b"" self.extrasize = 0 diff -r 8120043810af Lib/test/test_gzip.py --- a/Lib/test/test_gzip.py Tue Jan 06 00:45:52 2015 -0600 +++ b/Lib/test/test_gzip.py Thu Jan 08 14:30:13 2015 +0000 @@ -428,6 +428,18 @@ with gzip.open(self.filename, "rb") as f: f.fileobj.prepend() + def test_decompress_limited(self): + """Read a small portion of a highly compressed stream""" + + bomb = gzip.compress(bytes(int(1.01e6)), compresslevel=9) + self.assertLess(len(bomb), 1024) # _read() will consume it in one go + + bomb = io.BytesIO(bomb) + decomp = gzip.GzipFile(fileobj=bomb) + self.assertEqual(bytes(1), decomp.read(1)) + buffered = len(decomp.extrabuf) + self.assertLess(buffered, 1e6, "Excessive amount of buffered data") + class TestOpen(BaseTest): def test_binary_modes(self): uncompressed = data1 * 50