diff -r bc2a22eaa0af Doc/library/bz2.rst --- a/Doc/library/bz2.rst Mon Mar 30 03:21:06 2015 +0200 +++ b/Doc/library/bz2.rst Mon Mar 30 02:58:11 2015 +0000 @@ -120,6 +120,10 @@ All of the classes in this module may sa .. versionchanged:: 3.4 The ``'x'`` (exclusive creation) mode was added. + .. versionchanged:: 3.5 + The :meth:`~io.BufferedIOBase.read` method now accepts an argument of + ``None``. + Incremental (de)compression --------------------------- diff -r bc2a22eaa0af Doc/library/gzip.rst --- a/Doc/library/gzip.rst Mon Mar 30 03:21:06 2015 +0200 +++ b/Doc/library/gzip.rst Mon Mar 30 02:58:11 2015 +0000 @@ -90,13 +90,9 @@ The module defines the following items: is no compression. The default is ``9``. The *mtime* argument is an optional numeric timestamp to be written to - the stream when compressing. All :program:`gzip` compressed streams are - required to contain a timestamp. If omitted or ``None``, the current - time is used. This module ignores the timestamp when decompressing; - however, some programs, such as :program:`gunzip`\ , make use of it. - The format of the timestamp is the same as that of the return value of - ``time.time()`` and of the ``st_mtime`` attribute of the object returned - by ``os.stat()``. + the last modification time field in the stream when compressing. It + should only be provided in compression mode. If omitted or ``None``, the + current time is used. See the :attr:`mtime` attribute for more details. Calling a :class:`GzipFile` object's :meth:`close` method does not close *fileobj*, since you might wish to append more material after the compressed @@ -108,9 +104,9 @@ The module defines the following items: including iteration and the :keyword:`with` statement. Only the :meth:`truncate` method isn't implemented. - :class:`GzipFile` also provides the following method: + :class:`GzipFile` also provides the following method and attribute: - .. method:: peek([n]) + .. method:: peek(n) Read *n* uncompressed bytes without advancing the file position. At most one single read on the compressed stream is done to satisfy @@ -124,9 +120,21 @@ The module defines the following items: .. versionadded:: 3.2 + .. attribute:: mtime + + When decompressing, the value of the last modification time field in + the most recently read header may be read from this attribute, as an + integer. The initial value before reading any headers is ``None``. + + All :program:`gzip` compressed streams are required to contain this + timestamp field. Some programs, such as :program:`gunzip`\ , make use + of the timestamp. The format is the same as the return value of + :func:`time.time` and the :attr:`~os.stat_result.st_mtime` attribute of + the object returned by :func:`os.stat`. + .. versionchanged:: 3.1 Support for the :keyword:`with` statement was added, along with the - *mtime* argument. + *mtime* constructor argument and :attr:`mtime` attribute. .. versionchanged:: 3.2 Support for zero-padded and unseekable files was added. @@ -140,6 +148,8 @@ The module defines the following items: .. versionchanged:: 3.5 Added support for writing arbitrary :term:`bytes-like objects `. + The :meth:`~io.BufferedIOBase.read` method now accepts an argument of + ``None``. .. function:: compress(data, compresslevel=9) diff -r bc2a22eaa0af Doc/library/lzma.rst --- a/Doc/library/lzma.rst Mon Mar 30 03:21:06 2015 +0200 +++ b/Doc/library/lzma.rst Mon Mar 30 02:58:11 2015 +0000 @@ -110,6 +110,10 @@ Reading and writing compressed files .. versionchanged:: 3.4 Added support for the ``"x"`` and ``"xb"`` modes. + .. 
versionchanged:: 3.5 + The :meth:`~io.BufferedIOBase.read` method now accepts an argument of + ``None``. + Compressing and decompressing data in memory -------------------------------------------- diff -r bc2a22eaa0af Lib/_compression.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Lib/_compression.py Mon Mar 30 02:58:11 2015 +0000 @@ -0,0 +1,152 @@ +"""Internal classes used by the gzip, lzma and bz2 modules""" + +import io + + +BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE # Compressed data read chunk size + + +class BaseStream(io.BufferedIOBase): + """Mode-checking helper functions.""" + + def _check_not_closed(self): + if self.closed: + raise ValueError("I/O operation on closed file") + + def _check_can_read(self): + if not self.readable(): + raise io.UnsupportedOperation("File not open for reading") + + def _check_can_write(self): + if not self.writable(): + raise io.UnsupportedOperation("File not open for writing") + + def _check_can_seek(self): + if not self.readable(): + raise io.UnsupportedOperation("Seeking is only supported " + "on files open for reading") + if not self.seekable(): + raise io.UnsupportedOperation("The underlying file object " + "does not support seeking") + + +class DecompressReader(io.RawIOBase): + """Adapts the decompressor API to a RawIOBase reader API""" + + def readable(self): + return True + + def __init__(self, fp, decomp_factory, trailing_error=(), **decomp_args): + self._fp = fp + self._eof = False + self._pos = 0 # Current offset in decompressed stream + + # Set to size of decompressed stream once it is known, for SEEK_END + self._size = -1 + + # Save the decompressor factory and arguments. + # If the file contains multiple compressed streams, each + # stream will need a separate decompressor object. A new decompressor + # object is also needed when implementing a backwards seek(). + self._decomp_factory = decomp_factory + self._decomp_args = decomp_args + self._decompressor = self._decomp_factory(**self._decomp_args) + + # Exception class to catch from decompressor signifying invalid + # trailing data to ignore + self._trailing_error = trailing_error + + def close(self): + self._decompressor = None + return super().close() + + def seekable(self): + return self._fp.seekable() + + def readinto(self, b): + with memoryview(b) as view, view.cast("B") as byte_view: + data = self.read(len(byte_view)) + byte_view[:len(data)] = data + return len(data) + + def read(self, size=-1): + if size < 0: + return self.readall() + + if not size or self._eof: + return b"" + data = None # Default if EOF is encountered + # Depending on the input data, our call to the decompressor may not + # return any data. In this case, try again after reading another block. + while True: + if self._decompressor.eof: + rawblock = (self._decompressor.unused_data or + self._fp.read(BUFFER_SIZE)) + if not rawblock: + break + # Continue to next stream. + self._decompressor = self._decomp_factory( + **self._decomp_args) + try: + data = self._decompressor.decompress(rawblock, size) + except self._trailing_error: + # Trailing data isn't a valid compressed stream; ignore it. 
+ break + else: + if self._decompressor.needs_input: + rawblock = self._fp.read(BUFFER_SIZE) + if not rawblock: + raise EOFError("Compressed file ended before the " + "end-of-stream marker was reached") + else: + rawblock = b"" + data = self._decompressor.decompress(rawblock, size) + if data: + break + if not data: + self._eof = True + self._size = self._pos + return b"" + self._pos += len(data) + return data + + # Rewind the file to the beginning of the data stream. + def _rewind(self): + self._fp.seek(0) + self._eof = False + self._pos = 0 + self._decompressor = self._decomp_factory(**self._decomp_args) + + def seek(self, offset, whence=io.SEEK_SET): + # Recalculate offset as an absolute file position. + if whence == io.SEEK_SET: + pass + elif whence == io.SEEK_CUR: + offset = self._pos + offset + elif whence == io.SEEK_END: + # Seeking relative to EOF - we need to know the file's size. + if self._size < 0: + while self.read(io.DEFAULT_BUFFER_SIZE): + pass + offset = self._size + offset + else: + raise ValueError("Invalid value for whence: {}".format(whence)) + + # Make it so that offset is the number of bytes to skip forward. + if offset < self._pos: + self._rewind() + else: + offset -= self._pos + + # Read and discard data until we reach the desired position. + while offset > 0: + data = self.read(min(io.DEFAULT_BUFFER_SIZE, offset)) + if not data: + break + offset -= len(data) + + return self._pos + + def tell(self): + """Return the current file position.""" + return self._pos diff -r bc2a22eaa0af Lib/bz2.py --- a/Lib/bz2.py Mon Mar 30 03:21:06 2015 +0200 +++ b/Lib/bz2.py Mon Mar 30 02:58:11 2015 +0000 @@ -12,6 +12,7 @@ This module provides a file interface, c from builtins import open as _builtin_open import io import warnings +import _compression try: from threading import RLock @@ -23,13 +24,11 @@ from _bz2 import BZ2Compressor, BZ2Decom _MODE_CLOSED = 0 _MODE_READ = 1 -_MODE_READ_EOF = 2 +# Value 2 no longer used _MODE_WRITE = 3 -_BUFFER_SIZE = 8192 - -class BZ2File(io.BufferedIOBase): +class BZ2File(_compression.BaseStream): """A file object providing transparent bzip2 (de)compression. @@ -61,13 +60,11 @@ class BZ2File(io.BufferedIOBase): multiple compressed streams. """ # This lock must be recursive, so that BufferedIOBase's - # readline(), readlines() and writelines() don't deadlock. + # writelines() does not deadlock. self._lock = RLock() self._fp = None self._closefp = False self._mode = _MODE_CLOSED - self._pos = 0 - self._size = -1 if buffering is not None: warnings.warn("Use of 'buffering' argument is deprecated", @@ -79,9 +76,6 @@ class BZ2File(io.BufferedIOBase): if mode in ("", "r", "rb"): mode = "rb" mode_code = _MODE_READ - self._decompressor = BZ2Decompressor() - self._buffer = b"" - self._buffer_offset = 0 elif mode in ("w", "wb"): mode = "wb" mode_code = _MODE_WRITE @@ -107,6 +101,13 @@ class BZ2File(io.BufferedIOBase): else: raise TypeError("filename must be a str or bytes object, or a file") + if self._mode == _MODE_READ: + raw = _compression.DecompressReader(self._fp, + BZ2Decompressor, trailing_error=OSError) + self._buffer = io.BufferedReader(raw) + else: + self._pos = 0 + def close(self): """Flush and close the file. 
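For reference, the layering that BZ2File.__init__ now sets up can be exercised on its own: DecompressReader adapts the incremental decompressor to an unbuffered io.RawIOBase, and io.BufferedReader layers read1(), peek(), readline() and seek() on top of it. A minimal sketch, not part of the patch ("example.bz2" is an assumed sample file):

    import io
    import _compression
    from _bz2 import BZ2Decompressor

    with open("example.bz2", "rb") as fp:
        # Raw side: decompresses on demand, reading one BUFFER_SIZE chunk
        # of compressed input at a time.
        raw = _compression.DecompressReader(fp, BZ2Decompressor,
                                            trailing_error=OSError)
        # Buffered side: supplies the rich BufferedIOBase interface.
        with io.BufferedReader(raw) as f:
            print(f.readline())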
@@ -117,8 +118,8 @@ class BZ2File(io.BufferedIOBase): if self._mode == _MODE_CLOSED: return try: - if self._mode in (_MODE_READ, _MODE_READ_EOF): - self._decompressor = None + if self._mode == _MODE_READ: + self._buffer.close() elif self._mode == _MODE_WRITE: self._fp.write(self._compressor.flush()) self._compressor = None @@ -130,8 +131,7 @@ class BZ2File(io.BufferedIOBase): self._fp = None self._closefp = False self._mode = _MODE_CLOSED - self._buffer = b"" - self._buffer_offset = 0 + self._buffer = None @property def closed(self): @@ -145,125 +145,18 @@ class BZ2File(io.BufferedIOBase): def seekable(self): """Return whether the file supports seeking.""" - return self.readable() and self._fp.seekable() + return self.readable() and self._buffer.seekable() def readable(self): """Return whether the file was opened for reading.""" self._check_not_closed() - return self._mode in (_MODE_READ, _MODE_READ_EOF) + return self._mode == _MODE_READ def writable(self): """Return whether the file was opened for writing.""" self._check_not_closed() return self._mode == _MODE_WRITE - # Mode-checking helper functions. - - def _check_not_closed(self): - if self.closed: - raise ValueError("I/O operation on closed file") - - def _check_can_read(self): - if self._mode not in (_MODE_READ, _MODE_READ_EOF): - self._check_not_closed() - raise io.UnsupportedOperation("File not open for reading") - - def _check_can_write(self): - if self._mode != _MODE_WRITE: - self._check_not_closed() - raise io.UnsupportedOperation("File not open for writing") - - def _check_can_seek(self): - if self._mode not in (_MODE_READ, _MODE_READ_EOF): - self._check_not_closed() - raise io.UnsupportedOperation("Seeking is only supported " - "on files open for reading") - if not self._fp.seekable(): - raise io.UnsupportedOperation("The underlying file object " - "does not support seeking") - - # Fill the readahead buffer if it is empty. Returns False on EOF. - def _fill_buffer(self): - if self._mode == _MODE_READ_EOF: - return False - # Depending on the input data, our call to the decompressor may not - # return any data. In this case, try again after reading another block. - while self._buffer_offset == len(self._buffer): - rawblock = (self._decompressor.unused_data or - self._fp.read(_BUFFER_SIZE)) - - if not rawblock: - if self._decompressor.eof: - # End-of-stream marker and end of file. We're good. - self._mode = _MODE_READ_EOF - self._size = self._pos - return False - else: - # Problem - we were expecting more compressed data. - raise EOFError("Compressed file ended before the " - "end-of-stream marker was reached") - - if self._decompressor.eof: - # Continue to next stream. - self._decompressor = BZ2Decompressor() - try: - self._buffer = self._decompressor.decompress(rawblock) - except OSError: - # Trailing data isn't a valid bzip2 stream. We're done here. - self._mode = _MODE_READ_EOF - self._size = self._pos - return False - else: - self._buffer = self._decompressor.decompress(rawblock) - self._buffer_offset = 0 - return True - - # Read data until EOF. - # If return_data is false, consume the data without returning it. - def _read_all(self, return_data=True): - # The loop assumes that _buffer_offset is 0. Ensure that this is true. - self._buffer = self._buffer[self._buffer_offset:] - self._buffer_offset = 0 - - blocks = [] - while self._fill_buffer(): - if return_data: - blocks.append(self._buffer) - self._pos += len(self._buffer) - self._buffer = b"" - if return_data: - return b"".join(blocks) - - # Read a block of up to n bytes. 
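The mode-checking helpers removed above now live in _compression.BaseStream, so the user-visible exceptions are unchanged. A quick sketch of that contract, assuming "example.bz2" exists:

    import io
    import bz2

    f = bz2.BZ2File("example.bz2")   # opened for reading
    try:
        f.write(b"data")             # fails _check_can_write()
    except io.UnsupportedOperation:
        pass
    f.close()
    try:
        f.read()                     # fails _check_not_closed()
    except ValueError:
        pass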
- # If return_data is false, consume the data without returning it. - def _read_block(self, n, return_data=True): - # If we have enough data buffered, return immediately. - end = self._buffer_offset + n - if end <= len(self._buffer): - data = self._buffer[self._buffer_offset : end] - self._buffer_offset = end - self._pos += len(data) - return data if return_data else None - - # The loop assumes that _buffer_offset is 0. Ensure that this is true. - self._buffer = self._buffer[self._buffer_offset:] - self._buffer_offset = 0 - - blocks = [] - while n > 0 and self._fill_buffer(): - if n < len(self._buffer): - data = self._buffer[:n] - self._buffer_offset = n - else: - data = self._buffer - self._buffer = b"" - if return_data: - blocks.append(data) - self._pos += len(data) - n -= len(data) - if return_data: - return b"".join(blocks) - def peek(self, n=0): """Return buffered data without advancing the file position. @@ -272,9 +165,10 @@ class BZ2File(io.BufferedIOBase): """ with self._lock: self._check_can_read() - if not self._fill_buffer(): - return b"" - return self._buffer[self._buffer_offset:] + # Relies on the undocumented fact that BufferedReader.peek() + # always returns at least one byte (except at EOF), independent + # of the value of n + return self._buffer.peek(n) def read(self, size=-1): """Read up to size uncompressed bytes from the file. @@ -284,47 +178,29 @@ class BZ2File(io.BufferedIOBase): """ with self._lock: self._check_can_read() - if size == 0: - return b"" - elif size < 0: - return self._read_all() - else: - return self._read_block(size) + return self._buffer.read(size) def read1(self, size=-1): """Read up to size uncompressed bytes, while trying to avoid - making multiple reads from the underlying stream. + making multiple reads from the underlying stream. Reads up to a + buffer's worth of data if size is negative. Returns b'' if the file is at EOF. """ - # Usually, read1() calls _fp.read() at most once. However, sometimes - # this does not give enough data for the decompressor to make progress. - # In this case we make multiple reads, to avoid returning b"". with self._lock: self._check_can_read() - if (size == 0 or - # Only call _fill_buffer() if the buffer is actually empty. - # This gives a significant speedup if *size* is small. - (self._buffer_offset == len(self._buffer) and not self._fill_buffer())): - return b"" - if size > 0: - data = self._buffer[self._buffer_offset : - self._buffer_offset + size] - self._buffer_offset += len(data) - else: - data = self._buffer[self._buffer_offset:] - self._buffer = b"" - self._buffer_offset = 0 - self._pos += len(data) - return data + if size < 0: + size = io.DEFAULT_BUFFER_SIZE + return self._buffer.read1(size) def readinto(self, b): - """Read up to len(b) bytes into b. + """Read bytes into b. Returns the number of bytes read (0 for EOF). """ with self._lock: - return io.BufferedIOBase.readinto(self, b) + self._check_can_read() + return self._buffer.readinto(b) def readline(self, size=-1): """Read a line of uncompressed bytes from the file. @@ -339,15 +215,7 @@ class BZ2File(io.BufferedIOBase): size = size.__index__() with self._lock: self._check_can_read() - # Shortcut for the common case - the whole line is in the buffer. 
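The read methods above now simply delegate to the internal BufferedReader, with one behavioural refinement: a negative size to read1() maps to one buffer's worth (io.DEFAULT_BUFFER_SIZE) instead of whatever happened to be buffered. A hedged sketch, again assuming "example.bz2" exists:

    import io
    import bz2

    with bz2.BZ2File("example.bz2") as f:
        head = f.peek()                    # at least 1 byte unless at EOF
        assert f.read(len(head)) == head   # peek did not move the position
        chunk = f.read1(-1)                # at most one buffer's worth
        assert len(chunk) <= io.DEFAULT_BUFFER_SIZE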
- if size < 0: - end = self._buffer.find(b"\n", self._buffer_offset) + 1 - if end > 0: - line = self._buffer[self._buffer_offset : end] - self._buffer_offset = end - self._pos += len(line) - return line - return io.BufferedIOBase.readline(self, size) + return self._buffer.readline(size) def readlines(self, size=-1): """Read a list of lines of uncompressed bytes from the file. @@ -361,7 +229,8 @@ class BZ2File(io.BufferedIOBase): raise TypeError("Integer argument expected") size = size.__index__() with self._lock: - return io.BufferedIOBase.readlines(self, size) + self._check_can_read() + return self._buffer.readlines(size) def write(self, data): """Write a byte string to the file. @@ -386,18 +255,9 @@ class BZ2File(io.BufferedIOBase): Line separators are not added between the written byte strings. """ with self._lock: - return io.BufferedIOBase.writelines(self, seq) + return _compression.BaseStream.writelines(self, seq) - # Rewind the file to the beginning of the data stream. - def _rewind(self): - self._fp.seek(0, 0) - self._mode = _MODE_READ - self._pos = 0 - self._decompressor = BZ2Decompressor() - self._buffer = b"" - self._buffer_offset = 0 - - def seek(self, offset, whence=0): + def seek(self, offset, whence=io.SEEK_SET): """Change the file position. The new position is specified by offset, relative to the @@ -414,35 +274,14 @@ class BZ2File(io.BufferedIOBase): """ with self._lock: self._check_can_seek() - - # Recalculate offset as an absolute file position. - if whence == 0: - pass - elif whence == 1: - offset = self._pos + offset - elif whence == 2: - # Seeking relative to EOF - we need to know the file's size. - if self._size < 0: - self._read_all(return_data=False) - offset = self._size + offset - else: - raise ValueError("Invalid value for whence: %s" % (whence,)) - - # Make it so that offset is the number of bytes to skip forward. - if offset < self._pos: - self._rewind() - else: - offset -= self._pos - - # Read and discard data until we reach the desired position. 
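Seeking keeps its old emulated semantics, but the mechanics now live in DecompressReader.seek(): forward seeks decompress and discard, while backward seeks rewind to the start of the stream and decompress from scratch. A sketch of the cost model, assuming "example.bz2" exists:

    import io
    import bz2

    with bz2.BZ2File("example.bz2") as f:
        f.seek(1024)                  # decompresses and discards 1024 bytes
        f.seek(0)                     # rewinds; decompressor state is reset
        end = f.seek(0, io.SEEK_END)  # decompresses the whole file once
                                      # to discover its size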
- self._read_block(offset, return_data=False) - - return self._pos + return self._buffer.seek(offset, whence) def tell(self): """Return the current file position.""" with self._lock: self._check_not_closed() + if self._mode == _MODE_READ: + return self._buffer.tell() return self._pos diff -r bc2a22eaa0af Lib/gzip.py --- a/Lib/gzip.py Mon Mar 30 03:21:06 2015 +0200 +++ b/Lib/gzip.py Mon Mar 30 02:58:11 2015 +0000 @@ -9,6 +9,7 @@ import struct, sys, time, os import zlib import builtins import io +import _compression __all__ = ["GzipFile", "open", "compress", "decompress"] @@ -89,49 +90,35 @@ class _PaddedFile: return self._buffer[read:] + \ self.file.read(size-self._length+read) - def prepend(self, prepend=b'', readprevious=False): + def prepend(self, prepend=b''): if self._read is None: self._buffer = prepend - elif readprevious and len(prepend) <= self._read: + else: # Assume data was read since the last prepend() call self._read -= len(prepend) return - else: - self._buffer = self._buffer[self._read:] + prepend self._length = len(self._buffer) self._read = 0 - def unused(self): - if self._read is None: - return b'' - return self._buffer[self._read:] - - def seek(self, offset, whence=0): - # This is only ever called with offset=whence=0 - if whence == 1 and self._read is not None: - if 0 <= offset + self._read <= self._length: - self._read += offset - return - else: - offset += self._length - self._read + def seek(self, off): self._read = None self._buffer = None - return self.file.seek(offset, whence) + return self.file.seek(off) - def __getattr__(self, name): - return getattr(self.file, name) + def seekable(self): + return True # Allows fast-forwarding even in unseekable streams - -class GzipFile(io.BufferedIOBase): +class GzipFile(_compression.BaseStream): """The GzipFile class simulates most of the methods of a file object with - the exception of the readinto() and truncate() methods. + the exception of the truncate() method. This class only supports opening files in binary mode. If you need to open a compressed file in text mode, use the gzip.open() function. """ + # Overridden with internal file object to be closed, if only a filename + # is passed in myfileobj = None - max_read_chunk = 10 * 1024 * 1024 # 10Mb def __init__(self, filename=None, mode=None, compresslevel=9, fileobj=None, mtime=None): @@ -163,13 +150,8 @@ class GzipFile(io.BufferedIOBase): at all. The default is 9. The mtime argument is an optional numeric timestamp to be written - to the stream when compressing. All gzip compressed streams - are required to contain a timestamp. If omitted or None, the - current time is used. This module ignores the timestamp when - decompressing; however, some programs, such as gunzip, make use - of it. The format of the timestamp is the same as that of the - return value of time.time() and of the st_mtime member of the - object returned by os.stat(). + to the last modification time field in the stream when compressing. + If omitted or None, the current time is used. """ @@ -188,18 +170,9 @@ class GzipFile(io.BufferedIOBase): if mode.startswith('r'): self.mode = READ - # Set flag indicating start of a new member - self._new_member = True - # Buffer data read from gzip file. extrastart is offset in - # stream where buffer starts. extrasize is number of - # bytes remaining in buffer from current stream position. 
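The slimmed-down _PaddedFile above keeps only what the new gzip raw reader needs: read(), a prepend() that pushes back bytes obtained from the preceding read, and a seek() used solely to rewind. A sketch of that contract (internal helper, not public API):

    import io
    from gzip import _PaddedFile

    pf = _PaddedFile(io.BytesIO(b"abcdef"))
    data = pf.read(4)          # b"abcd", straight from the file
    pf.prepend(data[2:])       # push back b"cd" for re-reading
    assert pf.read(2) == b"cd"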
- self.extrabuf = b"" - self.extrasize = 0 - self.extrastart = 0 + raw = _GzipReader(fileobj) + self._buffer = io.BufferedReader(raw) self.name = filename - # Starts small, scales exponentially - self.min_readsize = 100 - fileobj = _PaddedFile(fileobj) elif mode.startswith(('w', 'a', 'x')): self.mode = WRITE @@ -209,12 +182,11 @@ class GzipFile(io.BufferedIOBase): -zlib.MAX_WBITS, zlib.DEF_MEM_LEVEL, 0) + self._write_mtime = mtime else: raise ValueError("Invalid mode: {!r}".format(mode)) self.fileobj = fileobj - self.offset = 0 - self.mtime = mtime if self.mode == WRITE: self._write_gzip_header() @@ -227,26 +199,22 @@ class GzipFile(io.BufferedIOBase): return self.name + ".gz" return self.name + @property + def mtime(self): + """Last modification time read from stream, or None""" + return self._buffer.raw._last_mtime + def __repr__(self): - fileobj = self.fileobj - if isinstance(fileobj, _PaddedFile): - fileobj = fileobj.file - s = repr(fileobj) + s = repr(self.fileobj) return '' - def _check_closed(self): - """Raises a ValueError if the underlying file object has been closed. - - """ - if self.closed: - raise ValueError('I/O operation on closed file.') - def _init_write(self, filename): self.name = filename self.crc = zlib.crc32(b"") & 0xffffffff self.size = 0 self.writebuf = [] self.bufsize = 0 + self.offset = 0 # Current file offset for seek(), tell(), etc def _write_gzip_header(self): self.fileobj.write(b'\037\213') # magic header @@ -265,7 +233,7 @@ class GzipFile(io.BufferedIOBase): if fname: flags = FNAME self.fileobj.write(chr(flags).encode('latin-1')) - mtime = self.mtime + mtime = self._write_mtime if mtime is None: mtime = time.time() write32u(self.fileobj, int(mtime)) @@ -274,59 +242,8 @@ class GzipFile(io.BufferedIOBase): if fname: self.fileobj.write(fname + b'\000') - def _init_read(self): - self.crc = zlib.crc32(b"") & 0xffffffff - self.size = 0 - - def _read_exact(self, n): - data = self.fileobj.read(n) - while len(data) < n: - b = self.fileobj.read(n - len(data)) - if not b: - raise EOFError("Compressed file ended before the " - "end-of-stream marker was reached") - data += b - return data - - def _read_gzip_header(self): - magic = self.fileobj.read(2) - if magic == b'': - return False - - if magic != b'\037\213': - raise OSError('Not a gzipped file') - - method, flag, self.mtime = struct.unpack(" self.extrasize: - if not self._read(readsize): - if size > self.extrasize: - size = self.extrasize - break - readsize = min(self.max_read_chunk, readsize * 2) - - offset = self.offset - self.extrastart - chunk = self.extrabuf[offset: offset + size] - self.extrasize = self.extrasize - size - - self.offset += size - return chunk + return self._buffer.read(size) def read1(self, size=-1): - self._check_closed() + """Implements BufferedIOBase.read1() + + Reads up to a buffer's worth of data is size is negative.""" + self._check_not_closed() if self.mode != READ: import errno raise OSError(errno.EBADF, "read1() on write-only GzipFile object") - if self.extrasize <= 0 and self.fileobj is None: - return b'' - - # For certain input data, a single call to _read() may not return - # any data. In this case, retry until we get some data or reach EOF. 
- while self.extrasize <= 0 and self._read(): - pass - if size < 0 or size > self.extrasize: - size = self.extrasize - - offset = self.offset - self.extrastart - chunk = self.extrabuf[offset: offset + size] - self.extrasize -= size - self.offset += size - return chunk + if size < 0: + size = io.DEFAULT_BUFFER_SIZE + return self._buffer.read1(size) def peek(self, n): + self._check_not_closed() if self.mode != READ: import errno raise OSError(errno.EBADF, "peek() on write-only GzipFile object") - - # Do not return ridiculously small buffers, for one common idiom - # is to call peek(1) and expect more bytes in return. - if n < 100: - n = 100 - if self.extrasize == 0: - if self.fileobj is None: - return b'' - # Ensure that we don't return b"" if we haven't reached EOF. - # 1024 is the same buffering heuristic used in read() - while self.extrasize == 0 and self._read(max(n, 1024)): - pass - offset = self.offset - self.extrastart - remaining = self.extrasize - assert remaining == len(self.extrabuf) - offset - return self.extrabuf[offset:offset + n] - - def _unread(self, buf): - self.extrasize = len(buf) + self.extrasize - self.offset -= len(buf) - - def _read(self, size=1024): - if self.fileobj is None: - return False - - if self._new_member: - # If the _new_member flag is set, we have to - # jump to the next member, if there is one. - self._init_read() - if not self._read_gzip_header(): - return False - self.decompress = zlib.decompressobj(-zlib.MAX_WBITS) - self._new_member = False - - # Read a chunk of data from the file - buf = self.fileobj.read(size) - - # If the EOF has been reached, flush the decompression object - # and mark this object as finished. - - if buf == b"": - uncompress = self.decompress.flush() - # Prepend the already read bytes to the fileobj to they can be - # seen by _read_eof() - self.fileobj.prepend(self.decompress.unused_data, True) - self._read_eof() - self._add_read_data( uncompress ) - return False - - uncompress = self.decompress.decompress(buf) - self._add_read_data( uncompress ) - - if self.decompress.unused_data != b"": - # Ending case: we've come to the end of a member in the file, - # so seek back to the start of the unused data, finish up - # this member, and read a new gzip header. - # Prepend the already read bytes to the fileobj to they can be - # seen by _read_eof() and _read_gzip_header() - self.fileobj.prepend(self.decompress.unused_data, True) - # Check the CRC and file size, and set the flag so we read - # a new member on the next call - self._read_eof() - self._new_member = True - return True - - def _add_read_data(self, data): - self.crc = zlib.crc32(data, self.crc) & 0xffffffff - offset = self.offset - self.extrastart - self.extrabuf = self.extrabuf[offset:] + data - self.extrasize = self.extrasize + len(data) - self.extrastart = self.offset - self.size = self.size + len(data) - - def _read_eof(self): - # We've read to the end of the file - # We check the that the computed CRC and size of the - # uncompressed data matches the stored values. Note that the size - # stored is the true file size mod 2**32. 
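Note that gzip's wrong-mode checks still raise OSError with errno.EBADF, unlike bz2 and lzma, which use io.UnsupportedOperation; the rewrite preserves that quirk. For instance (assuming "example.gz" is a writable path):

    import errno
    import gzip

    f = gzip.GzipFile("example.gz", "w")
    try:
        f.peek(1)                # peek() on a write-only GzipFile
    except OSError as e:
        assert e.errno == errno.EBADF
    f.close()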
- crc32, isize = struct.unpack(" 0: - self.extrasize -= i - offset - self.offset += i - offset - return self.extrabuf[offset: i] + return self.readall() + # size=0 is special because decompress(max_length=0) is not supported + if not size: + return b"" - size = sys.maxsize - readsize = self.min_readsize - else: - readsize = size - bufs = [] - while size != 0: - c = self.read(readsize) - i = c.find(b'\n') + # For certain input data, a single + # call to decompress() may not return + # any data. In this case, retry until we get some data or reach EOF. + while True: + if self._decompressor.eof: + # Ending case: we've come to the end of a member in the file, + # so finish up this member, and read a new gzip header. + # Check the CRC and file size, and set the flag so we read + # a new member + self._read_eof() + self._new_member = True + self._decompressor = self._decomp_factory( + **self._decomp_args) - # We set i=size to break out of the loop under two - # conditions: 1) there's no newline, and the chunk is - # larger than size, or 2) there is a newline, but the - # resulting line would be longer than 'size'. - if (size <= i) or (i == -1 and len(c) > size): - i = size - 1 + if self._new_member: + # If the _new_member flag is set, we have to + # jump to the next member, if there is one. + self._init_read() + if not self._read_gzip_header(): + self._size = self._pos + return b"" + self._new_member = False - if i >= 0 or c == b'': - bufs.append(c[:i + 1]) # Add portion of last chunk - self._unread(c[i + 1:]) # Push back rest of chunk + # Read a chunk of data from the file + buf = self._fp.read(io.DEFAULT_BUFFER_SIZE) + + uncompress = self._decompressor.decompress(buf, size) + if self._decompressor.unconsumed_tail != b"": + self._fp.prepend(self._decompressor.unconsumed_tail) + elif self._decompressor.unused_data != b"": + # Prepend the already read bytes to the fileobj so they can + # be seen by _read_eof() and _read_gzip_header() + self._fp.prepend(self._decompressor.unused_data) + + if uncompress != b"": break + if buf == b"": + raise EOFError("Compressed file ended before the " + "end-of-stream marker was reached") - # Append chunk to list, decrease 'size', - bufs.append(c) - size = size - len(c) - readsize = min(size, readsize * 2) - if readsize > self.min_readsize: - self.min_readsize = min(readsize, self.min_readsize * 2, 512) - return b''.join(bufs) # Return resulting line + self._add_read_data( uncompress ) + self._pos += len(uncompress) + return uncompress + def _add_read_data(self, data): + self._crc = zlib.crc32(data, self._crc) & 0xffffffff + self._stream_size = self._stream_size + len(data) + def _read_eof(self): + # We've read to the end of the file + # We check the that the computed CRC and size of the + # uncompressed data matches the stored values. Note that the size + # stored is the true file size mod 2**32. + crc32, isize = struct.unpack(" 0 and self._fill_buffer(): - if n < len(self._buffer): - data = self._buffer[:n] - self._buffer_offset = n - else: - data = self._buffer - self._buffer = b"" - if return_data: - blocks.append(data) - self._pos += len(data) - n -= len(data) - if return_data: - return b"".join(blocks) - def peek(self, size=-1): """Return buffered data without advancing the file position. @@ -293,9 +184,9 @@ class LZMAFile(io.BufferedIOBase): The exact number of bytes returned is unspecified. 
""" self._check_can_read() - if not self._fill_buffer(): - return b"" - return self._buffer[self._buffer_offset:] + # Relies on the undocumented fact that BufferedReader.peek() always + # returns at least one byte (except at EOF) + return self._buffer.peek(size) def read(self, size=-1): """Read up to size uncompressed bytes from the file. @@ -304,38 +195,19 @@ class LZMAFile(io.BufferedIOBase): Returns b"" if the file is already at EOF. """ self._check_can_read() - if size == 0: - return b"" - elif size < 0: - return self._read_all() - else: - return self._read_block(size) + return self._buffer.read(size) def read1(self, size=-1): """Read up to size uncompressed bytes, while trying to avoid - making multiple reads from the underlying stream. + making multiple reads from the underlying stream. Reads up to a + buffer's worth of data if size is negative. Returns b"" if the file is at EOF. """ - # Usually, read1() calls _fp.read() at most once. However, sometimes - # this does not give enough data for the decompressor to make progress. - # In this case we make multiple reads, to avoid returning b"". self._check_can_read() - if (size == 0 or - # Only call _fill_buffer() if the buffer is actually empty. - # This gives a significant speedup if *size* is small. - (self._buffer_offset == len(self._buffer) and not self._fill_buffer())): - return b"" - if size > 0: - data = self._buffer[self._buffer_offset : - self._buffer_offset + size] - self._buffer_offset += len(data) - else: - data = self._buffer[self._buffer_offset:] - self._buffer = b"" - self._buffer_offset = 0 - self._pos += len(data) - return data + if size < 0: + size = io.DEFAULT_BUFFER_SIZE + return self._buffer.read1(size) def readline(self, size=-1): """Read a line of uncompressed bytes from the file. @@ -345,15 +217,7 @@ class LZMAFile(io.BufferedIOBase): case the line may be incomplete). Returns b'' if already at EOF. """ self._check_can_read() - # Shortcut for the common case - the whole line is in the buffer. - if size < 0: - end = self._buffer.find(b"\n", self._buffer_offset) + 1 - if end > 0: - line = self._buffer[self._buffer_offset : end] - self._buffer_offset = end - self._pos += len(line) - return line - return io.BufferedIOBase.readline(self, size) + return self._buffer.readline(size) def write(self, data): """Write a bytes object to the file. @@ -368,16 +232,7 @@ class LZMAFile(io.BufferedIOBase): self._pos += len(data) return len(data) - # Rewind the file to the beginning of the data stream. - def _rewind(self): - self._fp.seek(0, 0) - self._mode = _MODE_READ - self._pos = 0 - self._decompressor = LZMADecompressor(**self._init_args) - self._buffer = b"" - self._buffer_offset = 0 - - def seek(self, offset, whence=0): + def seek(self, offset, whence=io.SEEK_SET): """Change the file position. The new position is specified by offset, relative to the @@ -389,38 +244,17 @@ class LZMAFile(io.BufferedIOBase): Returns the new file position. - Note that seeking is emulated, sp depending on the parameters, + Note that seeking is emulated, so depending on the parameters, this operation may be extremely slow. """ self._check_can_seek() - - # Recalculate offset as an absolute file position. - if whence == 0: - pass - elif whence == 1: - offset = self._pos + offset - elif whence == 2: - # Seeking relative to EOF - we need to know the file's size. 
- if self._size < 0: - self._read_all(return_data=False) - offset = self._size + offset - else: - raise ValueError("Invalid value for whence: {}".format(whence)) - - # Make it so that offset is the number of bytes to skip forward. - if offset < self._pos: - self._rewind() - else: - offset -= self._pos - - # Read and discard data until we reach the desired position. - self._read_block(offset, return_data=False) - - return self._pos + return self._buffer.seek(offset, whence) def tell(self): """Return the current file position.""" self._check_not_closed() + if self._mode == _MODE_READ: + return self._buffer.tell() return self._pos diff -r bc2a22eaa0af Lib/test/test_bz2.py --- a/Lib/test/test_bz2.py Mon Mar 30 03:21:06 2015 +0200 +++ b/Lib/test/test_bz2.py Mon Mar 30 02:58:11 2015 +0000 @@ -2,7 +2,7 @@ from test import support from test.support import bigmemtest, _4G import unittest -from io import BytesIO +from io import BytesIO, DEFAULT_BUFFER_SIZE import os import pickle import glob @@ -10,6 +10,7 @@ import random import subprocess import sys from test.support import unlink +import _compression try: import threading @@ -110,7 +111,7 @@ class BZ2FileTest(BaseTest): def testRead(self): self.createTempFile() with BZ2File(self.filename) as bz2f: - self.assertRaises(TypeError, bz2f.read, None) + self.assertRaises(TypeError, bz2f.read, float()) self.assertEqual(bz2f.read(), self.TEXT) def testReadBadFile(self): @@ -121,21 +122,21 @@ class BZ2FileTest(BaseTest): def testReadMultiStream(self): self.createTempFile(streams=5) with BZ2File(self.filename) as bz2f: - self.assertRaises(TypeError, bz2f.read, None) + self.assertRaises(TypeError, bz2f.read, float()) self.assertEqual(bz2f.read(), self.TEXT * 5) def testReadMonkeyMultiStream(self): # Test BZ2File.read() on a multi-stream archive where a stream # boundary coincides with the end of the raw read buffer. 
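The multi-stream tests below pin _compression.BUFFER_SIZE, the module-level chunk size that DecompressReader reads per call, so that a stream boundary falls exactly on a buffer edge. The pattern in isolation:

    import _compression

    saved = _compression.BUFFER_SIZE
    _compression.BUFFER_SIZE = 4096   # hypothetical stream length
    try:
        pass   # exercise BZ2File / GzipFile / LZMAFile here
    finally:
        _compression.BUFFER_SIZE = saved

This works because DecompressReader looks the global up on every read call rather than binding it once.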
- buffer_size = bz2._BUFFER_SIZE - bz2._BUFFER_SIZE = len(self.DATA) + buffer_size = _compression.BUFFER_SIZE + _compression.BUFFER_SIZE = len(self.DATA) try: self.createTempFile(streams=5) with BZ2File(self.filename) as bz2f: - self.assertRaises(TypeError, bz2f.read, None) + self.assertRaises(TypeError, bz2f.read, float()) self.assertEqual(bz2f.read(), self.TEXT * 5) finally: - bz2._BUFFER_SIZE = buffer_size + _compression.BUFFER_SIZE = buffer_size def testReadTrailingJunk(self): self.createTempFile(suffix=self.BAD_DATA) @@ -150,7 +151,7 @@ class BZ2FileTest(BaseTest): def testRead0(self): self.createTempFile() with BZ2File(self.filename) as bz2f: - self.assertRaises(TypeError, bz2f.read, None) + self.assertRaises(TypeError, bz2f.read, float()) self.assertEqual(bz2f.read(0), b"") def testReadChunk10(self): @@ -561,13 +562,24 @@ class BZ2FileTest(BaseTest): with BZ2File(str_filename, "rb") as f: self.assertEqual(f.read(), self.DATA) + def testDecompressLimited(self): + """Decompressed data buffering should be limited""" + bomb = bz2.compress(bytes(int(2e6)), compresslevel=9) + self.assertLess(len(bomb), _compression.BUFFER_SIZE) + + decomp = BZ2File(BytesIO(bomb)) + self.assertEqual(bytes(1), decomp.read(1)) + max_decomp = 1 + DEFAULT_BUFFER_SIZE + self.assertLessEqual(decomp._buffer.raw.tell(), max_decomp, + "Excessive amount of data was decompressed") + # Tests for a BZ2File wrapping another file object: def testReadBytesIO(self): with BytesIO(self.DATA) as bio: with BZ2File(bio) as bz2f: - self.assertRaises(TypeError, bz2f.read, None) + self.assertRaises(TypeError, bz2f.read, float()) self.assertEqual(bz2f.read(), self.TEXT) self.assertFalse(bio.closed) diff -r bc2a22eaa0af Lib/test/test_gzip.py --- a/Lib/test/test_gzip.py Mon Mar 30 03:21:06 2015 +0200 +++ b/Lib/test/test_gzip.py Mon Mar 30 02:58:11 2015 +0000 @@ -123,7 +123,10 @@ class TestGzip(BaseTest): # Write to a file, open it for reading, then close it. self.test_write() f = gzip.GzipFile(self.filename, 'r') + fileobj = f.fileobj + self.assertFalse(fileobj.closed) f.close() + self.assertTrue(fileobj.closed) with self.assertRaises(ValueError): f.read(1) with self.assertRaises(ValueError): @@ -132,7 +135,10 @@ class TestGzip(BaseTest): f.tell() # Open the file for writing, then close it. 
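testDecompressLimited above works because DecompressReader forwards the requested size to the decompressor's max_length argument, so a one-byte read of a decompression bomb produces at most one buffer of output. The underlying primitive, in isolation:

    import bz2

    comp = bz2.compress(bytes(100000))
    d = bz2.BZ2Decompressor()
    out = d.decompress(comp, max_length=10)   # at most 10 bytes out
    assert len(out) == 10
    assert not d.needs_input   # surplus input is held by the decompressor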
f = gzip.GzipFile(self.filename, 'w') + fileobj = f.fileobj + self.assertFalse(fileobj.closed) f.close() + self.assertTrue(fileobj.closed) with self.assertRaises(ValueError): f.write(b'') with self.assertRaises(ValueError): @@ -271,9 +277,10 @@ class TestGzip(BaseTest): with gzip.GzipFile(self.filename, 'w', mtime = mtime) as fWrite: fWrite.write(data1) with gzip.GzipFile(self.filename) as fRead: + self.assertTrue(hasattr(fRead, 'mtime')) + self.assertIsNone(fRead.mtime) dataRead = fRead.read() self.assertEqual(dataRead, data1) - self.assertTrue(hasattr(fRead, 'mtime')) self.assertEqual(fRead.mtime, mtime) def test_metadata(self): @@ -416,6 +423,18 @@ class TestGzip(BaseTest): with gzip.GzipFile(str_filename, "rb") as f: self.assertEqual(f.read(), data1 * 50) + def test_decompress_limited(self): + """Decompressed data buffering should be limited""" + bomb = gzip.compress(bytes(int(2e6)), compresslevel=9) + self.assertLess(len(bomb), io.DEFAULT_BUFFER_SIZE) + + bomb = io.BytesIO(bomb) + decomp = gzip.GzipFile(fileobj=bomb) + self.assertEqual(bytes(1), decomp.read(1)) + max_decomp = 1 + io.DEFAULT_BUFFER_SIZE + self.assertLessEqual(decomp._buffer.raw.tell(), max_decomp, + "Excessive amount of data was decompressed") + # Testing compress/decompress shortcut functions def test_compress(self): @@ -463,7 +482,7 @@ class TestGzip(BaseTest): with gzip.open(self.filename, "wb") as f: f.write(data1) with gzip.open(self.filename, "rb") as f: - f.fileobj.prepend() + f._buffer.raw._fp.prepend() class TestOpen(BaseTest): def test_binary_modes(self): diff -r bc2a22eaa0af Lib/test/test_lzma.py --- a/Lib/test/test_lzma.py Mon Mar 30 03:21:06 2015 +0200 +++ b/Lib/test/test_lzma.py Mon Mar 30 02:58:11 2015 +0000 @@ -1,4 +1,5 @@ -from io import BytesIO, UnsupportedOperation +import _compression +from io import BytesIO, UnsupportedOperation, DEFAULT_BUFFER_SIZE import os import pickle import random @@ -772,13 +773,13 @@ class FileTestCase(unittest.TestCase): def test_read_multistream_buffer_size_aligned(self): # Test the case where a stream boundary coincides with the end # of the raw read buffer. 
- saved_buffer_size = lzma._BUFFER_SIZE - lzma._BUFFER_SIZE = len(COMPRESSED_XZ) + saved_buffer_size = _compression.BUFFER_SIZE + _compression.BUFFER_SIZE = len(COMPRESSED_XZ) try: with LZMAFile(BytesIO(COMPRESSED_XZ * 5)) as f: self.assertEqual(f.read(), INPUT * 5) finally: - lzma._BUFFER_SIZE = saved_buffer_size + _compression.BUFFER_SIZE = saved_buffer_size def test_read_trailing_junk(self): with LZMAFile(BytesIO(COMPRESSED_XZ + COMPRESSED_BOGUS)) as f: @@ -829,7 +830,7 @@ class FileTestCase(unittest.TestCase): with LZMAFile(BytesIO(), "w") as f: self.assertRaises(ValueError, f.read) with LZMAFile(BytesIO(COMPRESSED_XZ)) as f: - self.assertRaises(TypeError, f.read, None) + self.assertRaises(TypeError, f.read, float()) def test_read_bad_data(self): with LZMAFile(BytesIO(COMPRESSED_BOGUS)) as f: @@ -925,6 +926,17 @@ class FileTestCase(unittest.TestCase): with LZMAFile(BytesIO(COMPRESSED_XZ)) as f: self.assertListEqual(f.readlines(), lines) + def test_decompress_limited(self): + """Decompressed data buffering should be limited""" + bomb = lzma.compress(bytes(int(2e6)), preset=6) + self.assertLess(len(bomb), _compression.BUFFER_SIZE) + + decomp = LZMAFile(BytesIO(bomb)) + self.assertEqual(bytes(1), decomp.read(1)) + max_decomp = 1 + DEFAULT_BUFFER_SIZE + self.assertLessEqual(decomp._buffer.raw.tell(), max_decomp, + "Excessive amount of data was decompressed") + def test_write(self): with BytesIO() as dst: with LZMAFile(dst, "w") as f: @@ -1090,7 +1102,8 @@ class FileTestCase(unittest.TestCase): self.assertRaises(ValueError, f.seek, 0) with LZMAFile(BytesIO(COMPRESSED_XZ)) as f: self.assertRaises(ValueError, f.seek, 0, 3) - self.assertRaises(ValueError, f.seek, 9, ()) + # io.BufferedReader raises TypeError instead of ValueError + self.assertRaises((TypeError, ValueError), f.seek, 9, ()) self.assertRaises(TypeError, f.seek, None) self.assertRaises(TypeError, f.seek, b"derp")
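The relaxed assertion in the final hunk reflects a small behaviour change: whence validation now happens in io.BufferedReader, which raises TypeError for a non-integer whence where LZMAFile's old hand-rolled seek() raised ValueError for anything unrecognised. For example:

    import io
    import lzma

    f = lzma.LZMAFile(io.BytesIO(lzma.compress(b"data")))
    try:
        f.seek(9, ())            # invalid whence
    except (TypeError, ValueError):
        pass                     # TypeError with this patch
    f.close()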