# HG changeset patch # Parent 87c102d0df39a1fdde2d0f7f2de4284121dd44a6 Use max_length in LZMAFile, BZ2File, GzipFile decompression * Split out _DecompressReader and wrap it with BufferedReader to provide the read mode APIs of LZMAFile * The specification of the peek() method is vague * read() now accepts size=None, because BufferedReader does * BufferedReader.seek() raises a different exception for invalid “whence” * Work around different signature for BufferedReader.read1() * Fix documented GzipFile.peek(n) signature to match implementation * Removed unused and inconsistent code paths in gzip._PaddedFile * gzip module does not share the common _DecompressReader, but does use BufferedReader * zlib.decompressobj().flush() assumed to return a limited amount of data, since it has no “max_length” parameter * Added new _compression.py file for common base classes Not done: * open(buffering=...) parameter * open(buffering=0) returning unbuffered reader * detach() method on buffered reader * further factoring of a common CompressedFile base class * apply buffer size parameter to write mode diff -r 87c102d0df39 Doc/library/bz2.rst --- a/Doc/library/bz2.rst Mon Mar 16 12:45:27 2015 -0500 +++ b/Doc/library/bz2.rst Mon Mar 16 23:24:37 2015 +0000 @@ -58,7 +58,7 @@ The ``'x'`` (exclusive creation) mode was added. -.. class:: BZ2File(filename, mode='r', buffering=None, compresslevel=9) +.. class:: BZ2File(filename, mode='r', buffering=None, compresslevel=9, *, buffer_size=io.DEFAULT_BUFFER_SIZE) Open a bzip2-compressed file in binary mode. @@ -83,6 +83,10 @@ If *mode* is ``'r'``, the input file may be the concatenation of multiple compressed streams. + The *buffer_size* argument is only used in read mode. It must be an + integer, at least one, and gives the number of decompressed bytes that may + be buffered between reads. + :class:`BZ2File` provides all of the members specified by the :class:`io.BufferedIOBase`, except for :meth:`detach` and :meth:`truncate`. Iteration and the :keyword:`with` statement are supported. @@ -120,6 +124,9 @@ .. versionchanged:: 3.4 The ``'x'`` (exclusive creation) mode was added. + .. versionchanged:: 3.5 + The *buffer_size* constructor parameter was added. + Incremental (de)compression --------------------------- diff -r 87c102d0df39 Doc/library/gzip.rst --- a/Doc/library/gzip.rst Mon Mar 16 12:45:27 2015 -0500 +++ b/Doc/library/gzip.rst Mon Mar 16 23:24:37 2015 +0000 @@ -57,7 +57,7 @@ Added support for the ``'x'``, ``'xb'`` and ``'xt'`` modes. -.. class:: GzipFile(filename=None, mode=None, compresslevel=9, fileobj=None, mtime=None) +.. class:: GzipFile(filename=None, mode=None, compresslevel=9, fileobj=None, mtime=None, *, buffer_size=io.DEFAULT_BUFFER_SIZE) Constructor for the :class:`GzipFile` class, which simulates most of the methods of a :term:`file object`, with the exception of the :meth:`truncate` @@ -98,6 +98,10 @@ ``time.time()`` and of the ``st_mtime`` attribute of the object returned by ``os.stat()``. + The *buffer_size* argument is only used in read mode. It must be an + integer, at least one, and gives the number of decompressed bytes that may + be buffered between reads. + Calling a :class:`GzipFile` object's :meth:`close` method does not close *fileobj*, since you might wish to append more material after the compressed data. This also allows you to pass a :class:`io.BytesIO` object opened for @@ -110,7 +114,7 @@ :class:`GzipFile` also provides the following method: - .. method:: peek([n]) + .. 
method:: peek(n) Read *n* uncompressed bytes without advancing the file position. At most one single read on the compressed stream is done to satisfy @@ -126,7 +130,7 @@ .. versionchanged:: 3.1 Support for the :keyword:`with` statement was added, along with the - *mtime* argument. + *mtime* constructor parameter. .. versionchanged:: 3.2 Support for zero-padded and unseekable files was added. @@ -137,6 +141,9 @@ .. versionchanged:: 3.4 Added support for the ``'x'`` and ``'xb'`` modes. + .. versionchanged:: 3.5 + The *buffer_size* constructor parameter was added. + .. function:: compress(data, compresslevel=9) diff -r 87c102d0df39 Doc/library/lzma.rst --- a/Doc/library/lzma.rst Mon Mar 16 12:45:27 2015 -0500 +++ b/Doc/library/lzma.rst Mon Mar 16 23:24:37 2015 +0000 @@ -29,7 +29,7 @@ Reading and writing compressed files ------------------------------------ -.. function:: open(filename, mode="rb", \*, format=None, check=-1, preset=None, filters=None, encoding=None, errors=None, newline=None) +.. function:: open(filename, mode="rb", *, format=None, check=-1, preset=None, filters=None, encoding=None, errors=None, newline=None) Open an LZMA-compressed file in binary or text mode, returning a :term:`file object`. @@ -61,7 +61,7 @@ Added support for the ``"x"``, ``"xb"`` and ``"xt"`` modes. -.. class:: LZMAFile(filename=None, mode="r", \*, format=None, check=-1, preset=None, filters=None) +.. class:: LZMAFile(filename=None, mode="r", *, format=None, check=-1, preset=None, filters=None, buffer_size=io.DEFAULT_BUFFER_SIZE) Open an LZMA-compressed file in binary mode. @@ -90,6 +90,10 @@ When opening a file for writing, the *format*, *check*, *preset* and *filters* arguments have the same meanings as for :class:`LZMACompressor`. + The *buffer_size* argument is only used in read mode. It must be an + integer, at least one, and gives the number of decompressed bytes that may + be buffered between reads. + :class:`LZMAFile` supports all the members specified by :class:`io.BufferedIOBase`, except for :meth:`detach` and :meth:`truncate`. Iteration and the :keyword:`with` statement are supported. @@ -110,6 +114,9 @@ .. versionchanged:: 3.4 Added support for the ``"x"`` and ``"xb"`` modes. + .. versionchanged:: 3.5 + The *buffer_size* constructor parameter was added. 
+ Compressing and decompressing data in memory -------------------------------------------- diff -r 87c102d0df39 Lib/_compression.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Lib/_compression.py Mon Mar 16 23:24:37 2015 +0000 @@ -0,0 +1,150 @@ +"""Internal classes used by the gzip, lzma and bz2 modules""" + +import io + + +class BaseStream(io.IOBase): + """Mode-checking helper functions.""" + + def _check_not_closed(self): + if self.closed: + raise ValueError("I/O operation on closed file") + + def _check_can_read(self): + if not self.readable(): + raise io.UnsupportedOperation("File not open for reading") + + def _check_can_write(self): + if not self.writable(): + raise io.UnsupportedOperation("File not open for writing") + + def _check_can_seek(self): + if not self.seekable(): + raise io.UnsupportedOperation("The underlying file object " + "does not support seeking") + + +class DecompressReader(BaseStream, io.RawIOBase): + """Adapts the decompressor API to a RawIOBase reader API""" + + def readable(self): + return True + + def __init__(self, fp, chunk_size, + decomp_factory, trailing_error, **decomp_args): + self._fp = fp + self._chunk_size = chunk_size + self._eof = False + self._pos = 0 # Current offset in decompressed stream + + # Set to size of decompressed stream once it is known, for SEEK_END + self._size = -1 + + # Save the decompressor factory and arguments. + # If the file contains multiple compressed streams, each + # stream will need a separate decompressor object. A new decompressor + # object is also needed when implementing a backwards seek(). + self._decomp_factory = decomp_factory + self._decomp_args = decomp_args + self._decompressor = self._decomp_factory(**self._decomp_args) + + # Exception class to catch from decompressor signifying invalid + # trailing data to ignore + self._trailing_error = trailing_error + + def close(self): + self._decompressor = None + return super().close() + + def seekable(self): + return self._fp.seekable() + + def readinto(self, b): + with memoryview(b) as view, view.cast("B") as byte_view: + data = self.read(len(byte_view)) + byte_view[:len(data)] = data + return len(data) + + def read(self, size=-1): + if size < 0: + return self.readall() + + if not size or self._eof: + return b"" + # Depending on the input data, our call to the decompressor may not + # return any data. In this case, try again after reading another block. + while True: + if self._decompressor.eof: + rawblock = (self._decompressor.unused_data or + self._fp.read(self._chunk_size)) + if not rawblock: + self._eof = True + self._size = self._pos + return b"" + # Continue to next stream. + self._decompressor = self._decomp_factory( + **self._decomp_args) + try: + data = self._decompressor.decompress(rawblock, size) + except self._trailing_error: + # Trailing data isn't a valid compressed stream; ignore it. + self._eof = True + self._size = self._pos + return b"" + else: + if self._decompressor.needs_input: + rawblock = self._fp.read(self._chunk_size) + if not rawblock: + raise EOFError("Compressed file ended before the " + "end-of-stream marker was reached") + else: + rawblock = bytes() + data = self._decompressor.decompress(rawblock, size) + if data: + break + self._pos += len(data) + return data + + # Rewind the file to the beginning of the data stream. 
+ def _rewind(self): + self._fp.seek(0) + self._eof = False + self._pos = 0 + self._decompressor = self._decomp_factory(**self._decomp_args) + + def seek(self, offset, whence=io.SEEK_SET): + self._check_can_seek() + + # Recalculate offset as an absolute file position. + if whence == io.SEEK_SET: + pass + elif whence == io.SEEK_CUR: + offset = self._pos + offset + elif whence == io.SEEK_END: + # Seeking relative to EOF - we need to know the file's size. + if self._size < 0: + while self.read(self._chunk_size): + pass + offset = self._size + offset + else: + raise ValueError("Invalid value for whence: {}".format(whence)) + + # Make it so that offset is the number of bytes to skip forward. + if offset < self._pos: + self._rewind() + else: + offset -= self._pos + + # Read and discard data until we reach the desired position. + while offset > 0: + data = self.read(min(self._chunk_size, offset)) + if not data: + break + offset -= len(data) + + return self._pos + + def tell(self): + """Return the current file position.""" + self._check_not_closed() + return self._pos diff -r 87c102d0df39 Lib/bz2.py --- a/Lib/bz2.py Mon Mar 16 12:45:27 2015 -0500 +++ b/Lib/bz2.py Mon Mar 16 23:24:37 2015 +0000 @@ -12,6 +12,7 @@ from builtins import open as _builtin_open import io import warnings +import _compression try: from threading import RLock @@ -23,13 +24,11 @@ _MODE_CLOSED = 0 _MODE_READ = 1 -_MODE_READ_EOF = 2 +# Value 2 no longer used _MODE_WRITE = 3 -_BUFFER_SIZE = 8192 - -class BZ2File(io.BufferedIOBase): +class BZ2File(_compression.BaseStream, io.BufferedIOBase): """A file object providing transparent bzip2 (de)compression. @@ -40,7 +39,8 @@ returned as bytes, and data to be written should be given as bytes. """ - def __init__(self, filename, mode="r", buffering=None, compresslevel=9): + def __init__(self, filename, mode="r", buffering=None, compresslevel=9, + *, buffer_size=io.DEFAULT_BUFFER_SIZE): """Open a bzip2-compressed file. If filename is a str or bytes object, it gives the name @@ -59,15 +59,17 @@ If mode is 'r', the input file may be the concatenation of multiple compressed streams. + + The buffer_size argument is only used in read mode. It must be an + integer, at least one, and gives the number of decompressed bytes + that may be buffered between reads. """ # This lock must be recursive, so that BufferedIOBase's - # readline(), readlines() and writelines() don't deadlock. + # writelines() does not deadlock. self._lock = RLock() self._fp = None self._closefp = False self._mode = _MODE_CLOSED - self._pos = 0 - self._size = -1 if buffering is not None: warnings.warn("Use of 'buffering' argument is deprecated", @@ -79,9 +81,6 @@ if mode in ("", "r", "rb"): mode = "rb" mode_code = _MODE_READ - self._decompressor = BZ2Decompressor() - self._buffer = b"" - self._buffer_offset = 0 elif mode in ("w", "wb"): mode = "wb" mode_code = _MODE_WRITE @@ -107,6 +106,14 @@ else: raise TypeError("filename must be a str or bytes object, or a file") + if self._mode == _MODE_READ: + raw = _compression.DecompressReader(self._fp, buffer_size, + BZ2Decompressor, OSError) + self._buffer = io.BufferedReader(raw, buffer_size) + self._buffer_size = buffer_size + else: + self._pos = 0 + def close(self): """Flush and close the file. 
@@ -117,8 +124,8 @@ if self._mode == _MODE_CLOSED: return try: - if self._mode in (_MODE_READ, _MODE_READ_EOF): - self._decompressor = None + if self._mode == _MODE_READ: + self._buffer.close() elif self._mode == _MODE_WRITE: self._fp.write(self._compressor.flush()) self._compressor = None @@ -130,8 +137,7 @@ self._fp = None self._closefp = False self._mode = _MODE_CLOSED - self._buffer = b"" - self._buffer_offset = 0 + self._buffer = None @property def closed(self): @@ -145,125 +151,18 @@ def seekable(self): """Return whether the file supports seeking.""" - return self.readable() and self._fp.seekable() + return self.readable() and self._buffer.seekable() def readable(self): """Return whether the file was opened for reading.""" self._check_not_closed() - return self._mode in (_MODE_READ, _MODE_READ_EOF) + return self._mode == _MODE_READ def writable(self): """Return whether the file was opened for writing.""" self._check_not_closed() return self._mode == _MODE_WRITE - # Mode-checking helper functions. - - def _check_not_closed(self): - if self.closed: - raise ValueError("I/O operation on closed file") - - def _check_can_read(self): - if self._mode not in (_MODE_READ, _MODE_READ_EOF): - self._check_not_closed() - raise io.UnsupportedOperation("File not open for reading") - - def _check_can_write(self): - if self._mode != _MODE_WRITE: - self._check_not_closed() - raise io.UnsupportedOperation("File not open for writing") - - def _check_can_seek(self): - if self._mode not in (_MODE_READ, _MODE_READ_EOF): - self._check_not_closed() - raise io.UnsupportedOperation("Seeking is only supported " - "on files open for reading") - if not self._fp.seekable(): - raise io.UnsupportedOperation("The underlying file object " - "does not support seeking") - - # Fill the readahead buffer if it is empty. Returns False on EOF. - def _fill_buffer(self): - if self._mode == _MODE_READ_EOF: - return False - # Depending on the input data, our call to the decompressor may not - # return any data. In this case, try again after reading another block. - while self._buffer_offset == len(self._buffer): - rawblock = (self._decompressor.unused_data or - self._fp.read(_BUFFER_SIZE)) - - if not rawblock: - if self._decompressor.eof: - # End-of-stream marker and end of file. We're good. - self._mode = _MODE_READ_EOF - self._size = self._pos - return False - else: - # Problem - we were expecting more compressed data. - raise EOFError("Compressed file ended before the " - "end-of-stream marker was reached") - - if self._decompressor.eof: - # Continue to next stream. - self._decompressor = BZ2Decompressor() - try: - self._buffer = self._decompressor.decompress(rawblock) - except OSError: - # Trailing data isn't a valid bzip2 stream. We're done here. - self._mode = _MODE_READ_EOF - self._size = self._pos - return False - else: - self._buffer = self._decompressor.decompress(rawblock) - self._buffer_offset = 0 - return True - - # Read data until EOF. - # If return_data is false, consume the data without returning it. - def _read_all(self, return_data=True): - # The loop assumes that _buffer_offset is 0. Ensure that this is true. - self._buffer = self._buffer[self._buffer_offset:] - self._buffer_offset = 0 - - blocks = [] - while self._fill_buffer(): - if return_data: - blocks.append(self._buffer) - self._pos += len(self._buffer) - self._buffer = b"" - if return_data: - return b"".join(blocks) - - # Read a block of up to n bytes. - # If return_data is false, consume the data without returning it. 
- def _read_block(self, n, return_data=True): - # If we have enough data buffered, return immediately. - end = self._buffer_offset + n - if end <= len(self._buffer): - data = self._buffer[self._buffer_offset : end] - self._buffer_offset = end - self._pos += len(data) - return data if return_data else None - - # The loop assumes that _buffer_offset is 0. Ensure that this is true. - self._buffer = self._buffer[self._buffer_offset:] - self._buffer_offset = 0 - - blocks = [] - while n > 0 and self._fill_buffer(): - if n < len(self._buffer): - data = self._buffer[:n] - self._buffer_offset = n - else: - data = self._buffer - self._buffer = b"" - if return_data: - blocks.append(data) - self._pos += len(data) - n -= len(data) - if return_data: - return b"".join(blocks) - def peek(self, n=0): """Return buffered data without advancing the file position. @@ -272,9 +171,10 @@ """ with self._lock: self._check_can_read() - if not self._fill_buffer(): - return b"" - return self._buffer[self._buffer_offset:] + # Relies on the undocumented fact that BufferedReader.peek() + # always returns at least one byte (except at EOF), independent + # of the value of n + return self._buffer.peek(n) def read(self, size=-1): """Read up to size uncompressed bytes from the file. @@ -284,47 +184,29 @@ """ with self._lock: self._check_can_read() - if size == 0: - return b"" - elif size < 0: - return self._read_all() - else: - return self._read_block(size) + return self._buffer.read(size) def read1(self, size=-1): """Read up to size uncompressed bytes, while trying to avoid - making multiple reads from the underlying stream. + making multiple reads from the underlying stream. Reads up to a + buffer's worth of data if size is negative. Returns b'' if the file is at EOF. """ - # Usually, read1() calls _fp.read() at most once. However, sometimes - # this does not give enough data for the decompressor to make progress. - # In this case we make multiple reads, to avoid returning b"". with self._lock: self._check_can_read() - if (size == 0 or - # Only call _fill_buffer() if the buffer is actually empty. - # This gives a significant speedup if *size* is small. - (self._buffer_offset == len(self._buffer) and not self._fill_buffer())): - return b"" - if size > 0: - data = self._buffer[self._buffer_offset : - self._buffer_offset + size] - self._buffer_offset += len(data) - else: - data = self._buffer[self._buffer_offset:] - self._buffer = b"" - self._buffer_offset = 0 - self._pos += len(data) - return data + if size < 0: + size = self._buffer_size + return self._buffer.read1(size) def readinto(self, b): - """Read up to len(b) bytes into b. + """Read bytes into b. Returns the number of bytes read (0 for EOF). """ with self._lock: - return io.BufferedIOBase.readinto(self, b) + self._check_can_read() + return self._buffer.readinto(b) def readline(self, size=-1): """Read a line of uncompressed bytes from the file. @@ -339,15 +221,7 @@ size = size.__index__() with self._lock: self._check_can_read() - # Shortcut for the common case - the whole line is in the buffer. - if size < 0: - end = self._buffer.find(b"\n", self._buffer_offset) + 1 - if end > 0: - line = self._buffer[self._buffer_offset : end] - self._buffer_offset = end - self._pos += len(line) - return line - return io.BufferedIOBase.readline(self, size) + return self._buffer.readline(size) def readlines(self, size=-1): """Read a list of lines of uncompressed bytes from the file. 
@@ -361,7 +235,8 @@ raise TypeError("Integer argument expected") size = size.__index__() with self._lock: - return io.BufferedIOBase.readlines(self, size) + self._check_can_read() + return self._buffer.readlines(size) def write(self, data): """Write a byte string to the file. @@ -388,15 +263,6 @@ with self._lock: return io.BufferedIOBase.writelines(self, seq) - # Rewind the file to the beginning of the data stream. - def _rewind(self): - self._fp.seek(0, 0) - self._mode = _MODE_READ - self._pos = 0 - self._decompressor = BZ2Decompressor() - self._buffer = b"" - self._buffer_offset = 0 - def seek(self, offset, whence=0): """Change the file position. @@ -413,35 +279,17 @@ this operation may be extremely slow. """ with self._lock: - self._check_can_seek() - - # Recalculate offset as an absolute file position. - if whence == 0: - pass - elif whence == 1: - offset = self._pos + offset - elif whence == 2: - # Seeking relative to EOF - we need to know the file's size. - if self._size < 0: - self._read_all(return_data=False) - offset = self._size + offset - else: - raise ValueError("Invalid value for whence: %s" % (whence,)) - - # Make it so that offset is the number of bytes to skip forward. - if offset < self._pos: - self._rewind() - else: - offset -= self._pos - - # Read and discard data until we reach the desired position. - self._read_block(offset, return_data=False) - - return self._pos + if self._mode != _MODE_READ: + self._check_not_closed() + raise io.UnsupportedOperation("Seeking is only supported " + "on files open for reading") + return self._buffer.seek(offset, whence) def tell(self): """Return the current file position.""" with self._lock: + if self._mode == _MODE_READ: + return self._buffer.tell() self._check_not_closed() return self._pos diff -r 87c102d0df39 Lib/gzip.py --- a/Lib/gzip.py Mon Mar 16 12:45:27 2015 -0500 +++ b/Lib/gzip.py Mon Mar 16 23:24:37 2015 +0000 @@ -9,6 +9,7 @@ import zlib import builtins import io +import _compression __all__ = ["GzipFile", "open", "compress", "decompress"] @@ -89,41 +90,24 @@ return self._buffer[read:] + \ self.file.read(size-self._length+read) - def prepend(self, prepend=b'', readprevious=False): + def prepend(self, prepend=b''): if self._read is None: self._buffer = prepend - elif readprevious and len(prepend) <= self._read: + else: # Assume data was read since the last prepend() call self._read -= len(prepend) return - else: - self._buffer = self._buffer[self._read:] + prepend self._length = len(self._buffer) self._read = 0 - def unused(self): - if self._read is None: - return b'' - return self._buffer[self._read:] - - def seek(self, offset, whence=0): - # This is only ever called with offset=whence=0 - if whence == 1 and self._read is not None: - if 0 <= offset + self._read <= self._length: - self._read += offset - return - else: - offset += self._length - self._read + def rewind(self): self._read = None self._buffer = None - return self.file.seek(offset, whence) + return self.file.seek(0) - def __getattr__(self, name): - return getattr(self.file, name) - -class GzipFile(io.BufferedIOBase): +class GzipFile(_compression.BaseStream, io.BufferedIOBase): """The GzipFile class simulates most of the methods of a file object with - the exception of the readinto() and truncate() methods. + the exception of the truncate() method. This class only supports opening files in binary mode. If you need to open a compressed file in text mode, use the gzip.open() function. 
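
(Illustrative aside, not part of the patch: the bz2 hunks above make read mode a
composition of the new raw reader and io.BufferedReader. A minimal sketch of that
layering, assuming this patch is applied; "archive.bz2" is a placeholder file name:

    import io
    import _compression
    from bz2 import BZ2Decompressor

    fp = open("archive.bz2", "rb")
    # Raw reader: decompresses at most about one chunk per read() call and
    # starts a fresh BZ2Decompressor for each concatenated stream; OSError
    # from the decompressor marks invalid trailing data.
    raw = _compression.DecompressReader(fp, io.DEFAULT_BUFFER_SIZE,
                                        BZ2Decompressor, OSError)
    # BufferedReader supplies readline(), peek(), read1() etc. on top, holding
    # at most one buffer's worth of decompressed data between reads.
    reader = io.BufferedReader(raw, io.DEFAULT_BUFFER_SIZE)
    print(reader.read(64))

This mirrors what BZ2File.__init__() now does in read mode.)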
@@ -131,10 +115,10 @@
     """
 
     myfileobj = None
-    max_read_chunk = 10 * 1024 * 1024   # 10Mb
 
     def __init__(self, filename=None, mode=None,
-                 compresslevel=9, fileobj=None, mtime=None):
+                 compresslevel=9, fileobj=None, mtime=None, *,
+                 buffer_size=io.DEFAULT_BUFFER_SIZE):
         """Constructor for the GzipFile class.
 
         At least one of fileobj and filename must be given a
@@ -171,6 +155,10 @@
         return value of time.time() and of the st_mtime member of
         the object returned by os.stat().
 
+        The buffer_size argument is only used in read mode. It must be an
+        integer, at least one, and gives the number of decompressed bytes
+        that may be buffered between reads.
+
         """
 
         if mode and ('t' in mode or 'U' in mode):
@@ -188,18 +176,10 @@
 
         if mode.startswith('r'):
            self.mode = READ
-            # Set flag indicating start of a new member
-            self._new_member = True
-            # Buffer data read from gzip file. extrastart is offset in
-            # stream where buffer starts. extrasize is number of
-            # bytes remaining in buffer from current stream position.
-            self.extrabuf = b""
-            self.extrasize = 0
-            self.extrastart = 0
+            raw = _DecompressReader(fileobj, buffer_size, self)
+            self._buffer = io.BufferedReader(raw, buffer_size)
+            self._buffer_size = buffer_size
             self.name = filename
-            # Starts small, scales exponentially
-            self.min_readsize = 100
-            fileobj = _PaddedFile(fileobj)
 
         elif mode.startswith(('w', 'a', 'x')):
             self.mode = WRITE
@@ -213,7 +193,6 @@
             raise ValueError("Invalid mode: {!r}".format(mode))
 
         self.fileobj = fileobj
-        self.offset = 0
         self.mtime = mtime
 
         if self.mode == WRITE:
@@ -228,25 +207,16 @@
         return self.name
 
     def __repr__(self):
-        fileobj = self.fileobj
-        if isinstance(fileobj, _PaddedFile):
-            fileobj = fileobj.file
-        s = repr(fileobj)
+        s = repr(self.fileobj)
         return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
 
-    def _check_closed(self):
-        """Raises a ValueError if the underlying file object has been closed.
-
-        """
-        if self.closed:
-            raise ValueError('I/O operation on closed file.')
-
     def _init_write(self, filename):
         self.name = filename
         self.crc = zlib.crc32(b"") & 0xffffffff
         self.size = 0
         self.writebuf = []
         self.bufsize = 0
+        self.offset = 0  # Current file offset for seek(), tell(), etc
 
     def _write_gzip_header(self):
         self.fileobj.write(b'\037\213')             # magic header
@@ -274,6 +244,145 @@
         if fname:
             self.fileobj.write(fname + b'\000')
 
+    def write(self,data):
+        self._check_not_closed()
+        if self.mode != WRITE:
+            import errno
+            raise OSError(errno.EBADF, "write() on read-only GzipFile object")
+
+        if self.fileobj is None:
+            raise ValueError("write() on closed GzipFile object")
+
+        # Convert data type if called by io.BufferedWriter.
+        if isinstance(data, memoryview):
+            data = data.tobytes()
+
+        if len(data) > 0:
+            self.size = self.size + len(data)
+            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
+            self.fileobj.write( self.compress.compress(data) )
+            self.offset += len(data)
+
+        return len(data)
+
+    def read(self, size=-1):
+        self._check_not_closed()
+        if self.mode != READ:
+            import errno
+            raise OSError(errno.EBADF, "read() on write-only GzipFile object")
+        return self._buffer.read(size)
+
+    def read1(self, size=-1):
+        """Implements BufferedIOBase.read1()
+
+        Reads up to a buffer's worth of data if size is negative."""
+        self._check_not_closed()
+        if self.mode != READ:
+            import errno
+            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")
+
+        if size < 0:
+            size = self._buffer_size
+        return self._buffer.read1(size)
+
+    def peek(self, n):
+        if self.mode != READ:
+            import errno
+            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
+
+        # Do not return ridiculously small buffers, for one common idiom
+        # is to call peek(1) and expect more bytes in return.
+        if n < 100:
+            n = 100
+        return self._buffer.peek(n)
+
+    @property
+    def closed(self):
+        return self.fileobj is None
+
+    def close(self):
+        if self.fileobj is None:
+            return
+        if self.mode == WRITE:
+            self.fileobj.write(self.compress.flush())
+            write32u(self.fileobj, self.crc)
+            # self.size may exceed 2GB, or even 4GB
+            write32u(self.fileobj, self.size & 0xffffffff)
+            self.fileobj = None
+        elif self.mode == READ:
+            self.fileobj = None
+        if self.myfileobj:
+            self.myfileobj.close()
+            self.myfileobj = None
+
+    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
+        self._check_not_closed()
+        if self.mode == WRITE:
+            # Ensure the compressor's buffer is flushed
+            self.fileobj.write(self.compress.flush(zlib_mode))
+            self.fileobj.flush()
+
+    def fileno(self):
+        """Invoke the underlying file object's fileno() method.
+
+        This will raise AttributeError if the underlying file object
+        doesn't support fileno().
+ """ + return self.fileobj.fileno() + + def rewind(self): + '''Return the uncompressed stream file position indicator to the + beginning of the file''' + if self.mode != READ: + raise OSError("Can't rewind in write mode") + self._buffer.seek(0) + + def readable(self): + return self.mode == READ + + def writable(self): + return self.mode == WRITE + + def seekable(self): + return True + + def seek(self, offset, whence=io.SEEK_SET): + if self.mode == WRITE: + if whence != io.SEEK_SET: + if whence == io.SEEK_CUR: + offset = self.offset + offset + else: + raise ValueError('Seek from end not supported') + if offset < self.offset: + raise OSError('Negative seek in write mode') + count = offset - self.offset + chunk = bytes(1024) + for i in range(count // 1024): + self.write(chunk) + self.write(bytes(count % 1024)) + elif self.mode == READ: + self._check_not_closed() + return self._buffer.seek(offset, whence) + + return self.offset + + def readline(self, size=-1): + return self._buffer.readline(size) + + +class _DecompressReader(io.RawIOBase): + def readable(self): + return True + + def __init__(self, fileobj, chunk_size, parent): + # Set flag indicating start of a new member + self._new_member = True + fileobj = _PaddedFile(fileobj) + self.fileobj = fileobj + self.offset = 0 # Current offset in decompressed stream + self._chunk_size = chunk_size + self._parent = parent # Parent GzipFile object to set mtime on + def _init_read(self): self.crc = zlib.crc32(b"") & 0xffffffff self.size = 0 @@ -296,7 +405,7 @@ if magic != b'\037\213': raise OSError('Not a gzipped file') - method, flag, self.mtime = struct.unpack(" 0: - self.size = self.size + len(data) - self.crc = zlib.crc32(data, self.crc) & 0xffffffff - self.fileobj.write( self.compress.compress(data) ) - self.offset += len(data) - + def readinto(self, b): + with memoryview(b) as view, view.cast("B") as byte_view: + data = self.read(len(byte_view)) + byte_view[:len(data)] = data return len(data) def read(self, size=-1): - self._check_closed() - if self.mode != READ: - import errno - raise OSError(errno.EBADF, "read() on write-only GzipFile object") + if size < 0: + return self.readall() + if not size: # decompress(max_length=0) not supported + return b"" - if self.extrasize <= 0 and self.fileobj is None: - return b'' + # For certain input data, a single + # call to decompress() may not return + # any data. In this case, retry until we get some data or reach EOF. + while True: + if self._new_member: + # If the _new_member flag is set, we have to + # jump to the next member, if there is one. 
+                self._init_read()
+                if not self._read_gzip_header():
+                    return b""
+                self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
+                self._new_member = False
 
-        readsize = 1024
-        if size < 0:        # get the whole thing
-            while self._read(readsize):
-                readsize = min(self.max_read_chunk, readsize * 2)
-            size = self.extrasize
-        else:               # just get some more of it
-            while size > self.extrasize:
-                if not self._read(readsize):
-                    if size > self.extrasize:
-                        size = self.extrasize
-                    break
-                readsize = min(self.max_read_chunk, readsize * 2)
+            # Read a chunk of data from the file
+            buf = self.fileobj.read(self._chunk_size)
 
-        offset = self.offset - self.extrastart
-        chunk = self.extrabuf[offset: offset + size]
-        self.extrasize = self.extrasize - size
+            uncompress = self.decompress.decompress(buf, size)
+            self._add_read_data( uncompress )
 
-        self.offset += size
-        return chunk
+            # If the ends of both the compressed and decompressed streams
+            # have been reached, flush the decompression object
+            # and mark this object as finished.
 
-    def read1(self, size=-1):
-        self._check_closed()
-        if self.mode != READ:
-            import errno
-            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")
+            if buf == b"" and uncompress == b"":
+                # Assuming flush() only returns a limited amount of data
+                uncompress = self.decompress.flush()
+                # Prepend the already read bytes to the fileobj so they can
+                # be seen by _read_eof()
+                self.fileobj.prepend(self.decompress.unused_data)
+                self._read_eof()
+                self._add_read_data( uncompress )
+                break
 
-        if self.extrasize <= 0 and self.fileobj is None:
-            return b''
+            if self.decompress.unconsumed_tail != b"":
+                self.fileobj.prepend(self.decompress.unconsumed_tail)
+            elif self.decompress.unused_data != b"":
+                # Ending case: we've come to the end of a member in the file,
+                # so seek back to the start of the unused data, finish up
+                # this member, and read a new gzip header.
+                # Prepend the already read bytes to the fileobj so they can
+                # be seen by _read_eof() and _read_gzip_header()
+                self.fileobj.prepend(self.decompress.unused_data)
+                # Check the CRC and file size, and set the flag so we read
+                # a new member on the next iteration
+                self._read_eof()
+                self._new_member = True
 
-        # For certain input data, a single call to _read() may not return
-        # any data. In this case, retry until we get some data or reach EOF.
-        while self.extrasize <= 0 and self._read():
-            pass
-        if size < 0 or size > self.extrasize:
-            size = self.extrasize
-
-        offset = self.offset - self.extrastart
-        chunk = self.extrabuf[offset: offset + size]
-        self.extrasize -= size
-        self.offset += size
-        return chunk
-
-    def peek(self, n):
-        if self.mode != READ:
-            import errno
-            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
-
-        # Do not return ridiculously small buffers, for one common idiom
-        # is to call peek(1) and expect more bytes in return.
-        if n < 100:
-            n = 100
-        if self.extrasize == 0:
-            if self.fileobj is None:
-                return b''
-            # Ensure that we don't return b"" if we haven't reached EOF.
- # 1024 is the same buffering heuristic used in read() - while self.extrasize == 0 and self._read(max(n, 1024)): - pass - offset = self.offset - self.extrastart - remaining = self.extrasize - assert remaining == len(self.extrabuf) - offset - return self.extrabuf[offset:offset + n] - - def _unread(self, buf): - self.extrasize = len(buf) + self.extrasize - self.offset -= len(buf) - - def _read(self, size=1024): - if self.fileobj is None: - return False - - if self._new_member: - # If the _new_member flag is set, we have to - # jump to the next member, if there is one. - self._init_read() - if not self._read_gzip_header(): - return False - self.decompress = zlib.decompressobj(-zlib.MAX_WBITS) - self._new_member = False - - # Read a chunk of data from the file - buf = self.fileobj.read(size) - - # If the EOF has been reached, flush the decompression object - # and mark this object as finished. - - if buf == b"": - uncompress = self.decompress.flush() - # Prepend the already read bytes to the fileobj to they can be - # seen by _read_eof() - self.fileobj.prepend(self.decompress.unused_data, True) - self._read_eof() - self._add_read_data( uncompress ) - return False - - uncompress = self.decompress.decompress(buf) - self._add_read_data( uncompress ) - - if self.decompress.unused_data != b"": - # Ending case: we've come to the end of a member in the file, - # so seek back to the start of the unused data, finish up - # this member, and read a new gzip header. - # Prepend the already read bytes to the fileobj to they can be - # seen by _read_eof() and _read_gzip_header() - self.fileobj.prepend(self.decompress.unused_data, True) - # Check the CRC and file size, and set the flag so we read - # a new member on the next call - self._read_eof() - self._new_member = True - return True + if uncompress != b"": + break + self.offset += len(uncompress) + return uncompress def _add_read_data(self, data): self.crc = zlib.crc32(data, self.crc) & 0xffffffff - offset = self.offset - self.extrastart - self.extrabuf = self.extrabuf[offset:] + data - self.extrasize = self.extrasize + len(data) - self.extrastart = self.offset self.size = self.size + len(data) def _read_eof(self): @@ -493,127 +516,28 @@ while c == b"\x00": c = self.fileobj.read(1) if c: - self.fileobj.prepend(c, True) - - @property - def closed(self): - return self.fileobj is None - - def close(self): - if self.fileobj is None: - return - if self.mode == WRITE: - self.fileobj.write(self.compress.flush()) - write32u(self.fileobj, self.crc) - # self.size may exceed 2GB, or even 4GB - write32u(self.fileobj, self.size & 0xffffffff) - self.fileobj = None - elif self.mode == READ: - self.fileobj = None - if self.myfileobj: - self.myfileobj.close() - self.myfileobj = None - - def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH): - self._check_closed() - if self.mode == WRITE: - # Ensure the compressor's buffer is flushed - self.fileobj.write(self.compress.flush(zlib_mode)) - self.fileobj.flush() - - def fileno(self): - """Invoke the underlying file object's fileno() method. - - This will raise AttributeError if the underlying file object - doesn't support fileno(). 
- """ - return self.fileobj.fileno() - - def rewind(self): - '''Return the uncompressed stream file position indicator to the - beginning of the file''' - if self.mode != READ: - raise OSError("Can't rewind in write mode") - self.fileobj.seek(0) - self._new_member = True - self.extrabuf = b"" - self.extrasize = 0 - self.extrastart = 0 - self.offset = 0 - - def readable(self): - return self.mode == READ - - def writable(self): - return self.mode == WRITE + self.fileobj.prepend(c) def seekable(self): return True - def seek(self, offset, whence=0): - if whence: - if whence == 1: + def seek(self, offset, whence=io.SEEK_SET): + if whence != io.SEEK_SET: + if whence == io.SEEK_CUR: offset = self.offset + offset else: raise ValueError('Seek from end not supported') - if self.mode == WRITE: - if offset < self.offset: - raise OSError('Negative seek in write mode') - count = offset - self.offset - chunk = bytes(1024) - for i in range(count // 1024): - self.write(chunk) - self.write(bytes(count % 1024)) - elif self.mode == READ: - if offset < self.offset: - # for negative seek, rewind and do positive seek - self.rewind() - count = offset - self.offset - for i in range(count // 1024): - self.read(1024) - self.read(count % 1024) - + if offset < self.offset: + # for negative seek, rewind and do positive seek + self.fileobj.rewind() + self._new_member = True + self.offset = 0 + while self.offset < offset: + count = min(offset - self.offset, self._chunk_size) + if not self.read(count): + break return self.offset - def readline(self, size=-1): - if size < 0: - # Shortcut common case - newline found in buffer. - offset = self.offset - self.extrastart - i = self.extrabuf.find(b'\n', offset) + 1 - if i > 0: - self.extrasize -= i - offset - self.offset += i - offset - return self.extrabuf[offset: i] - - size = sys.maxsize - readsize = self.min_readsize - else: - readsize = size - bufs = [] - while size != 0: - c = self.read(readsize) - i = c.find(b'\n') - - # We set i=size to break out of the loop under two - # conditions: 1) there's no newline, and the chunk is - # larger than size, or 2) there is a newline, but the - # resulting line would be longer than 'size'. - if (size <= i) or (i == -1 and len(c) > size): - i = size - 1 - - if i >= 0 or c == b'': - bufs.append(c[:i + 1]) # Add portion of last chunk - self._unread(c[i + 1:]) # Push back rest of chunk - break - - # Append chunk to list, decrease 'size', - bufs.append(c) - size = size - len(c) - readsize = min(size, readsize * 2) - if readsize > self.min_readsize: - self.min_readsize = min(readsize, self.min_readsize * 2, 512) - return b''.join(bufs) # Return resulting line - def compress(data, compresslevel=9): """Compress data in one shot and return the compressed string. diff -r 87c102d0df39 Lib/lzma.py --- a/Lib/lzma.py Mon Mar 16 12:45:27 2015 -0500 +++ b/Lib/lzma.py Mon Mar 16 23:24:37 2015 +0000 @@ -25,17 +25,16 @@ import io from _lzma import * from _lzma import _encode_filter_properties, _decode_filter_properties +import _compression _MODE_CLOSED = 0 _MODE_READ = 1 -_MODE_READ_EOF = 2 +# Value 2 no longer used _MODE_WRITE = 3 -_BUFFER_SIZE = 8192 - -class LZMAFile(io.BufferedIOBase): +class LZMAFile(_compression.BaseStream, io.BufferedIOBase): """A file object providing transparent LZMA (de)compression. 
@@ -47,7 +46,8 @@ """ def __init__(self, filename=None, mode="r", *, - format=None, check=-1, preset=None, filters=None): + format=None, check=-1, preset=None, filters=None, + buffer_size=io.DEFAULT_BUFFER_SIZE): """Open an LZMA-compressed file in binary mode. filename can be either an actual file name (given as a str or @@ -88,12 +88,14 @@ filters (if provided) should be a sequence of dicts. Each dict should have an entry for "id" indicating ID of the filter, plus additional entries for options to the filter. + + The buffer_size argument is only used in read mode. It must be an + integer, at least one, and gives the number of decompressed bytes + that may be buffered between reads. """ self._fp = None self._closefp = False self._mode = _MODE_CLOSED - self._pos = 0 - self._size = -1 if mode in ("r", "rb"): if check != -1: @@ -105,19 +107,13 @@ if format is None: format = FORMAT_AUTO mode_code = _MODE_READ - # Save the args to pass to the LZMADecompressor initializer. - # If the file contains multiple compressed streams, each - # stream will need a separate decompressor object. - self._init_args = {"format":format, "filters":filters} - self._decompressor = LZMADecompressor(**self._init_args) - self._buffer = b"" - self._buffer_offset = 0 elif mode in ("w", "wb", "a", "ab", "x", "xb"): if format is None: format = FORMAT_XZ mode_code = _MODE_WRITE self._compressor = LZMACompressor(format=format, check=check, preset=preset, filters=filters) + self._pos = 0 else: raise ValueError("Invalid mode: {!r}".format(mode)) @@ -133,6 +129,12 @@ else: raise TypeError("filename must be a str or bytes object, or a file") + if self._mode == _MODE_READ: + raw = _compression.DecompressReader(self._fp, buffer_size, + LZMADecompressor, LZMAError, format=format, filters=filters) + self._buffer = io.BufferedReader(raw, buffer_size) + self._buffer_size = buffer_size + def close(self): """Flush and close the file. @@ -142,9 +144,9 @@ if self._mode == _MODE_CLOSED: return try: - if self._mode in (_MODE_READ, _MODE_READ_EOF): - self._decompressor = None - self._buffer = b"" + if self._mode == _MODE_READ: + self._buffer.close() + self._buffer = None elif self._mode == _MODE_WRITE: self._fp.write(self._compressor.flush()) self._compressor = None @@ -169,123 +171,18 @@ def seekable(self): """Return whether the file supports seeking.""" - return self.readable() and self._fp.seekable() + return self.readable() and self._buffer.seekable() def readable(self): """Return whether the file was opened for reading.""" self._check_not_closed() - return self._mode in (_MODE_READ, _MODE_READ_EOF) + return self._mode == _MODE_READ def writable(self): """Return whether the file was opened for writing.""" self._check_not_closed() return self._mode == _MODE_WRITE - # Mode-checking helper functions. 
- - def _check_not_closed(self): - if self.closed: - raise ValueError("I/O operation on closed file") - - def _check_can_read(self): - if self._mode not in (_MODE_READ, _MODE_READ_EOF): - self._check_not_closed() - raise io.UnsupportedOperation("File not open for reading") - - def _check_can_write(self): - if self._mode != _MODE_WRITE: - self._check_not_closed() - raise io.UnsupportedOperation("File not open for writing") - - def _check_can_seek(self): - if self._mode not in (_MODE_READ, _MODE_READ_EOF): - self._check_not_closed() - raise io.UnsupportedOperation("Seeking is only supported " - "on files open for reading") - if not self._fp.seekable(): - raise io.UnsupportedOperation("The underlying file object " - "does not support seeking") - - # Fill the readahead buffer if it is empty. Returns False on EOF. - def _fill_buffer(self): - if self._mode == _MODE_READ_EOF: - return False - # Depending on the input data, our call to the decompressor may not - # return any data. In this case, try again after reading another block. - while self._buffer_offset == len(self._buffer): - rawblock = (self._decompressor.unused_data or - self._fp.read(_BUFFER_SIZE)) - - if not rawblock: - if self._decompressor.eof: - self._mode = _MODE_READ_EOF - self._size = self._pos - return False - else: - raise EOFError("Compressed file ended before the " - "end-of-stream marker was reached") - - if self._decompressor.eof: - # Continue to next stream. - self._decompressor = LZMADecompressor(**self._init_args) - try: - self._buffer = self._decompressor.decompress(rawblock) - except LZMAError: - # Trailing data isn't a valid compressed stream; ignore it. - self._mode = _MODE_READ_EOF - self._size = self._pos - return False - else: - self._buffer = self._decompressor.decompress(rawblock) - self._buffer_offset = 0 - return True - - # Read data until EOF. - # If return_data is false, consume the data without returning it. - def _read_all(self, return_data=True): - # The loop assumes that _buffer_offset is 0. Ensure that this is true. - self._buffer = self._buffer[self._buffer_offset:] - self._buffer_offset = 0 - - blocks = [] - while self._fill_buffer(): - if return_data: - blocks.append(self._buffer) - self._pos += len(self._buffer) - self._buffer = b"" - if return_data: - return b"".join(blocks) - - # Read a block of up to n bytes. - # If return_data is false, consume the data without returning it. - def _read_block(self, n, return_data=True): - # If we have enough data buffered, return immediately. - end = self._buffer_offset + n - if end <= len(self._buffer): - data = self._buffer[self._buffer_offset : end] - self._buffer_offset = end - self._pos += len(data) - return data if return_data else None - - # The loop assumes that _buffer_offset is 0. Ensure that this is true. - self._buffer = self._buffer[self._buffer_offset:] - self._buffer_offset = 0 - - blocks = [] - while n > 0 and self._fill_buffer(): - if n < len(self._buffer): - data = self._buffer[:n] - self._buffer_offset = n - else: - data = self._buffer - self._buffer = b"" - if return_data: - blocks.append(data) - self._pos += len(data) - n -= len(data) - if return_data: - return b"".join(blocks) - def peek(self, size=-1): """Return buffered data without advancing the file position. @@ -293,9 +190,9 @@ The exact number of bytes returned is unspecified. 
""" self._check_can_read() - if not self._fill_buffer(): - return b"" - return self._buffer[self._buffer_offset:] + # Relies on the undocumented fact that BufferedReader.peek() always + # returns at least one byte (except at EOF) + return self._buffer.peek(size) def read(self, size=-1): """Read up to size uncompressed bytes from the file. @@ -304,38 +201,19 @@ Returns b"" if the file is already at EOF. """ self._check_can_read() - if size == 0: - return b"" - elif size < 0: - return self._read_all() - else: - return self._read_block(size) + return self._buffer.read(size) def read1(self, size=-1): """Read up to size uncompressed bytes, while trying to avoid - making multiple reads from the underlying stream. + making multiple reads from the underlying stream. Reads up to a + buffer's worth of data if size is negative. Returns b"" if the file is at EOF. """ - # Usually, read1() calls _fp.read() at most once. However, sometimes - # this does not give enough data for the decompressor to make progress. - # In this case we make multiple reads, to avoid returning b"". self._check_can_read() - if (size == 0 or - # Only call _fill_buffer() if the buffer is actually empty. - # This gives a significant speedup if *size* is small. - (self._buffer_offset == len(self._buffer) and not self._fill_buffer())): - return b"" - if size > 0: - data = self._buffer[self._buffer_offset : - self._buffer_offset + size] - self._buffer_offset += len(data) - else: - data = self._buffer[self._buffer_offset:] - self._buffer = b"" - self._buffer_offset = 0 - self._pos += len(data) - return data + if size < 0: + size = self._buffer_size + return self._buffer.read1(size) def readline(self, size=-1): """Read a line of uncompressed bytes from the file. @@ -345,15 +223,7 @@ case the line may be incomplete). Returns b'' if already at EOF. """ self._check_can_read() - # Shortcut for the common case - the whole line is in the buffer. - if size < 0: - end = self._buffer.find(b"\n", self._buffer_offset) + 1 - if end > 0: - line = self._buffer[self._buffer_offset : end] - self._buffer_offset = end - self._pos += len(line) - return line - return io.BufferedIOBase.readline(self, size) + return self._buffer.readline(size) def write(self, data): """Write a bytes object to the file. @@ -368,15 +238,6 @@ self._pos += len(data) return len(data) - # Rewind the file to the beginning of the data stream. - def _rewind(self): - self._fp.seek(0, 0) - self._mode = _MODE_READ - self._pos = 0 - self._decompressor = LZMADecompressor(**self._init_args) - self._buffer = b"" - self._buffer_offset = 0 - def seek(self, offset, whence=0): """Change the file position. @@ -389,37 +250,19 @@ Returns the new file position. - Note that seeking is emulated, sp depending on the parameters, + Note that seeking is emulated, so depending on the parameters, this operation may be extremely slow. """ - self._check_can_seek() - - # Recalculate offset as an absolute file position. - if whence == 0: - pass - elif whence == 1: - offset = self._pos + offset - elif whence == 2: - # Seeking relative to EOF - we need to know the file's size. - if self._size < 0: - self._read_all(return_data=False) - offset = self._size + offset - else: - raise ValueError("Invalid value for whence: {}".format(whence)) - - # Make it so that offset is the number of bytes to skip forward. - if offset < self._pos: - self._rewind() - else: - offset -= self._pos - - # Read and discard data until we reach the desired position. 
- self._read_block(offset, return_data=False) - - return self._pos + if self._mode != _MODE_READ: + self._check_not_closed() + raise io.UnsupportedOperation("Seeking is only supported " + "on files open for reading") + return self._buffer.seek(offset, whence) def tell(self): """Return the current file position.""" + if self._mode == _MODE_READ: + return self._buffer.tell() self._check_not_closed() return self._pos diff -r 87c102d0df39 Lib/test/test_bz2.py --- a/Lib/test/test_bz2.py Mon Mar 16 12:45:27 2015 -0500 +++ b/Lib/test/test_bz2.py Mon Mar 16 23:24:37 2015 +0000 @@ -110,7 +110,7 @@ def testRead(self): self.createTempFile() with BZ2File(self.filename) as bz2f: - self.assertRaises(TypeError, bz2f.read, None) + self.assertRaises(TypeError, bz2f.read, float()) self.assertEqual(bz2f.read(), self.TEXT) def testReadBadFile(self): @@ -121,21 +121,16 @@ def testReadMultiStream(self): self.createTempFile(streams=5) with BZ2File(self.filename) as bz2f: - self.assertRaises(TypeError, bz2f.read, None) + self.assertRaises(TypeError, bz2f.read, float()) self.assertEqual(bz2f.read(), self.TEXT * 5) def testReadMonkeyMultiStream(self): # Test BZ2File.read() on a multi-stream archive where a stream # boundary coincides with the end of the raw read buffer. - buffer_size = bz2._BUFFER_SIZE - bz2._BUFFER_SIZE = len(self.DATA) - try: - self.createTempFile(streams=5) - with BZ2File(self.filename) as bz2f: - self.assertRaises(TypeError, bz2f.read, None) - self.assertEqual(bz2f.read(), self.TEXT * 5) - finally: - bz2._BUFFER_SIZE = buffer_size + self.createTempFile(streams=5) + with BZ2File(self.filename, buffer_size=len(self.DATA)) as bz2f: + self.assertRaises(TypeError, bz2f.read, float()) + self.assertEqual(bz2f.read(), self.TEXT * 5) def testReadTrailingJunk(self): self.createTempFile(suffix=self.BAD_DATA) @@ -150,7 +145,7 @@ def testRead0(self): self.createTempFile() with BZ2File(self.filename) as bz2f: - self.assertRaises(TypeError, bz2f.read, None) + self.assertRaises(TypeError, bz2f.read, float()) self.assertEqual(bz2f.read(0), b"") def testReadChunk10(self): @@ -561,13 +556,33 @@ with BZ2File(str_filename, "rb") as f: self.assertEqual(f.read(), self.DATA) + def testDecompressLimited(self): + """Decompressed data buffering should be limited by buffer_size""" + bomb = bz2.compress(bytes(int(2e6)), compresslevel=9) + BUFFER_SIZE = 3000 + self.assertLess(len(bomb), BUFFER_SIZE) + + decomp = BZ2File(BytesIO(bomb), buffer_size=BUFFER_SIZE) + self.assertEqual(bytes(1), decomp.read(1)) + self.assertLessEqual(decomp._buffer.raw.tell(), 1 + BUFFER_SIZE, + "Excessive amount of data was decompressed") + + def testBadBufferSize(self): + self.createTempFile() + with self.assertRaises(ValueError): + BZ2File(self.filename, "r", buffer_size=0) + with self.assertRaises(ValueError): + BZ2File(self.filename, "r", buffer_size=-1) + with BZ2File(self.filename, "r", buffer_size=+1): # No error + pass + # Tests for a BZ2File wrapping another file object: def testReadBytesIO(self): with BytesIO(self.DATA) as bio: with BZ2File(bio) as bz2f: - self.assertRaises(TypeError, bz2f.read, None) + self.assertRaises(TypeError, bz2f.read, float()) self.assertEqual(bz2f.read(), self.TEXT) self.assertFalse(bio.closed) diff -r 87c102d0df39 Lib/test/test_gzip.py --- a/Lib/test/test_gzip.py Mon Mar 16 12:45:27 2015 -0500 +++ b/Lib/test/test_gzip.py Mon Mar 16 23:24:37 2015 +0000 @@ -379,6 +379,27 @@ with gzip.GzipFile(str_filename, "rb") as f: self.assertEqual(f.read(), data1 * 50) + def test_decompress_limited(self): + """Decompressed 
data buffering should be limited by buffer_size""" + bomb = gzip.compress(bytes(int(2e6)), compresslevel=9) + BUFFER_SIZE = 3000 + self.assertLess(len(bomb), BUFFER_SIZE) + + bomb = io.BytesIO(bomb) + decomp = gzip.GzipFile(fileobj=bomb, buffer_size=BUFFER_SIZE) + self.assertEqual(bytes(1), decomp.read(1)) + self.assertLessEqual(decomp._buffer.raw.tell(), 1 + BUFFER_SIZE, + "Excessive amount of data was decompressed") + + def test_bad_buffer_size(self): + self.test_write() + with self.assertRaises(ValueError): + gzip.GzipFile(self.filename, "r", buffer_size=0) + with self.assertRaises(ValueError): + gzip.GzipFile(self.filename, "r", buffer_size=-1) + with gzip.GzipFile(self.filename, "r", buffer_size=+1): # No error + pass + # Testing compress/decompress shortcut functions def test_compress(self): @@ -426,7 +447,7 @@ with gzip.open(self.filename, "wb") as f: f.write(data1) with gzip.open(self.filename, "rb") as f: - f.fileobj.prepend() + f._buffer.raw.fileobj.prepend() class TestOpen(BaseTest): def test_binary_modes(self): diff -r 87c102d0df39 Lib/test/test_lzma.py --- a/Lib/test/test_lzma.py Mon Mar 16 12:45:27 2015 -0500 +++ b/Lib/test/test_lzma.py Mon Mar 16 23:24:37 2015 +0000 @@ -597,6 +597,14 @@ LZMAFile(BytesIO(), "w", format=lzma.FORMAT_RAW, preset=6, filters=FILTERS_RAW_1) + def test_init_bad_buffer_size(self): + with self.assertRaises(ValueError): + LZMAFile(BytesIO(), "r", buffer_size=0) + with self.assertRaises(ValueError): + LZMAFile(BytesIO(), "r", buffer_size=-1) + with LZMAFile(BytesIO(), "r", buffer_size=+1): # No error + pass + def test_close(self): with BytesIO(COMPRESSED_XZ) as src: f = LZMAFile(src) @@ -772,13 +780,10 @@ def test_read_multistream_buffer_size_aligned(self): # Test the case where a stream boundary coincides with the end # of the raw read buffer. 
- saved_buffer_size = lzma._BUFFER_SIZE - lzma._BUFFER_SIZE = len(COMPRESSED_XZ) - try: - with LZMAFile(BytesIO(COMPRESSED_XZ * 5)) as f: - self.assertEqual(f.read(), INPUT * 5) - finally: - lzma._BUFFER_SIZE = saved_buffer_size + input = BytesIO(COMPRESSED_XZ * 5) + buffer_size = len(COMPRESSED_XZ) + with LZMAFile(input, buffer_size=buffer_size) as f: + self.assertEqual(f.read(), INPUT * 5) def test_read_trailing_junk(self): with LZMAFile(BytesIO(COMPRESSED_XZ + COMPRESSED_BOGUS)) as f: @@ -829,7 +834,7 @@ with LZMAFile(BytesIO(), "w") as f: self.assertRaises(ValueError, f.read) with LZMAFile(BytesIO(COMPRESSED_XZ)) as f: - self.assertRaises(TypeError, f.read, None) + self.assertRaises(TypeError, f.read, float()) def test_read_bad_data(self): with LZMAFile(BytesIO(COMPRESSED_BOGUS)) as f: @@ -925,6 +930,17 @@ with LZMAFile(BytesIO(COMPRESSED_XZ)) as f: self.assertListEqual(f.readlines(), lines) + def test_decompress_limited(self): + """Decompressed data buffering should be limited by buffer_size""" + bomb = lzma.compress(bytes(int(2e6)), preset=6) + BUFFER_SIZE = 3000 + self.assertLess(len(bomb), BUFFER_SIZE) + + decomp = LZMAFile(BytesIO(bomb), buffer_size=BUFFER_SIZE) + self.assertEqual(bytes(1), decomp.read(1)) + self.assertLessEqual(decomp._buffer.raw.tell(), 1 + BUFFER_SIZE, + "Excessive amount of data was decompressed") + def test_write(self): with BytesIO() as dst: with LZMAFile(dst, "w") as f: @@ -1090,7 +1106,8 @@ self.assertRaises(ValueError, f.seek, 0) with LZMAFile(BytesIO(COMPRESSED_XZ)) as f: self.assertRaises(ValueError, f.seek, 0, 3) - self.assertRaises(ValueError, f.seek, 9, ()) + # io.BufferedReader raises TypeError instead of ValueError + self.assertRaises((TypeError, ValueError), f.seek, 9, ()) self.assertRaises(TypeError, f.seek, None) self.assertRaises(TypeError, f.seek, b"derp")
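
(Usage illustration, a sketch assuming this patch is applied; the file names
below are placeholders:

    import bz2, gzip, lzma

    # Cap the decompressed data buffered between reads at about 4 KiB.
    with bz2.BZ2File("data.bz2", "rb", buffer_size=4096) as f:
        head = f.read(16)

    with gzip.GzipFile("data.gz", "rb", buffer_size=4096) as f:
        print(f.peek(1)[:4])    # peek() may return more than one byte

    with lzma.LZMAFile("data.xz", "r", buffer_size=4096) as f:
        f.seek(1000)            # emulated: reads and discards 1000 bytes
        print(f.read(8))

In each case the BufferedReader wrapper, not the whole decompressed stream,
bounds how much uncompressed data is held in memory between reads.)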