# HG changeset patch # Parent 61a045ac00066df6bb89b3f56ef8954e95596926 Clarify StreamReader API and implement stateful zlib-codec reader Also fix simple bug uncovered in bz2-codec. diff -r 61a045ac0006 Doc/library/codecs.rst --- a/Doc/library/codecs.rst Thu Jan 15 00:05:18 2015 +0100 +++ b/Doc/library/codecs.rst Thu Jan 15 22:35:00 2015 +0000 @@ -288,7 +288,7 @@ +-------------------------+-----------------------------------------------+ | Value | Meaning | +=========================+===============================================+ -| ``'strict'`` | Raise :exc:`UnicodeError` (or a subclass); | +| ``'strict'`` | Raise :exc:`ValueError` (or a subclass); | | | this is the default. Implemented in | | | :func:`strict_errors`. | +-------------------------+-----------------------------------------------+ @@ -700,7 +700,8 @@ Python codec registry. The *stream* argument must be a file-like object open for reading - text or binary data, as appropriate for the specific codec. + text or binary data, as appropriate for the specific codec. This stream is + assumed to have ended as soon as a read from it returns no data. The :class:`StreamReader` may implement different error handling schemes by providing the *errors* keyword argument. See :ref:`error-handlers` for @@ -714,27 +715,31 @@ :func:`register_error`. - .. method:: read([size[, chars, [firstline]]]) + .. method:: read([size[, chars[, firstline]]]) Decodes data from the stream and returns the resulting object. + The *size* argument indicates an approximate + chunk size of encoded bytes or code points to read + for decoding. The decoder can modify this setting as + appropriate. The default value -1 indicates the codec uses + an arbitrary chunk size. This parameter is intended to + prevent having to decode huge files in one step. + The *chars* argument indicates the number of decoded code points or bytes to return. 
The :func:`read` method will - never return more data than requested, but it might return less, - if there is not enough available. - - The *size* argument indicates the approximate maximum - number of encoded bytes or code points to read - for decoding. The decoder can modify this setting as - appropriate. The default value -1 indicates to read and decode as much as - possible. This parameter is intended to - prevent having to decode huge files in one step. + only return less data than requested if the end of the stream is + reached. If *chars* is not specified, the behaviour depends on + the *size* parameter. If a chunk size is specified, an arbitrary + amount of data is returned (at least one code point or byte, + unless the end of the stream is reached). If a chunk size is not + specified, all remaining data is read and decoded. The *firstline* flag indicates that it would be sufficient to only return the first line, if there are decoding errors on later lines. - The method should use a greedy read strategy meaning that it should read + The method should use a greedy read strategy, meaning that it should read as much data as is allowed within the definition of the encoding and the given size, e.g. if optional encoding endings or state markers are available on the stream, these should be read too. @@ -742,12 +747,14 @@ .. method:: readline([size[, keepends]]) - Read one line from the input stream and return the decoded data. + Read one line from the input stream and return the decoded data. The + :term:`universal newlines` approach is used; for codecs that decode to + bytes, ASCII line-endings are assumed. *size*, if given, is passed as size argument to the stream's :meth:`read` method. - If *keepends* is false line-endings will be stripped from the lines + If *keepends* is false, line-endings will be stripped from the lines returned. @@ -756,10 +763,10 @@ Read all lines available on the input stream and return them as a list of lines. 
- Line-endings are implemented using the codec's decoder method and are + Line-endings are determined as for :meth:`readline`, and are included in the list entries if *keepends* is true. - *sizehint*, if given, is passed as the *size* argument to the stream's + *sizehint*, if given, is passed as the *size* argument to the :meth:`read` method. @@ -1310,7 +1317,7 @@ | | | decoding | | +----------------------+------------------+------------------------------+------------------------------+ | bz2_codec | bz2 | Compress the operand | :meth:`bz2.compress` / | -| | | using bz2 | :meth:`bz2.decompress` | +| [#firstline]_ | | using bz2 | :meth:`bz2.decompress` | +----------------------+------------------+------------------------------+------------------------------+ | hex_codec | hex | Convert operand to | :meth:`base64.b16encode` / | | | | hexadecimal | :meth:`base64.b16decode` | @@ -1325,13 +1332,17 @@ | | | uuencode | :meth:`uu.decode` | +----------------------+------------------+------------------------------+------------------------------+ | zlib_codec | zip, zlib | Compress the operand | :meth:`zlib.compress` / | -| | | using gzip | :meth:`zlib.decompress` | +| [#firstline]_ | | using gzip | :meth:`zlib.decompress` | +----------------------+------------------+------------------------------+------------------------------+ .. [#b64] In addition to :term:`bytes-like objects `, ``'base64_codec'`` also accepts ASCII-only instances of :class:`str` for decoding +.. [#firstline] The :meth:`StreamReader.read` methods of ``'bz2_codec'`` and + ``'zlib_codec'`` do not support reading a single line to avoid + decoding errors + .. versionadded:: 3.2 Restoration of the binary transforms. diff -r 61a045ac0006 Lib/codecs.py --- a/Lib/codecs.py Thu Jan 15 00:05:18 2015 +0100 +++ b/Lib/codecs.py Thu Jan 15 22:35:00 2015 +0000 @@ -449,25 +449,25 @@ """ Decodes data from the stream self.stream and returns the resulting object. 
- chars indicates the number of decoded code points or bytes to - return. read() will never return more data than requested, - but it might return less, if there is not enough available. - - size indicates the approximate maximum number of decoded + size indicates the approximate chunk size of encoded bytes or code points to read for decoding. The decoder can modify this setting as appropriate. The default value -1 indicates to read and decode as much as possible. size is intended to prevent having to decode huge files in one step. + chars indicates the number of decoded code points or bytes to + return. read() will never return more data than requested, + but it might return less, if the end of the stream is reached. + If firstline is true, and a UnicodeDecodeError happens - after the first line terminator in the input only the first line - will be returned, the rest of the input will be kept until the + after the first line terminator in the input, only the first line + will be returned; the rest of the input will be kept until the next call to read(). The method should use a greedy read strategy, meaning that it should read as much data as is allowed within the - definition of the encoding and the given size, e.g. if + definition of the encoding and the given size, e.g. if optional encoding endings or state markers are available on the stream, these should be read too. """ @@ -599,11 +599,9 @@ def readlines(self, sizehint=None, keepends=True): - """ Read all lines available on the input stream - and return them as a list. + """ Read all lines from the input stream and return them as a list. - Line breaks are implemented using the codec's decoder - method and are included in the list entries. + The universal newlines approach is used to determine line breaks. sizehint, if given, is ignored since there is no efficient way to finding the true end-of-line. 
diff -r 61a045ac0006 Lib/encodings/bz2_codec.py
--- a/Lib/encodings/bz2_codec.py	Thu Jan 15 00:05:18 2015 +0100
+++ b/Lib/encodings/bz2_codec.py	Thu Jan 15 22:35:00 2015 +0000
@@ -52,7 +52,7 @@
         try:
             return self.decompressobj.decompress(input)
         except EOFError:
-            return ''
+            return b''
 
     def reset(self):
         self.decompressobj = bz2.BZ2Decompressor()
diff -r 61a045ac0006 Lib/encodings/zlib_codec.py
--- a/Lib/encodings/zlib_codec.py	Thu Jan 15 00:05:18 2015 +0100
+++ b/Lib/encodings/zlib_codec.py	Thu Jan 15 22:35:00 2015 +0000
@@ -7,6 +7,7 @@
 
 import codecs
 import zlib # this codec needs the optional zlib module !
+from io import DEFAULT_BUFFER_SIZE
 
 ### Codec APIs
 
@@ -59,9 +60,72 @@
 class StreamWriter(Codec, codecs.StreamWriter):
     charbuffertype = bytes
 
-class StreamReader(Codec, codecs.StreamReader):
+class StreamReader(codecs.StreamReader):
     charbuffertype = bytes
 
+    def __init__(self, *pos, **kw):
+        super().__init__(*pos, **kw)
+        self.reset()
+
+    def reset(self):
+        """Implements codecs.StreamReader.reset()."""
+        # Also clear the buffers the inherited readline() relies on
+        super().reset()
+        self._decompressor = zlib.decompressobj()
+
+    def read(self, size=-1, chars=None, firstline=None):
+        """Implements codecs.StreamReader.read().
+
+        size is the chunk size for each read from the underlying stream;
+        a negative value selects DEFAULT_BUFFER_SIZE.
+
+        chars is the number of decompressed bytes to return. Never more
+        are returned, and fewer only at the end of the zlib stream. With
+        chars None and a negative size, everything up to the end of the
+        zlib stream is returned; with chars None and any other size,
+        reading stops as soon as some data has been decompressed.
+
+        firstline is ignored: reading a single line to avoid decoding
+        errors is not supported.
+
+        Raises ValueError if the underlying stream runs out of data
+        before the end of the zlib stream.
+        """
+        # The inherited readline() pushes surplus data back into
+        # self.linebuffer or self.charbuffer; hand that out first
+        if self.linebuffer:
+            self.charbuffer = b"".join(self.linebuffer)
+            self.linebuffer = None
+        buffer = bytearray(self.charbuffer)
+        self.charbuffer = b""
+        read_chunk = size
+        if read_chunk < 0:
+            read_chunk = DEFAULT_BUFFER_SIZE
+        while not self._decompressor.eof and (
+                chars is None or len(buffer) < chars):
+            data = self._decompressor.unconsumed_tail
+            if not data:
+                data = self.stream.read(read_chunk)
+                if not data:
+                    raise ValueError("Incomplete zlib stream")
+            if chars is None and size >= 0:  # Decode arbitrary length
+                buffer += self._decompressor.decompress(data)
+                if buffer:
+                    break
+            else:  # Either decode everything, or exact length specified
+                if chars is None:
+                    limit = DEFAULT_BUFFER_SIZE
+                else:
+                    # Ask only for what is still owed; a fixed limit of
+                    # chars could overshoot once buffer is partly filled
+                    limit = chars - len(buffer)
+                buffer += self._decompressor.decompress(data, limit)
+        if chars is not None and len(buffer) > chars:
+            # Pushed-back data exceeded the request; keep the surplus
+            self.charbuffer = bytes(buffer[chars:])
+            del buffer[chars:]
+        return bytes(buffer)
+
 ### encodings module API
 
 def getregentry():
diff -r 61a045ac0006 Lib/test/test_codecs.py
--- a/Lib/test/test_codecs.py	Thu Jan 15 00:05:18 2015 +0100
+++ b/Lib/test/test_codecs.py	Thu Jan 15 22:35:00 2015 +0000
@@ -1722,6 +1722,33 @@
         f = self.reader(self.stream)
         self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
 
+    def test_read_0(self):
+        broken_multibyte = {
+            "big5", "big5hkscs", "cp932", "cp949", "cp950",
+            "euc_jp", "euc_jis_2004", "euc_jisx0213", "euc_kr",
+            "gb2312", "gbk", "gb18030", "hz",
+            "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
+            "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
+            "johab", "shift_jis", "shift_jis_2004", "shift_jisx0213",
+        }
+        for encoding in all_unicode_encodings:
+            if encoding in broken_multibyte:  # read() rejects 2nd parameter
+                continue
+            with self.subTest(encoding=encoding):
+                encoded = codecs.encode("characters", encoding)
+                reader = codecs.getreader(encoding)(io.BytesIO(encoded))
+                self.assertEqual("", reader.read(-1, 0))
+                self.assertEqual("", reader.read(100, 0))
+                self.assertEqual("", reader.read(0, 0))
+        for encoding in bytes_transform_encodings:
+            with self.subTest(encoding=encoding):
+                encoded = codecs.encode(b"bytes", encoding)
+                reader =
codecs.getreader(encoding)(io.BytesIO(encoded)) + self.assertEqual(b"", reader.read(-1, 0)) + self.assertEqual(b"", reader.read(100, 0)) + self.assertEqual(b"", reader.read(0, 0)) + # TODO: rot-13 StreamReader seems confused between bytes and text + class EncodedFileTest(unittest.TestCase): def test_basic(self): @@ -2521,12 +2548,52 @@ self.assertEqual(i, binput) def test_read(self): + data = b"\x80data" + broken_stateful = { # See Issue 20132 + "hex_codec", "base64_codec", "quopri_codec", "uu_codec", + "bz2_codec", + } for encoding in bytes_transform_encodings: with self.subTest(encoding=encoding): - sin = codecs.encode(b"\x80", encoding) + sin = codecs.encode(data, encoding) reader = codecs.getreader(encoding)(io.BytesIO(sin)) + sout = reader.read() - self.assertEqual(sout, b"\x80") + self.assertEqual(sout, data) + reader.reset() + reader.seek(0) + sout = reader.read(-1) + self.assertEqual(sout, data) + + if encoding not in broken_stateful: + for size in (1, 100): + with self.subTest(size=size): + reader.reset() + reader.seek(0) + for byte in data: + sout = reader.read(size, 1) + self.assertEqual(bytes((byte,)), sout) + self.assertEqual(b"", reader.read(size, 1)) + self.assertEqual(b"", reader.read(size, 1)) + + reader.reset() + reader.seek(0) + buffer = bytearray() + while True: + sout = reader.read(size) + if not len(sout): + break + buffer += sout + self.assertEqual(data, buffer) + self.assertEqual(b"", reader.read(size)) + + reader.reset() + reader.seek(0) + for byte in data: + sout = reader.read(-1, 1) + self.assertEqual(bytes((byte,)), sout) + self.assertEqual(b"", reader.read(-1, 1)) + self.assertEqual(b"", reader.read(-1, 1)) def test_readline(self): for encoding in bytes_transform_encodings: @@ -2598,6 +2665,27 @@ bad_input.decode("rot_13") self.assertIsNone(failure.exception.__cause__) + def test_decode_past_end(self): + """Should not decode a second stream past the end of the first""" + concatenable = {"hex_codec", "base64_codec", "quopri_codec"} + for 
encoding in set(bytes_transform_encodings) - concatenable: + with self.subTest(encoding=encoding): + encoded = codecs.encode(b"data", encoding) + + if encoding != "uu_codec": # Broken; see Issue 20132 + buffer = bytearray() + decoder = codecs.getincrementaldecoder(encoding)() + d1 = decoder.decode(encoded) + d2 = decoder.decode(encoded) + self.assertEqual(b"data", d1 + d2) + self.assertEqual(b"", decoder.decode(b"", final=True)) + + if encoding == "bz2_codec": # Concatenates both streams + continue + reader = codecs.getreader(encoding)(io.BytesIO(encoded * 2)) + self.assertEqual(b"data", reader.read()) + self.assertEqual(b"", reader.read()) + @unittest.skipUnless(zlib, "Requires zlib support") def test_custom_zlib_error_is_wrapped(self): # Check zlib codec gives a good error for malformed input @@ -2607,6 +2695,18 @@ self.assertIsInstance(failure.exception.__cause__, type(failure.exception)) + @unittest.skipUnless(zlib, "Requires zlib support") + def test_zlib(self): + incomplete = codecs.encode(b"data", "zlib-codec")[:-1] + decoder = codecs.getdecoder("zlib-codec") + self.assertRaises(zlib.error, decoder, incomplete) + if False: # Incomplete data not detected by IncrementalDecoder + decoder = codecs.getincrementaldecoder("zlib-codec")() + self.assertRaises(ValueError, + decoder.decode, incomplete, final=True) + reader = codecs.getreader("zlib-codec")(io.BytesIO(incomplete)) + self.assertRaises(ValueError, reader.read) + def test_custom_hex_error_is_wrapped(self): # Check hex codec gives a good error for malformed input msg = "^decoding with 'hex_codec' codec failed"