Summary of this patch (text before the first "Index:" line is ignored by
patch tools):

  * IncrementalNewlineDecoder.__init__ takes a new second positional
    argument, `encoding`.  The carriage return that decode() holds back
    between calls is now stored as '\r' encoded in that encoding, with a
    leading UTF-16/UTF-32 byte-order mark stripped (self.crbytes), instead
    of the hard-coded single byte b'\r'.
  * decode() re-injects the held-back bytes in front of the wrapped
    decoder's own pending buffer when that buffer is non-empty, and in
    front of the new input otherwise.
  * getstate()/setstate() keep the held-back bytes at the start of the
    state buffer rather than at the end.
  * The caller in io.py passes self.encoding to the new parameter.
  * Tests: the test codec gets latin-1's encode function, the newline test
    also runs over the UTF-16 and UTF-32 variants, and the direct
    IncrementalNewlineDecoder calls pass "utf-8".

NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy and checked against the hunk line counts.
Spacing inside context lines may differ from the original files, so apply
with whitespace-insensitive matching (e.g. `patch -l`) if a hunk is
rejected.

Index: Lib/io.py
===================================================================
--- Lib/io.py (révision 67670)
+++ Lib/io.py (copie de travail)
@@ -1280,17 +1280,32 @@
     translate=False, it ensures that the newline sequence is returned in
     one piece.
     """
-    def __init__(self, decoder, translate, errors='strict'):
+    BOMS = (
+        codecs.BOM_UTF32_LE, codecs.BOM_UTF16_LE,
+        codecs.BOM_UTF32_BE, codecs.BOM_UTF16_BE,
+    )
+    def __init__(self, decoder, encoding, translate, errors='strict'):
         codecs.IncrementalDecoder.__init__(self, errors=errors)
         self.buffer = b''
         self.translate = translate
         self.decoder = decoder
         self.seennl = 0
+        self.crbytes = '\r'.encode(encoding)
+        for bom in self.BOMS:
+            if not self.crbytes.startswith(bom):
+                continue
+            self.crbytes = self.crbytes[len(bom):]
+            break
 
     def decode(self, input, final=False):
         # decode input (with the eventual \r from a previous pass)
         if self.buffer:
-            input = self.buffer + input
+            buf, flag = self.decoder.getstate()
+            if buf:
+                buf = self.buffer + buf
+                self.decoder.setstate((buf, flag))
+            else:
+                input = self.buffer + input
 
         output = self.decoder.decode(input, final=final)
 
@@ -1298,7 +1313,7 @@
         # then readline() is sure to get \r\n in one pass
         if output.endswith("\r") and not final:
             output = output[:-1]
-            self.buffer = b'\r'
+            self.buffer = self.crbytes
         else:
             self.buffer = b''
 
@@ -1319,13 +1334,14 @@
 
     def getstate(self):
         buf, flag = self.decoder.getstate()
-        return buf + self.buffer, flag
+        return self.buffer + buf, flag
 
     def setstate(self, state):
         buf, flag = state
-        if buf.endswith(b'\r'):
-            self.buffer = b'\r'
-            buf = buf[:-1]
+        if buf.startswith(self.crbytes):
+            pos = len(self.crbytes)
+            self.buffer = buf[:pos]
+            buf = buf[pos:]
         else:
             self.buffer = b''
         self.decoder.setstate((buf, flag))
@@ -1506,7 +1522,7 @@
         make_decoder = codecs.getincrementaldecoder(self._encoding)
         decoder = make_decoder(self._errors)
         if self._readuniversal:
-            decoder = IncrementalNewlineDecoder(decoder, self._readtranslate)
+            decoder = IncrementalNewlineDecoder(decoder, self.encoding, self._readtranslate)
         self._decoder = decoder
         return decoder
 
Index: Lib/test/test_io.py
===================================================================
--- Lib/test/test_io.py (révision 67670)
+++ Lib/test/test_io.py (copie de travail)
@@ -679,8 +679,9 @@
     @classmethod
     def lookupTestDecoder(cls, name):
         if cls.codecEnabled and name == 'test_decoder':
+            latin1 = codecs.lookup('latin-1')
             return codecs.CodecInfo(
-                name='test_decoder', encode=None, decode=None,
+                name='test_decoder', encode=latin1.encode, decode=None,
                 incrementalencoder=None,
                 streamreader=None, streamwriter=None,
                 incrementaldecoder=cls)
@@ -840,9 +841,12 @@
             [ '\r\n', [ "unix\nwindows\r\n", "os9\rlast\nnonl" ] ],
             [ '\r', [ "unix\nwindows\r", "\nos9\r", "last\nnonl" ] ],
         ]
+        encodings = (
+            'utf-8', 'latin-1',
+            'utf-16', 'utf-16-le', 'utf-16-be',
+            'utf-32', 'utf-32-le', 'utf-32-be',
+        )
 
-        encodings = ('utf-8', 'latin-1')
-
         # Try a range of buffer sizes to test the case where \r is the last
         # character in TextIOWrapper._pending_line.
         for encoding in encodings:
@@ -1198,7 +1202,7 @@
     def test_newline_decoder(self):
         import codecs
         decoder = codecs.getincrementaldecoder("utf-8")()
-        decoder = io.IncrementalNewlineDecoder(decoder, translate=True)
+        decoder = io.IncrementalNewlineDecoder(decoder, "utf-8", translate=True)
 
         self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), "\u8888")
 
@@ -1230,7 +1234,7 @@
         self.assertEquals(decoder.decode(b'\n'), "\n")
 
         decoder = codecs.getincrementaldecoder("utf-8")()
-        decoder = io.IncrementalNewlineDecoder(decoder, translate=True)
+        decoder = io.IncrementalNewlineDecoder(decoder, "utf-8", translate=True)
         self.assertEquals(decoder.newlines, None)
         decoder.decode(b"abc\n\r")
         self.assertEquals(decoder.newlines, '\n')