Index: Lib/_pyio.py =================================================================== --- Lib/_pyio.py (révision 77361) +++ Lib/_pyio.py (copie de travail) @@ -16,6 +16,8 @@ from io import __all__ from io import SEEK_SET, SEEK_CUR, SEEK_END +from codecs import BOMS + # open() uses st_blksize whenever we can DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes @@ -1428,21 +1430,7 @@ raise TypeError("illegal newline type: %r" % (type(newline),)) if newline not in (None, "", "\n", "\r", "\r\n"): raise ValueError("illegal newline value: %r" % (newline,)) - if encoding is None: - try: - encoding = os.device_encoding(buffer.fileno()) - except (AttributeError, UnsupportedOperation): - pass - if encoding is None: - try: - import locale - except ImportError: - # Importing locale may fail if Python is being built - encoding = "ascii" - else: - encoding = locale.getpreferredencoding() - - if not isinstance(encoding, str): + if encoding is not None and not isinstance(encoding, str): raise ValueError("invalid encoding: %r" % encoding) if errors is None: @@ -1453,7 +1441,20 @@ self.buffer = buffer self._line_buffering = line_buffering - self._encoding = encoding + if encoding == "BOM": + if self.writable(): + raise ValueError( + "BOM encoding can only be used to read a text file, " + "not for writing") + self._encoding = None + self._bom_checked = False + else: + if encoding is None: + self._encoding = self._create_encoding() + else: + self._encoding = encoding + self._bom_checked = True + self._has_bom = False self._errors = errors self._readuniversal = not newline self._readtranslate = newline is None @@ -1562,6 +1563,41 @@ self._decoder.reset() return length + def _create_encoding(self): + try: + encoding = os.device_encoding(self.buffer.fileno()) + except (AttributeError, UnsupportedOperation): + encoding = None + + if not encoding: + try: + import locale + except ImportError: + # Importing locale may fail if Python is being built + encoding = "ascii" + else: + encoding = 
locale.getpreferredencoding() + return encoding + + def _search_bom(self, chunk): + for encoding, bom in sorted(BOMS.items(), key=lambda item: len(item[1]), reverse=True): + if chunk.startswith(bom): + self._encoding, self._has_bom = encoding, True + self._get_decoder() + return chunk[len(bom):] + self._has_bom = False + self._encoding = self._create_encoding() + self._get_decoder() + return chunk + + def _check_bom(self, chunk): + if not self._bom_checked: + self._bom_checked = True + chunk = self._search_bom(chunk) + elif self._decoder is None: + self._get_decoder() + return chunk + def _get_encoder(self): make_encoder = codecs.getincrementalencoder(self._encoding) self._encoder = make_encoder(self._errors) @@ -1610,9 +1646,12 @@ # some of it may remain buffered in the decoder, yet to be # converted. - if self._decoder is None: - raise ValueError("no decoder") + # Read a chunk, decode it, and put the result in self._decoded_chars. + input_chunk = self.buffer.read1(self._CHUNK_SIZE) + eof = not input_chunk + input_chunk = self._check_bom(input_chunk) + if self._telling: # To prepare for tell(), we need to snapshot a point in the # file where the decoder's input buffer is empty. @@ -1621,9 +1660,6 @@ # Given this, we know there was a valid snapshot point # len(dec_buffer) bytes ago with decoder state (b'', dec_flags). - # Read a chunk, decode it, and put the result in self._decoded_chars. - input_chunk = self.buffer.read1(self._CHUNK_SIZE) - eof = not input_chunk self._set_decoded_chars(self._decoder.decode(input_chunk, eof)) if self._telling: @@ -1769,6 +1805,8 @@ # Restore the decoder to its state from the safe start point. if cookie == 0 and self._decoder: + if self._bom_checked and self._has_bom: + self._bom_checked = False self._decoder.reset() elif self._decoder or dec_flags or chars_to_skip: self._decoder = self._decoder or self._get_decoder() @@ -1804,11 +1842,12 @@ self._checkReadable() if n is None: n = -1 - decoder = self._decoder or self._get_decoder() if n < 0: # Read everything. 
- result = (self._get_decoded_chars() + - decoder.decode(self.buffer.read(), final=True)) + result = self._get_decoded_chars() + chunk = self.buffer.read() + chunk = self._check_bom(chunk) + result += self._decoder.decode(chunk, final=True) self._set_decoded_chars('') self._snapshot = None return result @@ -1842,10 +1881,6 @@ line = self._get_decoded_chars() start = 0 - # Make the decoder if it doesn't already exist. - if not self._decoder: - self._get_decoder() - pos = endpos = None while True: if self._readtranslate: Index: Lib/test/test_io.py =================================================================== --- Lib/test/test_io.py (révision 77361) +++ Lib/test/test_io.py (copie de travail) @@ -1946,6 +1946,29 @@ self.assertEquals(f.read(), data * 2) self.assertEquals(buf.getvalue(), (data * 2).encode(encoding)) + def test_encoding_bom(self): + filename = support.TESTFN + text = "abc\ndef\n123" + lines = text.splitlines(True) + tests = ("utf-8-sig", + "utf-16", + "utf-32") + for encoding in tests: + with self.open(filename, 'w', encoding=encoding) as f: + f.write(text) + + with self.open(filename, encoding="BOM") as f: + self.assertEquals(f.read(), text) + f.seek(0) + self.assertEquals(f.read(), text) + + with self.open(filename, encoding="BOM") as f: + self.assertEquals(f.readlines(), lines) + f.seek(0) + self.assertEquals(f.readlines(), lines) + + self.assertRaises(ValueError, self.open, filename, "w", encoding="BOM") + def test_unreadable(self): class UnReadable(self.BytesIO): def readable(self): Index: Lib/codecs.py =================================================================== --- Lib/codecs.py (révision 77361) +++ Lib/codecs.py (copie de travail) @@ -47,6 +47,14 @@ # UTF-32, big endian BOM_UTF32_BE = b'\x00\x00\xfe\xff' +BOMS = { + "UTF_32_LE": BOM_UTF32_LE, + "UTF_32_BE": BOM_UTF32_BE, + "UTF_8": BOM_UTF8, + "UTF_16_LE": BOM_UTF16_LE, + "UTF_16_BE": BOM_UTF16_BE, +} + if sys.byteorder == 'little': # UTF-16, native endianness