Index: Lib/_pyio.py
===================================================================
--- Lib/_pyio.py	(révision 77361)
+++ Lib/_pyio.py	(copie de travail)
@@ -16,6 +16,18 @@
 from io import __all__
 from io import SEEK_SET, SEEK_CUR, SEEK_END
 
+from codecs import BOM_UTF8, BOM_UTF16_LE, BOM_UTF16_BE, BOM_UTF32_LE, BOM_UTF32_BE
+
+# Order matters: BOM_UTF32_LE starts with the bytes of BOM_UTF16_LE, so the
+# UTF-32 signatures must be tested before the UTF-16 ones.
+BOMS = (
+    (BOM_UTF8, "UTF_8"),
+    (BOM_UTF32_LE, "UTF_32_LE"),
+    (BOM_UTF32_BE, "UTF_32_BE"),
+    (BOM_UTF16_LE, "UTF_16_LE"),
+    (BOM_UTF16_BE, "UTF_16_BE"),
+)
+
 # open() uses st_blksize whenever we can
 DEFAULT_BUFFER_SIZE = 8 * 1024  # bytes
 
@@ -462,7 +474,7 @@
                 if not readahead:
                     return 1
                 n = (readahead.find(b"\n") + 1) or len(readahead)
-                if limit >= 0:
+                if (limit is not None) and (limit >= 0):
                     n = min(n, limit)
                 return n
         else:
@@ -1428,21 +1440,7 @@
             raise TypeError("illegal newline type: %r" % (type(newline),))
         if newline not in (None, "", "\n", "\r", "\r\n"):
             raise ValueError("illegal newline value: %r" % (newline,))
-        if encoding is None:
-            try:
-                encoding = os.device_encoding(buffer.fileno())
-            except (AttributeError, UnsupportedOperation):
-                pass
-            if encoding is None:
-                try:
-                    import locale
-                except ImportError:
-                    # Importing locale may fail if Python is being built
-                    encoding = "ascii"
-                else:
-                    encoding = locale.getpreferredencoding()
-
-        if not isinstance(encoding, str):
+        if encoding is not None and not isinstance(encoding, str):
             raise ValueError("invalid encoding: %r" % encoding)
 
         if errors is None:
@@ -1496,7 +1494,7 @@
 
     @property
     def encoding(self):
-        return self._encoding
+        return self._get_encoding()
 
     @property
     def errors(self):
@@ -1562,13 +1560,48 @@
             self._decoder.reset()
         return length
 
+    def _create_encoding(self):
+        """Return the device encoding, else the locale's preferred encoding."""
+        try:
+            encoding = os.device_encoding(self.buffer.fileno())
+        except (AttributeError, UnsupportedOperation):
+            encoding = None
+
+        if not encoding:
+            try:
+                import locale
+            except ImportError:
+                # Importing locale may fail if Python is being built
+                encoding = "ascii"
+            else:
+                encoding = locale.getpreferredencoding()
+        return encoding
+
+    def _get_encoding(self):
+        """Return self._encoding, computing the default on first use."""
+        if self._encoding is None:
+            self._encoding = self._create_encoding()
+        return self._encoding
+
+    def _search_bom(self, chunk):
+        # Guess the encoding by searching for a BOM and initialize the decoder.
+        # Return the chunk without the BOM if any.
+        for bom, encoding in BOMS:
+            if chunk.startswith(bom):
+                self._encoding = encoding
+                self._get_decoder()
+                return chunk[len(bom):]
+        self._encoding = self._create_encoding()
+        self._get_decoder()
+        return chunk
+
     def _get_encoder(self):
-        make_encoder = codecs.getincrementalencoder(self._encoding)
+        make_encoder = codecs.getincrementalencoder(self._get_encoding())
         self._encoder = make_encoder(self._errors)
         return self._encoder
 
     def _get_decoder(self):
-        make_decoder = codecs.getincrementaldecoder(self._encoding)
+        make_decoder = codecs.getincrementaldecoder(self._get_encoding())
         decoder = make_decoder(self._errors)
         if self._readuniversal:
             decoder = IncrementalNewlineDecoder(decoder, self._readtranslate)
@@ -1610,9 +1643,15 @@
         # some of it may remain buffered in the decoder, yet to be
         # converted.
 
-        if self._decoder is None:
-            raise ValueError("no decoder")
+        # Read a chunk, decode it, and put the result in self._decoded_chars.
+        input_chunk = self.buffer.read1(self._CHUNK_SIZE)
+        eof = not input_chunk
 
+        if self._encoding is None:
+            input_chunk = self._search_bom(input_chunk)
+        elif self._decoder is None:
+            self._get_decoder()
+
         if self._telling:
             # To prepare for tell(), we need to snapshot a point in the
             # file where the decoder's input buffer is empty.
@@ -1621,9 +1660,6 @@
             # Given this, we know there was a valid snapshot point
             # len(dec_buffer) bytes ago with decoder state (b'', dec_flags).
 
-        # Read a chunk, decode it, and put the result in self._decoded_chars.
-        input_chunk = self.buffer.read1(self._CHUNK_SIZE)
-        eof = not input_chunk
         self._set_decoded_chars(self._decoder.decode(input_chunk, eof))
 
         if self._telling:
@@ -1804,11 +1840,15 @@
         self._checkReadable()
         if n is None:
             n = -1
-        decoder = self._decoder or self._get_decoder()
         if n < 0:
             # Read everything.
-            result = (self._get_decoded_chars() +
-                      decoder.decode(self.buffer.read(), final=True))
+            result = self._get_decoded_chars()
+            chunk = self.buffer.read()
+            if self._encoding is None:
+                chunk = self._search_bom(chunk)
+            elif self._decoder is None:
+                self._get_decoder()
+            result += self._decoder.decode(chunk, final=True)
             self._set_decoded_chars('')
             self._snapshot = None
             return result
@@ -1842,10 +1882,6 @@
         line = self._get_decoded_chars()
 
         start = 0
-        # Make the decoder if it doesn't already exist.
-        if not self._decoder:
-            self._get_decoder()
-
         pos = endpos = None
         while True:
             if self._readtranslate: