Index: Lib/_pyio.py
===================================================================
--- Lib/_pyio.py (révision 70643)
+++ Lib/_pyio.py (copie de travail)
@@ -1397,6 +1397,15 @@
         self._snapshot = None # info for reconstructing decoder state
         self._seekable = self._telling = self.buffer.seekable()
 
+        if self._seekable and self.writable():
+            position = self.buffer.tell()
+            if position != 0:
+                try:
+                    self._get_encoder().setstate(0)
+                except LookupError:
+                    # Sometimes the encoder doesn't exist
+                    pass
+
     # self._snapshot is either None, or a tuple (dec_flags, next_input)
     # where dec_flags is the second (integer) item of the decoder state
     # and next_input is the chunk of input bytes that comes next after the
@@ -1693,6 +1702,17 @@
                 raise IOError("can't restore logical file position")
             self._decoded_chars_used = chars_to_skip
 
+        # Finally, reset the encoder (merely useful for proper BOM handling)
+        try:
+            encoder = self._encoder or self._get_encoder()
+        except LookupError:
+            # Sometimes the encoder doesn't exist
+            pass
+        else:
+            if cookie != 0:
+                encoder.setstate(0)
+            else:
+                encoder.reset()
         return cookie
 
     def read(self, n=None):
Index: Lib/test/test_io.py
===================================================================
--- Lib/test/test_io.py (révision 70643)
+++ Lib/test/test_io.py (copie de travail)
@@ -1819,6 +1819,37 @@
 
         self.assertEqual(buffer.seekable(), txt.seekable())
 
+    def test_append_bom(self):
+        # The BOM is not written again when appending to a non-empty file
+        filename = support.TESTFN
+        for charset in ('utf-8-sig', 'utf-16', 'utf-32'):
+            with self.open(filename, 'w', encoding=charset) as f:
+                f.write('aaa')
+                pos = f.tell()
+            with self.open(filename, 'rb') as f:
+                self.assertEquals(f.read(), 'aaa'.encode(charset))
+
+            with self.open(filename, 'a', encoding=charset) as f:
+                f.write('xxx')
+            with self.open(filename, 'rb') as f:
+                self.assertEquals(f.read(), 'aaaxxx'.encode(charset))
+
+    def test_seek_bom(self):
+        # Same test, but when seeking manually
+        filename = support.TESTFN
+        for charset in ('utf-8-sig', 'utf-16', 'utf-32'):
+            with self.open(filename, 'w', encoding=charset) as f:
+                f.write('aaa')
+                pos = f.tell()
+            with self.open(filename, 'r+', encoding=charset) as f:
+                f.seek(pos)
+                f.write('zzz')
+                f.seek(0)
+                f.write('bbb')
+            with self.open(filename, 'rb') as f:
+                self.assertEquals(f.read(), 'bbbzzz'.encode(charset))
+
+
 class CTextIOWrapperTest(TextIOWrapperTest):
 
     def test_initialization(self):
Index: Modules/_textio.c
===================================================================
--- Modules/_textio.c (révision 70643)
+++ Modules/_textio.c (copie de travail)
@@ -632,6 +632,8 @@
     char telling;
     /* Specialized encoding func (see below) */
     encodefunc_t encodefunc;
+    /* Whether or not it's the start of the stream */
+    char encoding_start_of_stream;
 
     /* Reads and writes are internally buffered in order to speed things up.
        However, any read will first flush the write buffer if itsn't empty.
@@ -692,23 +694,52 @@
 static PyObject *
 utf16_encode(PyTextIOWrapperObject *self, PyObject *text)
 {
-    PyObject *res;
-    res = PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text),
-                                PyUnicode_GET_SIZE(text),
-                                PyBytes_AS_STRING(self->errors), 0);
-    if (res == NULL)
-        return NULL;
-    /* Next writes will skip the BOM and use native byte ordering */
+    if (!self->encoding_start_of_stream) {
+        /* Skip the BOM and use native byte ordering */
 #if defined(WORDS_BIGENDIAN)
-    self->encodefunc = (encodefunc_t) utf16be_encode;
+        return utf16be_encode(self, text);
 #else
-    self->encodefunc = (encodefunc_t) utf16le_encode;
+        return utf16le_encode(self, text);
 #endif
-    return res;
+    }
+    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text),
+                                 PyUnicode_GET_SIZE(text),
+                                 PyBytes_AS_STRING(self->errors), 0);
 }
 
+static PyObject *
+utf32be_encode(PyTextIOWrapperObject *self, PyObject *text)
+{
+    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text),
+                                 PyUnicode_GET_SIZE(text),
+                                 PyBytes_AS_STRING(self->errors), 1);
+}
 
 static PyObject *
+utf32le_encode(PyTextIOWrapperObject *self, PyObject *text)
+{
+    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text),
+                                 PyUnicode_GET_SIZE(text),
+                                 PyBytes_AS_STRING(self->errors), -1);
+}
+
+static PyObject *
+utf32_encode(PyTextIOWrapperObject *self, PyObject *text)
+{
+    if (!self->encoding_start_of_stream) {
+        /* Skip the BOM and use native byte ordering */
+#if defined(WORDS_BIGENDIAN)
+        return utf32be_encode(self, text);
+#else
+        return utf32le_encode(self, text);
+#endif
+    }
+    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text),
+                                 PyUnicode_GET_SIZE(text),
+                                 PyBytes_AS_STRING(self->errors), 0);
+}
+
+static PyObject *
 utf8_encode(PyTextIOWrapperObject *self, PyObject *text)
 {
     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(text),
@@ -734,10 +765,13 @@
 encodefuncentry encodefuncs[] = {
     {"ascii", (encodefunc_t) ascii_encode},
     {"iso8859-1", (encodefunc_t) latin1_encode},
+    {"utf-8", (encodefunc_t) utf8_encode},
     {"utf-16-be", (encodefunc_t) utf16be_encode},
     {"utf-16-le", (encodefunc_t) utf16le_encode},
     {"utf-16", (encodefunc_t) utf16_encode},
-    {"utf-8", (encodefunc_t) utf8_encode},
+    {"utf-32-be", (encodefunc_t) utf32be_encode},
+    {"utf-32-le", (encodefunc_t) utf32le_encode},
+    {"utf-32", (encodefunc_t) utf32_encode},
     {NULL, NULL}
 };
 
@@ -962,6 +996,31 @@
     self->seekable = self->telling = PyObject_IsTrue(res);
     Py_DECREF(res);
 
+    if (self->seekable && self->encoder) {
+        PyObject *cookieObj;
+        int cmp;
+
+        self->encoding_start_of_stream = 1;
+
+        cookieObj = PyObject_CallMethod(buffer, "tell", NULL);
+        if (cookieObj == NULL)
+            goto error;
+
+        cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ);
+        Py_DECREF(cookieObj);
+        if (cmp < 0) {
+            goto error;
+        }
+
+        if (cmp == 0) {
+            self->encoding_start_of_stream = 0;
+            res = PyObject_CallMethod(self->encoder, "setstate", "O", _PyIO_zero);
+            if (res == NULL)
+                goto error;
+            Py_DECREF(res);
+        }
+    }
+
     self->ok = 1;
     return 0;
 
@@ -1150,8 +1209,10 @@
         needflush = 1;
 
     /* XXX What if we were just reading? */
-    if (self->encodefunc != NULL)
+    if (self->encodefunc != NULL) {
         b = (*self->encodefunc)((PyObject *) self, text);
+        self->encoding_start_of_stream = 0;
+    }
     else
         b = PyObject_CallMethodObjArgs(self->encoder,
                                        _PyIO_str_encode, text, NULL);
@@ -1800,24 +1861,38 @@
     return 0;
 }
 
+static int
+_TextIOWrapper_encoder_setstate(PyTextIOWrapperObject *self,
+                                CookieStruct *cookie)
+{
+    PyObject *res;
+    /* Same as _TextIOWrapper_decoder_setstate() above. */
+    if (cookie->start_pos == 0 && cookie->dec_flags == 0) {
+        res = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_reset, NULL);
+        self->encoding_start_of_stream = 1;
+    }
+    else {
+        res = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_setstate,
+                                         _PyIO_zero, NULL);
+        self->encoding_start_of_stream = 0;
+    }
+    if (res == NULL)
+        return -1;
+    Py_DECREF(res);
+    return 0;
+}
+
 static PyObject *
 TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
 {
     PyObject *cookieObj, *posobj;
     CookieStruct cookie;
     int whence = 0;
-    static PyObject *zero = NULL;
     PyObject *res;
     int cmp;
 
     CHECK_INITIALIZED(self);
 
-    if (zero == NULL) {
-        zero = PyLong_FromLong(0L);
-        if (zero == NULL)
-            return NULL;
-    }
-
     if (!PyArg_ParseTuple(args, "O|i:seek", &cookieObj, &whence))
         return NULL;
     CHECK_CLOSED(self);
@@ -1832,7 +1907,7 @@
 
     if (whence == 1) {
         /* seek relative to current position */
-        cmp = PyObject_RichCompareBool(cookieObj, zero, Py_EQ);
+        cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ);
         if (cmp < 0)
             goto fail;
 
@@ -1853,7 +1928,7 @@
     else if (whence == 2) {
         /* seek relative to end of file */
 
-        cmp = PyObject_RichCompareBool(cookieObj, zero, Py_EQ);
+        cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ);
         if (cmp < 0)
             goto fail;
 
@@ -1887,7 +1962,7 @@
         goto fail;
     }
 
-    cmp = PyObject_RichCompareBool(cookieObj, zero, Py_LT);
+    cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_LT);
     if (cmp < 0)
         goto fail;
 
@@ -1966,6 +2041,11 @@
         goto fail;
     }
 
+    /* Finally, reset the encoder (merely useful for proper BOM handling) */
+    if (self->encoder) {
+        if (_TextIOWrapper_encoder_setstate(self, &cookie) < 0)
+            goto fail;
+    }
     return cookieObj;
   fail:
     Py_XDECREF(cookieObj);
Index: Modules/io.c
===================================================================
--- Modules/io.c (révision 70643)
+++ Modules/io.c (copie de travail)
@@ -41,6 +41,7 @@
 PyObject *_PyIO_str_reset;
 PyObject *_PyIO_str_seek;
 PyObject *_PyIO_str_seekable;
+PyObject *_PyIO_str_setstate;
 PyObject *_PyIO_str_tell;
 PyObject *_PyIO_str_truncate;
 PyObject *_PyIO_str_writable;
@@ -48,6 +49,7 @@
 
 PyObject *_PyIO_empty_str;
 PyObject *_PyIO_empty_bytes;
+PyObject *_PyIO_zero;
 
 
 PyDoc_STRVAR(module_doc,
@@ -734,6 +736,8 @@
         goto fail;
     if (!(_PyIO_str_seekable = PyUnicode_InternFromString("seekable")))
         goto fail;
+    if (!(_PyIO_str_setstate = PyUnicode_InternFromString("setstate")))
+        goto fail;
     if (!(_PyIO_str_tell = PyUnicode_InternFromString("tell")))
         goto fail;
     if (!(_PyIO_str_truncate = PyUnicode_InternFromString("truncate")))
@@ -747,6 +751,8 @@
         goto fail;
     if (!(_PyIO_empty_bytes = PyBytes_FromStringAndSize(NULL, 0)))
         goto fail;
+    if (!(_PyIO_zero = PyLong_FromLong(0L)))
+        goto fail;
 
     state->initialized = 1;
 
Index: Modules/_iomodule.h
===================================================================
--- Modules/_iomodule.h (révision 70643)
+++ Modules/_iomodule.h (copie de travail)
@@ -141,6 +141,7 @@
 extern PyObject *_PyIO_str_reset;
 extern PyObject *_PyIO_str_seek;
 extern PyObject *_PyIO_str_seekable;
+extern PyObject *_PyIO_str_setstate;
 extern PyObject *_PyIO_str_tell;
 extern PyObject *_PyIO_str_truncate;
 extern PyObject *_PyIO_str_writable;
@@ -148,3 +149,4 @@
 
 extern PyObject *_PyIO_empty_str;
 extern PyObject *_PyIO_empty_bytes;
+extern PyObject *_PyIO_zero;