Index: Objects/unicodeobject.c =================================================================== --- Objects/unicodeobject.c (révision 78805) +++ Objects/unicodeobject.c (copie de travail) @@ -158,6 +158,11 @@ PyObject **errorHandler,const char *encoding, const char *reason, const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); +static void raise_encode_exception(PyObject **exceptionObject, + const char *encoding, + const Py_UNICODE *unicode, Py_ssize_t size, + Py_ssize_t startpos, Py_ssize_t endpos, + const char *reason); /* Same for linebreaks */ static unsigned char ascii_linebreak[] = { @@ -2497,6 +2502,10 @@ { #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ +/* Maximum length in bytes of a surrogate escaped by the error handler: + * xmlcharrefreplace replaces U+DFFF with '&#57343;' */ +#define MAX_SURROGATE_SIZE 8 + Py_ssize_t i; /* index into s of next input byte */ PyObject *result; /* result string object */ char *p; /* next free byte in output buffer */ @@ -2561,31 +2570,80 @@ if (ch >= 0xd800 && ch <= 0xdfff) { Py_ssize_t newpos; PyObject *rep; - char *prep; - int k; rep = unicode_encode_call_errorhandler - (errors, &errorHandler, "utf-8", "surrogates not allowed", + (errors, &errorHandler, "utf-8", "surrogates not allowed", s, size, &exc, i-1, i, &newpos); if (!rep) goto error; /* Implementation limitations: only support error handler that return bytes, and only support up to four replacement bytes. 
*/ - if (!PyBytes_Check(rep)) { - PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes"); + if (PyBytes_Check(rep)) { + char *prep; + int k; + if (PyBytes_Size(rep) > 4) { + PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes"); + Py_DECREF(rep); + goto error; + } + prep = PyBytes_AsString(rep); + for(k = PyBytes_Size(rep); k > 0; k--) + *p++ = *prep++; Py_DECREF(rep); - goto error; - } - if (PyBytes_Size(rep) > 4) { - PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes"); + continue; + } else /* rep is unicode */ { + Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep); + Py_ssize_t repsize = PyUnicode_GET_SIZE(rep); + Py_UNICODE c; + Py_ssize_t k; + + if (repsize > MAX_SURROGATE_SIZE) { + PyErr_SetString(PyExc_TypeError, "error handler returned too many characters"); + Py_DECREF(rep); + goto error; + } + + if (nallocated / MAX_SURROGATE_SIZE != size + || result == NULL) + { + Py_ssize_t offset; + + if (result == NULL) + offset = p - stackbuf; + else + offset = p - PyBytes_AS_STRING(result); + + /* first surrogate character: allocate more memory */ + nallocated = size * MAX_SURROGATE_SIZE; + if (nallocated / MAX_SURROGATE_SIZE != size) { + /* overflow! 
*/ + PyErr_NoMemory(); + goto error; + } + if (result != NULL) { + if (_PyBytes_Resize(&result, nallocated) < 0) + goto error; + } else { + result = PyBytes_FromStringAndSize(NULL, nallocated); + if (result == NULL) + goto error; + Py_MEMCPY(PyBytes_AS_STRING(result), + stackbuf, offset); + } + p = PyBytes_AS_STRING(result) + offset; + } + + for(k=0; k 0; k--) - *p++ = *prep++; - Py_DECREF(rep); - continue; - } *p++ = (char)(0xe0 | (ch >> 12)); *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); Index: Lib/test/test_codecs.py =================================================================== --- Lib/test/test_codecs.py (révision 78805) +++ Lib/test/test_codecs.py (copie de travail) @@ -571,6 +571,16 @@ def test_lone_surrogates(self): self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8") self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8") + self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"), + b'[\\udc80]') + self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"), + b'[&#56448;]') + self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"), + b'[\x80]') + self.assertEqual("[\uDC80]".encode("utf-8", "ignore"), + b'[]') + self.assertEqual("[\uDC80]".encode("utf-8", "replace"), + b'[?]') def test_surrogatepass_handler(self): self.assertEquals("abc\ud800def".encode("utf-8", "surrogatepass"),