diff -r 889023da7454 Doc/library/codecs.rst --- a/Doc/library/codecs.rst Mon Mar 16 08:31:38 2015 +0200 +++ b/Doc/library/codecs.rst Mon Mar 16 08:40:04 2015 +0200 @@ -351,7 +351,10 @@ In addition, the following error handler The ``'namereplace'`` error handler. .. versionchanged:: 3.5 - The ``'backslashreplace'`` error handlers now works with decoding and + The ``'backslashreplace'`` error handler now works with decoding. + +.. versionchanged:: 3.5 + All standard error handlers except ``'surrogateescape'`` now support translating. The set of allowed values can be extended by registering a new named error diff -r 889023da7454 Lib/test/test_codeccallbacks.py --- a/Lib/test/test_codeccallbacks.py Mon Mar 16 08:31:38 2015 +0200 +++ b/Lib/test/test_codeccallbacks.py Mon Mar 16 08:40:04 2015 +0200 @@ -534,17 +534,12 @@ class CodecCallbackTest(unittest.TestCas codecs.xmlcharrefreplace_errors, UnicodeError("ouch") ) - # "xmlcharrefreplace" can only be used for encoding + # "xmlcharrefreplace" can not be used for decoding self.assertRaises( TypeError, codecs.xmlcharrefreplace_errors, UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch") ) - self.assertRaises( - TypeError, - codecs.xmlcharrefreplace_errors, - UnicodeTranslateError("\u3042", 0, 1, "ouch") - ) # Use the correct exception cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000, 99999, 100000, 999999, 1000000) @@ -557,6 +552,13 @@ class CodecCallbackTest(unittest.TestCas ), ("".join("&#%d;" % c for c in cs), 1 + len(s)) ) + self.assertEqual( + codecs.xmlcharrefreplace_errors( + UnicodeTranslateError("a" + s + "b", + 1, 1 + len(s), "ouch") + ), + ("".join("&#%d;" % c for c in cs), 1 + len(s)) + ) def test_badandgoodbackslashreplaceexceptions(self): # "backslashreplace" complains about a non-exception passed in @@ -629,17 +631,12 @@ class CodecCallbackTest(unittest.TestCas codecs.namereplace_errors, UnicodeError("ouch") ) - # "namereplace" can only be used for encoding + # "namereplace" can not be used for 
decoding self.assertRaises( TypeError, codecs.namereplace_errors, UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch") ) - self.assertRaises( - TypeError, - codecs.namereplace_errors, - UnicodeTranslateError("\u3042", 0, 1, "ouch") - ) # Use the correct exception tests = [ ("\u3042", "\\N{HIRAGANA LETTER A}"), @@ -661,6 +658,12 @@ class CodecCallbackTest(unittest.TestCas 1, 1 + len(s), "ouch")), (r, 1 + len(s)) ) + self.assertEqual( + codecs.namereplace_errors( + UnicodeTranslateError("a" + s + "b", + 1, 1 + len(s), "ouch")), + (r, 1 + len(s)) + ) def test_badandgoodsurrogateescapeexceptions(self): surrogateescape_errors = codecs.lookup_error('surrogateescape') @@ -720,12 +723,6 @@ class CodecCallbackTest(unittest.TestCas surrogatepass_errors, UnicodeError("ouch") ) - # "surrogatepass" can not be used for translating - self.assertRaises( - TypeError, - surrogatepass_errors, - UnicodeTranslateError("\ud800", 0, 1, "ouch") - ) # Use the correct exception for enc in ("utf-8", "utf-16le", "utf-16be", "utf-32le", "utf-32be"): with self.subTest(encoding=enc): @@ -739,6 +736,11 @@ class CodecCallbackTest(unittest.TestCas surrogatepass_errors, UnicodeDecodeError(enc, "a".encode(enc), 0, 1, "ouch") ) + self.assertRaises( + UnicodeTranslateError, + surrogatepass_errors, + UnicodeTranslateError("\u3042", 0, 1, "ouch") + ) for s in ("\ud800", "\udfff", "\ud800\udfff"): with self.subTest(str=s): self.assertRaises( @@ -746,6 +748,12 @@ class CodecCallbackTest(unittest.TestCas surrogatepass_errors, UnicodeEncodeError("ascii", s, 0, len(s), "ouch") ) + self.assertEqual( + surrogatepass_errors( + UnicodeTranslateError("a" + s + "b", + 1, 1 + len(s), "ouch")), + (s, 1 + len(s)) + ) tests = [ ("utf-8", "\ud800", b'\xed\xa0\x80', 3), ("utf-16le", "\ud800", b'\x00\xd8', 2), diff -r 889023da7454 Python/codecs.c --- a/Python/codecs.c Mon Mar 16 08:31:38 2015 +0200 +++ b/Python/codecs.c Mon Mar 16 08:40:04 2015 +0200 @@ -766,100 +766,108 @@ PyObject 
*PyCodec_ReplaceErrors(PyObject PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) { + PyObject *object; + Py_ssize_t i; + Py_ssize_t start; + Py_ssize_t end; + PyObject *res; + unsigned char *outp; + Py_ssize_t ressize; + Py_UCS4 ch; + if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { - PyObject *restuple; - PyObject *object; - Py_ssize_t i; - Py_ssize_t start; - Py_ssize_t end; - PyObject *res; - unsigned char *outp; - Py_ssize_t ressize; - Py_UCS4 ch; if (PyUnicodeEncodeError_GetStart(exc, &start)) return NULL; if (PyUnicodeEncodeError_GetEnd(exc, &end)) return NULL; if (!(object = PyUnicodeEncodeError_GetObject(exc))) return NULL; - if (end - start > PY_SSIZE_T_MAX / (2+7+1)) - end = start + PY_SSIZE_T_MAX / (2+7+1); - for (i = start, ressize = 0; i < end; ++i) { - /* object is guaranteed to be "ready" */ - ch = PyUnicode_READ_CHAR(object, i); - if (ch<10) - ressize += 2+1+1; - else if (ch<100) - ressize += 2+2+1; - else if (ch<1000) - ressize += 2+3+1; - else if (ch<10000) - ressize += 2+4+1; - else if (ch<100000) - ressize += 2+5+1; - else if (ch<1000000) - ressize += 2+6+1; - else - ressize += 2+7+1; - } - /* allocate replacement */ - res = PyUnicode_New(ressize, 127); - if (res == NULL) { - Py_DECREF(object); + } + else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) { + if (PyUnicodeTranslateError_GetStart(exc, &start)) return NULL; - } - outp = PyUnicode_1BYTE_DATA(res); - /* generate replacement */ - for (i = start; i < end; ++i) { - int digits; - int base; - ch = PyUnicode_READ_CHAR(object, i); - *outp++ = '&'; - *outp++ = '#'; - if (ch<10) { - digits = 1; - base = 1; - } - else if (ch<100) { - digits = 2; - base = 10; - } - else if (ch<1000) { - digits = 3; - base = 100; - } - else if (ch<10000) { - digits = 4; - base = 1000; - } - else if (ch<100000) { - digits = 5; - base = 10000; - } - else if (ch<1000000) { - digits = 6; - base = 100000; - } - else { - digits = 7; - base = 1000000; - } - while (digits-->0) { - *outp++ = '0' 
+ ch/base; - ch %= base; - base /= 10; - } - *outp++ = ';'; - } - assert(_PyUnicode_CheckConsistency(res, 1)); - restuple = Py_BuildValue("(Nn)", res, end); - Py_DECREF(object); - return restuple; + if (PyUnicodeTranslateError_GetEnd(exc, &end)) + return NULL; + if (!(object = PyUnicodeTranslateError_GetObject(exc))) + return NULL; } else { wrong_exception_type(exc); return NULL; } + + if (end - start > PY_SSIZE_T_MAX / (2+7+1)) + end = start + PY_SSIZE_T_MAX / (2+7+1); + for (i = start, ressize = 0; i < end; ++i) { + /* object is guaranteed to be "ready" */ + ch = PyUnicode_READ_CHAR(object, i); + if (ch<10) + ressize += 2+1+1; + else if (ch<100) + ressize += 2+2+1; + else if (ch<1000) + ressize += 2+3+1; + else if (ch<10000) + ressize += 2+4+1; + else if (ch<100000) + ressize += 2+5+1; + else if (ch<1000000) + ressize += 2+6+1; + else + ressize += 2+7+1; + } + /* allocate replacement */ + res = PyUnicode_New(ressize, 127); + if (res == NULL) { + Py_DECREF(object); + return NULL; + } + outp = PyUnicode_1BYTE_DATA(res); + /* generate replacement */ + for (i = start; i < end; ++i) { + int digits; + int base; + ch = PyUnicode_READ_CHAR(object, i); + *outp++ = '&'; + *outp++ = '#'; + if (ch<10) { + digits = 1; + base = 1; + } + else if (ch<100) { + digits = 2; + base = 10; + } + else if (ch<1000) { + digits = 3; + base = 100; + } + else if (ch<10000) { + digits = 4; + base = 1000; + } + else if (ch<100000) { + digits = 5; + base = 10000; + } + else if (ch<1000000) { + digits = 6; + base = 100000; + } + else { + digits = 7; + base = 1000000; + } + while (digits-->0) { + *outp++ = '0' + ch/base; + ch %= base; + base /= 10; + } + *outp++ = ';'; + } + assert(_PyUnicode_CheckConsistency(res, 1)); + Py_DECREF(object); + return Py_BuildValue("(Nn)", res, end); } PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) @@ -977,96 +985,106 @@ static int ucnhash_initialized = 0; PyObject *PyCodec_NameReplaceErrors(PyObject *exc) { + PyObject *object; + Py_ssize_t i; + Py_ssize_t 
start; + Py_ssize_t end; + PyObject *res; + unsigned char *outp; + Py_ssize_t ressize; + int replsize; + Py_UCS4 c; + char buffer[256]; /* NAME_MAXLEN */ + if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { - PyObject *restuple; - PyObject *object; - Py_ssize_t i; - Py_ssize_t start; - Py_ssize_t end; - PyObject *res; - unsigned char *outp; - Py_ssize_t ressize; - int replsize; - Py_UCS4 c; - char buffer[256]; /* NAME_MAXLEN */ if (PyUnicodeEncodeError_GetStart(exc, &start)) return NULL; if (PyUnicodeEncodeError_GetEnd(exc, &end)) return NULL; if (!(object = PyUnicodeEncodeError_GetObject(exc))) return NULL; - if (!ucnhash_initialized) { - /* load the unicode data module */ - ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( - PyUnicodeData_CAPSULE_NAME, 1); - ucnhash_initialized = 1; - } - for (i = start, ressize = 0; i < end; ++i) { - /* object is guaranteed to be "ready" */ - c = PyUnicode_READ_CHAR(object, i); - if (ucnhash_CAPI && - ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) { - replsize = 1+1+1+(int)strlen(buffer)+1; - } - else if (c >= 0x10000) { - replsize = 1+1+8; - } - else if (c >= 0x100) { - replsize = 1+1+4; - } - else - replsize = 1+1+2; - if (ressize > PY_SSIZE_T_MAX - replsize) - break; - ressize += replsize; - } - end = i; - res = PyUnicode_New(ressize, 127); - if (res==NULL) + } + else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) { + if (PyUnicodeTranslateError_GetStart(exc, &start)) return NULL; - for (i = start, outp = PyUnicode_1BYTE_DATA(res); - i < end; ++i) { - c = PyUnicode_READ_CHAR(object, i); - *outp++ = '\\'; - if (ucnhash_CAPI && - ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) { - *outp++ = 'N'; - *outp++ = '{'; - strcpy((char *)outp, buffer); - outp += strlen(buffer); - *outp++ = '}'; - continue; - } - if (c >= 0x00010000) { - *outp++ = 'U'; - *outp++ = Py_hexdigits[(c>>28)&0xf]; - *outp++ = Py_hexdigits[(c>>24)&0xf]; - *outp++ = Py_hexdigits[(c>>20)&0xf]; - *outp++ = 
Py_hexdigits[(c>>16)&0xf]; - *outp++ = Py_hexdigits[(c>>12)&0xf]; - *outp++ = Py_hexdigits[(c>>8)&0xf]; - } - else if (c >= 0x100) { - *outp++ = 'u'; - *outp++ = Py_hexdigits[(c>>12)&0xf]; - *outp++ = Py_hexdigits[(c>>8)&0xf]; - } - else - *outp++ = 'x'; - *outp++ = Py_hexdigits[(c>>4)&0xf]; - *outp++ = Py_hexdigits[c&0xf]; - } - - assert(outp == PyUnicode_1BYTE_DATA(res) + ressize); - assert(_PyUnicode_CheckConsistency(res, 1)); - restuple = Py_BuildValue("(Nn)", res, end); - Py_DECREF(object); - return restuple; + if (PyUnicodeTranslateError_GetEnd(exc, &end)) + return NULL; + if (!(object = PyUnicodeTranslateError_GetObject(exc))) + return NULL; } else { wrong_exception_type(exc); return NULL; } + + if (!ucnhash_initialized) { + /* load the unicode data module */ + ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( + PyUnicodeData_CAPSULE_NAME, 1); + ucnhash_initialized = 1; + } + for (i = start, ressize = 0; i < end; ++i) { + /* object is guaranteed to be "ready" */ + c = PyUnicode_READ_CHAR(object, i); + if (ucnhash_CAPI && + ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) { + replsize = 1+1+1+(int)strlen(buffer)+1; + } + else if (c >= 0x10000) { + replsize = 1+1+8; + } + else if (c >= 0x100) { + replsize = 1+1+4; + } + else + replsize = 1+1+2; + if (ressize > PY_SSIZE_T_MAX - replsize) + break; + ressize += replsize; + } + end = i; + res = PyUnicode_New(ressize, 127); + if (res==NULL) { + Py_DECREF(object); + return NULL; + } + for (i = start, outp = PyUnicode_1BYTE_DATA(res); + i < end; ++i) { + c = PyUnicode_READ_CHAR(object, i); + *outp++ = '\\'; + if (ucnhash_CAPI && + ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) { + *outp++ = 'N'; + *outp++ = '{'; + strcpy((char *)outp, buffer); + outp += strlen(buffer); + *outp++ = '}'; + continue; + } + if (c >= 0x00010000) { + *outp++ = 'U'; + *outp++ = Py_hexdigits[(c>>28)&0xf]; + *outp++ = Py_hexdigits[(c>>24)&0xf]; + *outp++ = Py_hexdigits[(c>>20)&0xf]; + *outp++ = 
Py_hexdigits[(c>>16)&0xf]; + *outp++ = Py_hexdigits[(c>>12)&0xf]; + *outp++ = Py_hexdigits[(c>>8)&0xf]; + } + else if (c >= 0x100) { + *outp++ = 'u'; + *outp++ = Py_hexdigits[(c>>12)&0xf]; + *outp++ = Py_hexdigits[(c>>8)&0xf]; + } + else + *outp++ = 'x'; + *outp++ = Py_hexdigits[(c>>4)&0xf]; + *outp++ = Py_hexdigits[c&0xf]; + } + + assert(outp == PyUnicode_1BYTE_DATA(res) + ressize); + assert(_PyUnicode_CheckConsistency(res, 1)); + Py_DECREF(object); + return Py_BuildValue("(Nn)", res, end); } #define ENC_UNKNOWN -1 @@ -1140,7 +1158,6 @@ get_standard_encoding(const char *encodi static PyObject * PyCodec_SurrogatePassErrors(PyObject *exc) { - PyObject *restuple; PyObject *object; PyObject *encode; char *encoding; @@ -1222,10 +1239,8 @@ PyCodec_SurrogatePassErrors(PyObject *ex break; } } - restuple = Py_BuildValue("(On)", res, end); - Py_DECREF(res); Py_DECREF(object); - return restuple; + return Py_BuildValue("(Nn)", res, end); } else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { unsigned char *p; @@ -1297,6 +1312,29 @@ PyCodec_SurrogatePassErrors(PyObject *ex return NULL; return Py_BuildValue("(Nn)", res, start + bytelength); } + else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) { + if (PyUnicodeTranslateError_GetStart(exc, &start)) + return NULL; + if (PyUnicodeTranslateError_GetEnd(exc, &end)) + return NULL; + if (!(object = PyUnicodeTranslateError_GetObject(exc))) + return NULL; + for (i = start; i < end; i++) { + /* object is guaranteed to be "ready" */ + Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); + if (!Py_UNICODE_IS_SURROGATE(ch)) { + /* Not a surrogate, fail with original exception */ + PyErr_SetObject(PyExceptionInstance_Class(exc), exc); + Py_DECREF(object); + return NULL; + } + } + res = PyUnicode_Substring(object, start, end); + Py_DECREF(object); + if (res == NULL) + return NULL; + return Py_BuildValue("(Nn)", res, i); + } else { wrong_exception_type(exc); return NULL;