diff -r 468d18bffdea Doc/c-api/codec.rst --- a/Doc/c-api/codec.rst Thu Nov 21 03:40:15 2013 +0100 +++ b/Doc/c-api/codec.rst Thu Nov 21 09:33:38 2013 +0200 @@ -116,3 +116,8 @@ Replace the unicode encode error with backslash escapes (``\x``, ``\u`` and ``\U``). +.. c:function:: PyObject* PyCodec_NameReplaceErrors(PyObject *exc) + + Replace the unicode encode error with `\N{...}` escapes. + + .. versionadded: 3.4 diff -r 468d18bffdea Doc/howto/unicode.rst --- a/Doc/howto/unicode.rst Thu Nov 21 03:40:15 2013 +0100 +++ b/Doc/howto/unicode.rst Thu Nov 21 09:33:38 2013 +0200 @@ -325,8 +325,9 @@ :meth:`~bytes.decode` method but supports a few more possible handlers. As well as ``'strict'``, ``'ignore'``, and ``'replace'`` (which in this case inserts a question mark instead of the unencodable character), there is -also ``'xmlcharrefreplace'`` (inserts an XML character reference) and -``backslashreplace`` (inserts a ``\uNNNN`` escape sequence). +also ``'xmlcharrefreplace'`` (inserts an XML character reference), +``backslashreplace`` (inserts a ``\uNNNN`` escape sequence) and +``namereplace`` (inserts a ``\N{...}`` escape sequence). The following example shows the different results:: @@ -346,6 +347,8 @@ b'ꀀabcd޴' >>> u.encode('ascii', 'backslashreplace') b'\\ua000abcd\\u07b4' + >>> u.encode('ascii', 'namereplace') + b'\\N{YI SYLLABLE IT}abcd\\u07b4' The low-level routines for registering and accessing the available encodings are found in the :mod:`codecs` module. Implementing new diff -r 468d18bffdea Doc/library/codecs.rst --- a/Doc/library/codecs.rst Thu Nov 21 03:40:15 2013 +0100 +++ b/Doc/library/codecs.rst Thu Nov 21 09:33:38 2013 +0200 @@ -97,6 +97,8 @@ reference (for encoding only) * ``'backslashreplace'``: replace with backslashed escape sequences (for encoding only) + * ``'namereplace'``: replace with ``\N{...}`` escape sequences (for + encoding only) * ``'surrogateescape'``: on decoding, replace with code points in the Unicode Private Use Area ranging from U+DC80 to U+DCFF. These private code points will then be turned back into the same bytes when the @@ -231,6 +233,11 @@ Implements the ``backslashreplace`` error handling (for encoding only): the unencodable character is replaced by a backslashed escape sequence. +.. function:: namereplace_errors(exception) + + Implements the ``namereplace`` error handling (for encoding only): the + unencodable character is replaced by a ``\N{...}`` escape sequence. + To simplify working with encoded files or stream, the module also defines these utility functions: @@ -361,6 +368,9 @@ | ``'backslashreplace'`` | Replace with backslashed escape sequences | | | (only for encoding). | +-------------------------+-----------------------------------------------+ +| ``'namereplace'`` | Replace with ``\N{...}`` escape sequences | +| | (only for encoding). | ++-------------------------+-----------------------------------------------+ | ``'surrogateescape'`` | Replace byte with surrogate U+DCxx, as defined| | | in :pep:`383`. | +-------------------------+-----------------------------------------------+ @@ -382,6 +392,9 @@ .. versionchanged:: 3.4 The ``'surrogatepass'`` error handlers now works with utf-16\* and utf-32\* codecs. +.. versionadded:: 3.4 + The ``'namereplace'`` error handler. + The set of allowed values can be extended via :meth:`register_error`. @@ -475,6 +488,8 @@ * ``'backslashreplace'`` Replace with backslashed escape sequences. + * ``'namereplace'`` Replace with ``\N{...}`` escape sequences. + The *errors* argument will be assigned to an attribute of the same name. Assigning to this attribute makes it possible to switch between different error handling strategies during the lifetime of the :class:`IncrementalEncoder` @@ -623,6 +638,8 @@ * ``'backslashreplace'`` Replace with backslashed escape sequences. + * ``'namereplace'`` Replace with ``\N{...}`` escape sequences. + The *errors* argument will be assigned to an attribute of the same name. Assigning to this attribute makes it possible to switch between different error handling strategies during the lifetime of the :class:`StreamWriter` object. diff -r 468d18bffdea Doc/library/functions.rst --- a/Doc/library/functions.rst Thu Nov 21 03:40:15 2013 +0100 +++ b/Doc/library/functions.rst Thu Nov 21 09:33:38 2013 +0200 @@ -947,6 +947,9 @@ replaces unsupported characters with Python's backslashed escape sequences. + * ``'namereplace'`` (also only supported when writing) + replaces unsupported characters with ``\N{...}`` escape sequences. + .. index:: single: universal newlines; open() built-in function diff -r 468d18bffdea Doc/library/io.rst --- a/Doc/library/io.rst Thu Nov 21 03:40:15 2013 +0100 +++ b/Doc/library/io.rst Thu Nov 21 09:33:38 2013 +0200 @@ -800,9 +800,10 @@ errors can lead to data loss.) ``'replace'`` causes a replacement marker (such as ``'?'``) to be inserted where there is malformed data. When writing, ``'xmlcharrefreplace'`` (replace with the appropriate XML character - reference) or ``'backslashreplace'`` (replace with backslashed escape - sequences) can be used. Any other error handling name that has been - registered with :func:`codecs.register_error` is also valid. + reference), ``'backslashreplace'`` (replace with backslashed escape + sequences) or ``'namereplace'`` (replace with ``\N{...}`` escape sequences) + can be used. Any other error handling name that has been registered with + :func:`codecs.register_error` is also valid. .. index:: single: universal newlines; io.TextIOWrapper class diff -r 468d18bffdea Include/codecs.h --- a/Include/codecs.h Thu Nov 21 03:40:15 2013 +0100 +++ b/Include/codecs.h Thu Nov 21 09:33:38 2013 +0200 @@ -174,6 +174,9 @@ /* replace the unicode encode error with backslash escapes (\x, \u and \U) */ PyAPI_FUNC(PyObject *) PyCodec_BackslashReplaceErrors(PyObject *exc); +/* replace the unicode encode error with backslash escapes (\N, \x, \u and \U) */ +PyAPI_FUNC(PyObject *) PyCodec_NameReplaceErrors(PyObject *exc); + PyAPI_DATA(const char *) Py_hexdigits; #ifdef __cplusplus diff -r 468d18bffdea Lib/codecs.py --- a/Lib/codecs.py Thu Nov 21 03:40:15 2013 +0100 +++ b/Lib/codecs.py Thu Nov 21 09:33:38 2013 +0200 @@ -22,6 +22,7 @@ "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE", "strict_errors", "ignore_errors", "replace_errors", "xmlcharrefreplace_errors", + "backslashreplace_errors", "namereplace_errors", "register_error", "lookup_error"] ### Constants @@ -1074,6 +1075,7 @@ replace_errors = lookup_error("replace") xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace") backslashreplace_errors = lookup_error("backslashreplace") + namereplace_errors = lookup_error("namereplace") except LookupError: # In --disable-unicode builds, these error handler are missing strict_errors = None @@ -1081,6 +1083,7 @@ replace_errors = None xmlcharrefreplace_errors = None backslashreplace_errors = None + namereplace_errors = None # Tell modulefinder that using codecs probably needs the encodings # package diff -r 468d18bffdea Lib/test/test_codeccallbacks.py --- a/Lib/test/test_codeccallbacks.py Thu Nov 21 03:40:15 2013 +0100 +++ b/Lib/test/test_codeccallbacks.py Thu Nov 21 09:33:38 2013 +0200 @@ -158,6 +158,22 @@ sout = b"a\xac\\u1234\xa4\\u8000\\U0010ffff" self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout) + def test_nameescape(self): + # Does the same as backslashescape, but prefers ``\N{...}`` escape + # sequences. + sin = "a\xac\u1234\u20ac\u8000\U0010ffff" + sout = (b'a\\N{NOT SIGN}\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}' + b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff') + self.assertEqual(sin.encode("ascii", "namereplace"), sout) + + sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}' + b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff') + self.assertEqual(sin.encode("latin-1", "namereplace"), sout) + + sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\xa4' + b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff') + self.assertEqual(sin.encode("iso-8859-15", "namereplace"), sout) + def test_decoding_callbacks(self): # This is a test for a decoding callback handler # that allows the decoding of the invalid sequence @@ -297,7 +313,7 @@ def test_longstrings(self): # test long strings to check for memory overflow problems errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", - "backslashreplace"] + "backslashreplace", "namereplace"] # register the handlers under different names, # to prevent the codec from recognizing the name for err in errors: @@ -611,6 +627,81 @@ ("\\udfff", 1) ) + def test_badandgoodnamereplaceexceptions(self): + # "namereplace" complains about a non-exception passed in + self.assertRaises( + TypeError, + codecs.namereplace_errors, + 42 + ) + # "namereplace" complains about the wrong exception types + self.assertRaises( + TypeError, + codecs.namereplace_errors, + UnicodeError("ouch") + ) + # "namereplace" can only be used for encoding + self.assertRaises( + TypeError, + codecs.namereplace_errors, + UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch") + ) + self.assertRaises( + TypeError, + codecs.namereplace_errors, + UnicodeTranslateError("\u3042", 0, 1, "ouch") + ) + # Use the correct exception + self.assertEqual( + codecs.namereplace_errors( + UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")), + ("\\N{HIRAGANA LETTER A}", 1) + ) + self.assertEqual( + codecs.namereplace_errors( + UnicodeEncodeError("ascii", "\x00", 0, 1, "ouch")), + ("\\x00", 1) + ) + self.assertEqual( + codecs.namereplace_errors( + UnicodeEncodeError("ascii", "\xff", 0, 1, "ouch")), + ("\\N{LATIN SMALL LETTER Y WITH DIAERESIS}", 1) + ) + self.assertEqual( + codecs.namereplace_errors( + UnicodeEncodeError("ascii", "\u0100", 0, 1, "ouch")), + ("\\N{LATIN CAPITAL LETTER A WITH MACRON}", 1) + ) + self.assertEqual( + codecs.namereplace_errors( + UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")), + ("\\uffff", 1) + ) + if SIZEOF_WCHAR_T > 0: + self.assertEqual( + codecs.namereplace_errors( + UnicodeEncodeError("ascii", "\U00010000", + 0, 1, "ouch")), + ("\\N{LINEAR B SYLLABLE B008 A}", 1) + ) + self.assertEqual( + codecs.namereplace_errors( + UnicodeEncodeError("ascii", "\U0010ffff", + 0, 1, "ouch")), + ("\\U0010ffff", 1) + ) + # Lone surrogates (regardless of unicode width) + self.assertEqual( + codecs.namereplace_errors( + UnicodeEncodeError("ascii", "\ud800", 0, 1, "ouch")), + ("\\ud800", 1) + ) + self.assertEqual( + codecs.namereplace_errors( + UnicodeEncodeError("ascii", "\udfff", 0, 1, "ouch")), + ("\\udfff", 1) + ) + def test_badhandlerresults(self): results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) ) encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15") @@ -651,6 +742,10 @@ codecs.backslashreplace_errors, codecs.lookup_error("backslashreplace") ) + self.assertEqual( + codecs.namereplace_errors, + codecs.lookup_error("namereplace") + ) def test_unencodablereplacement(self): def unencrepl(exc): @@ -804,7 +899,8 @@ class D(dict): def __getitem__(self, key): raise ValueError - for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.posreturn"): + for err in ("strict", "replace", "xmlcharrefreplace", + "backslashreplace", "namereplace", "test.posreturn"): self.assertRaises(UnicodeError, codecs.charmap_encode, "\xff", err, {0xff: None}) self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D()) self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300}) diff -r 468d18bffdea Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py Thu Nov 21 03:40:15 2013 +0100 +++ b/Lib/test/test_codecs.py Thu Nov 21 09:33:38 2013 +0200 @@ -306,6 +306,8 @@ self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding) self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"), "[\\udc80]".encode(self.encoding)) + self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"), + "[\\udc80]".encode(self.encoding)) self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"), "[�]".encode(self.encoding)) self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"), @@ -763,6 +765,7 @@ ('\udc80', 'ignore', b''), ('\udc80', 'replace', b'?'), ('\udc80', 'backslashreplace', b'\\udc80'), + ('\udc80', 'namereplace', b'\\udc80'), ('\udc80', 'surrogatepass', b'\xed\xb2\x80'), )) else: @@ -824,6 +827,8 @@ self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001") self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"), b'[\\udc80]') + self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"), + b'[\\udc80]') self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"), b'[�]') self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"), @@ -2639,6 +2644,8 @@ ('[\xff]', 'replace', b'[y]'), ('[\u20ac]', 'replace', b'[?]'), ('[\xff]', 'backslashreplace', b'[\\xff]'), + ('[\xff]', 'namereplace', + b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'), ('[\xff]', 'xmlcharrefreplace', b'[ÿ]'), )) self.check_decode(932, ( diff -r 468d18bffdea Python/codecs.c --- a/Python/codecs.c Thu Nov 21 03:40:15 2013 +0100 +++ b/Python/codecs.c Thu Nov 21 09:33:38 2013 +0200 @@ -9,6 +9,7 @@ ------------------------------------------------------------------------ */ #include "Python.h" +#include "ucnhash.h" #include const char *Py_hexdigits = "0123456789abcdef"; @@ -753,6 +754,96 @@ } } +static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; +static int ucnhash_initialized = 0; + +PyObject *PyCodec_NameReplaceErrors(PyObject *exc) +{ + if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { + PyObject *restuple; + PyObject *object; + Py_ssize_t i; + Py_ssize_t start; + Py_ssize_t end; + PyObject *res; + unsigned char *outp; + int ressize; + Py_UCS4 c; + char buffer[100]; + if (PyUnicodeEncodeError_GetStart(exc, &start)) + return NULL; + if (PyUnicodeEncodeError_GetEnd(exc, &end)) + return NULL; + if (!(object = PyUnicodeEncodeError_GetObject(exc))) + return NULL; + if (!ucnhash_initialized) { + /* load the unicode data module */ + ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( + PyUnicodeData_CAPSULE_NAME, 1); + ucnhash_initialized = 1; + } + buffer[sizeof(buffer)-1] = '\0'; + for (i = start, ressize = 0; i < end; ++i) { + /* object is guaranteed to be "ready" */ + c = PyUnicode_READ_CHAR(object, i); + if (ucnhash_CAPI && ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer)-1, 1)) { + ressize += 1+1+1+strlen(buffer)+1; + } + else if (c >= 0x10000) { + ressize += 1+1+8; + } + else if (c >= 0x100) { + ressize += 1+1+4; + } + else + ressize += 1+1+2; + } + res = PyUnicode_New(ressize, 127); + if (res==NULL) + return NULL; + for (i = start, outp = PyUnicode_1BYTE_DATA(res); + i < end; ++i) { + c = PyUnicode_READ_CHAR(object, i); + *outp++ = '\\'; + if (ucnhash_CAPI && ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer)-1, 1)) { + *outp++ = 'N'; + *outp++ = '{'; + strcpy((char *)outp, buffer); + outp += strlen(buffer); + *outp++ = '}'; + continue; + } + if (c >= 0x00010000) { + *outp++ = 'U'; + *outp++ = Py_hexdigits[(c>>28)&0xf]; + *outp++ = Py_hexdigits[(c>>24)&0xf]; + *outp++ = Py_hexdigits[(c>>20)&0xf]; + *outp++ = Py_hexdigits[(c>>16)&0xf]; + *outp++ = Py_hexdigits[(c>>12)&0xf]; + *outp++ = Py_hexdigits[(c>>8)&0xf]; + } + else if (c >= 0x100) { + *outp++ = 'u'; + *outp++ = Py_hexdigits[(c>>12)&0xf]; + *outp++ = Py_hexdigits[(c>>8)&0xf]; + } + else + *outp++ = 'x'; + *outp++ = Py_hexdigits[(c>>4)&0xf]; + *outp++ = Py_hexdigits[c&0xf]; + } + + assert(_PyUnicode_CheckConsistency(res, 1)); + restuple = Py_BuildValue("(Nn)", res, end); + Py_DECREF(object); + return restuple; + } + else { + wrong_exception_type(exc); + return NULL; + } +} + #define ENC_UTF8 0 #define ENC_UTF16BE 1 #define ENC_UTF16LE 2 @@ -1075,6 +1166,11 @@ return PyCodec_BackslashReplaceErrors(exc); } +static PyObject *namereplace_errors(PyObject *self, PyObject *exc) +{ + return PyCodec_NameReplaceErrors(exc); +} + static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc) { return PyCodec_SurrogatePassErrors(exc); @@ -1145,6 +1241,17 @@ } }, { + "namereplace", + { + "namereplace_errors", + namereplace_errors, + METH_O, + PyDoc_STR("Implements the 'namereplace' error handling, " + "which replaces an unencodable character with a " + "\\N{...} escape sequence.") + } + }, + { "surrogatepass", { "surrogatepass",