Index: Include/codecs.h =================================================================== RCS file: /cvsroot/python/python/dist/src/Include/codecs.h,v retrieving revision 2.3 diff -u -r2.3 codecs.h --- Include/codecs.h 3 Aug 2000 16:24:24 -0000 2.3 +++ Include/codecs.h 29 May 2002 20:43:58 -0000 @@ -117,7 +117,117 @@ const char *errors ); +/* Unicode encoding error handling callback registry API */ + +/* Register the encoding error handling callback function error under + the name name this function will be called by the encoder when it + encounters an unencodable character, name is specified as the + error parameter in the call to the encode function. + Return 0 on success, -1 on error */ +extern DL_IMPORT(int) PyCodec_RegisterUnicodeEncodeErrorHandler( + const char *name, + PyObject *error + ); + +/* Lookup the error handling callback function registered under the + name error if error is a string or unicode object. As a special + case NULL can be passed, in which case the error handling callback + for strict encoding will be returned. 
*/ +extern DL_IMPORT(PyObject *) PyCodec_LookupUnicodeEncodeErrorHandler( + const char *name + ); + +/* Raises a Unicode exception */ +extern DL_IMPORT(void) PyCodec_RaiseUnicodeEncodeError( + const char *encoding, + const Py_UNICODE *str, + int startpos, + int endpos, + const char *reason + ); + +/* Encode error handler that raises an exception */ +extern DL_IMPORT(PyObject *) PyCodec_RaiseUnicodeEncodeErrors( + PyObject *self, + PyObject *args + ); + +/* Encode error handler that returns an empty string and so + ignores the unencodable characters */ +extern DL_IMPORT(PyObject *) PyCodec_IgnoreUnicodeEncodeErrors( + PyObject *self, + PyObject *args + ); + +/* Encode error handler that returns question marks for each + unencodable character */ +extern DL_IMPORT(PyObject *) PyCodec_ReplaceUnicodeEncodeErrors( + PyObject *self, + PyObject *args + ); + +/* Encode error handler that returns XML character references + for the unencodable characters */ +extern DL_IMPORT(PyObject *) PyCodec_XMLCharRefReplaceUnicodeEncodeErrors( + PyObject *self, + PyObject *args + ); + +/* Encode error handler that returns an \x (or \u or \U) escape sequence + for each unencodable character */ +extern DL_IMPORT(PyObject *) PyCodec_BackslashReplaceUnicodeEncodeErrors( + PyObject *self, + PyObject *args + ); + + +/* Unicode decoding error handling callback registry API */ + +/* Register the decoding error handling callback function error under + the name name. Return 0 on success, -1 on error */ +extern DL_IMPORT(int) PyCodec_RegisterUnicodeDecodeErrorHandler( + const char *name, + PyObject *error + ); + +/* Lookup the decoding error handling callback function registered + under the name name. As a special case NULL can be passed, in which + case the error handling callback for strict decoding will be returned.
*/ +extern DL_IMPORT(PyObject *) PyCodec_LookupUnicodeDecodeErrorHandler( + const char *name + ); + +/* Raises a Unicode exception */ +extern DL_IMPORT(void) PyCodec_RaiseUnicodeDecodeError( + const char *encoding, + const char *str, + int startpos, + int endpos, + const char *reason + ); + +/* Decode error handler that raises an exception */ +extern DL_IMPORT(PyObject *) PyCodec_RaiseUnicodeDecodeErrors( + PyObject *self, + PyObject *args + ); + +/* Decode error handler that returns an empty string and so + ignores the undecodable bytes (probably resulting in + more errors from the next bytes) */ +extern DL_IMPORT(PyObject *) PyCodec_IgnoreUnicodeDecodeErrors( + PyObject *self, + PyObject *args + ); + +/* Decode error handler that returns "?" as a replacement for + the undecodable bytes. */ +extern DL_IMPORT(PyObject *) PyCodec_ReplaceUnicodeDecodeErrors( + PyObject *self, + PyObject *args + ); + #ifdef __cplusplus } #endif -#endif /* !Py_CODECREGISTRY_H */ +#endif Index: Lib/codecs.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/codecs.py,v retrieving revision 1.24 diff -u -r1.24 codecs.py --- Lib/codecs.py 5 Mar 2002 15:46:38 -0000 1.24 +++ Lib/codecs.py 29 May 2002 20:43:58 -0000 @@ -18,7 +18,13 @@ 'Failed to load the builtin codecs: %s' % why __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE", - "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE"] + "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", + "raise_unicodeencode_errors", "ignore_unicodeencode_errors", + "replace_unicodeencode_errors", "xmlcharrefreplace_unicodeencode_errors", + "backslashreplace_unicodeencode_errors", "raise_unicodedecode_errors", + "ignore_unicodedecode_errors", "replace_unicodedecode_errors", + "register_unicodeencodeerrorhandler", "lookup_unicodeencodeerrorhandler", + "register_unicodedecodeerrorhandler", "lookup_unicodedecodeerrorhandler" ] ### Constants Index: 
Modules/_codecsmodule.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Modules/_codecsmodule.c,v retrieving revision 2.11 diff -u -r2.11 _codecsmodule.c --- Modules/_codecsmodule.c 17 Jan 2002 23:15:58 -0000 2.11 +++ Modules/_codecsmodule.c 29 May 2002 20:44:03 -0000 @@ -664,6 +664,56 @@ #endif /* MS_WIN32 */ #endif /* Py_USING_UNICODE */ +/* --- Error handler registry --------------------------------------------- */ + +static PyObject *register_unicodeencodeerrorhandler(PyObject *self, PyObject *args) +{ + const char *name; + PyObject *handler; + + if (!PyArg_ParseTuple(args, "sO:register_unicodeencodeerrorhandler", + &name, &handler)) + return NULL; + if (PyCodec_RegisterUnicodeEncodeErrorHandler(name, handler)) + return NULL; + Py_INCREF(Py_None); + return Py_None; +} + +static PyObject *lookup_unicodeencodeerrorhandler(PyObject *self, PyObject *args) +{ + const char *name; + + if (!PyArg_ParseTuple(args, "s:lookup_unicodeencodeerrorhandler", + &name)) + return NULL; + return PyCodec_LookupUnicodeEncodeErrorHandler(name); +} + +static PyObject *register_unicodedecodeerrorhandler(PyObject *self, PyObject *args) +{ + const char *name; + PyObject *handler; + + if (!PyArg_ParseTuple(args, "sO:register_unicodedecodeerrorhandler", + &name, &handler)) + return NULL; + if (PyCodec_RegisterUnicodeDecodeErrorHandler(name, handler)) + return NULL; + Py_INCREF(Py_None); + return Py_None; +} + +static PyObject *lookup_unicodedecodeerrorhandler(PyObject *self, PyObject *args) +{ + const char *name; + + if (!PyArg_ParseTuple(args, "s:lookup_unicodedecodeerrorhandler", + &name)) + return NULL; + return PyCodec_LookupUnicodeDecodeErrorHandler(name); +} + /* --- Module API --------------------------------------------------------- */ static PyMethodDef _codecs_functions[] = { @@ -699,7 +749,31 @@ {"mbcs_encode", mbcs_encode, METH_VARARGS}, {"mbcs_decode", mbcs_decode, METH_VARARGS}, #endif + 
{"raise_unicodeencode_errors", + PyCodec_RaiseUnicodeEncodeErrors, METH_VARARGS}, + {"ignore_unicodeencode_errors", + PyCodec_IgnoreUnicodeEncodeErrors, METH_VARARGS}, + {"replace_unicodeencode_errors", + PyCodec_ReplaceUnicodeEncodeErrors, METH_VARARGS}, + {"xmlcharrefreplace_unicodeencode_errors", + PyCodec_XMLCharRefReplaceUnicodeEncodeErrors, METH_VARARGS}, + {"backslashreplace_unicodeencode_errors", + PyCodec_BackslashReplaceUnicodeEncodeErrors, METH_VARARGS}, + {"raise_unicodedecode_errors", + PyCodec_RaiseUnicodeDecodeErrors, METH_VARARGS}, + {"ignore_unicodedecode_errors", + PyCodec_IgnoreUnicodeDecodeErrors, METH_VARARGS}, + {"replace_unicodedecode_errors", + PyCodec_ReplaceUnicodeDecodeErrors, METH_VARARGS}, #endif /* Py_USING_UNICODE */ + {"register_unicodeencodeerrorhandler", + register_unicodeencodeerrorhandler, METH_VARARGS}, + {"lookup_unicodeencodeerrorhandler", + lookup_unicodeencodeerrorhandler, METH_VARARGS}, + {"register_unicodedecodeerrorhandler", + register_unicodedecodeerrorhandler, METH_VARARGS}, + {"lookup_unicodedecodeerrorhandler", + lookup_unicodedecodeerrorhandler, METH_VARARGS}, {NULL, NULL} /* sentinel */ }; Index: Objects/stringobject.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Objects/stringobject.c,v retrieving revision 2.164 diff -u -r2.164 stringobject.c --- Objects/stringobject.c 24 May 2002 19:01:58 -0000 2.164 +++ Objects/stringobject.c 29 May 2002 20:44:04 -0000 @@ -2211,7 +2211,9 @@ Encodes S using the codec registered for encoding. encoding defaults\n\ to the default encoding. errors may be given to set a different error\n\ handling scheme. Default is 'strict' meaning that encoding errors raise\n\ -a ValueError. Other possible values are 'ignore' and 'replace'."; +a ValueError. 
Other possible values are 'ignore', 'replace' and\n\ +'xmlcharrefreplace' as well as any other name registered with\n\ +codecs.register_unicodeencodeerrorhandler."; static PyObject * string_encode(PyStringObject *self, PyObject *args) @@ -2230,7 +2232,9 @@ Decodes S using the codec registered for encoding. encoding defaults\n\ to the default encoding. errors may be given to set a different error\n\ handling scheme. Default is 'strict' meaning that encoding errors raise\n\ -a ValueError. Other possible values are 'ignore' and 'replace'."; +a ValueError. Other possible values are 'ignore' and 'replace' as well\n\ +as any other name registerd with\n\ +codecs.register_unicodedecodeerrorhandler."; static PyObject * string_decode(PyStringObject *self, PyObject *args) Index: Objects/unicodeobject.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Objects/unicodeobject.c,v retrieving revision 2.149 diff -u -r2.149 unicodeobject.c --- Objects/unicodeobject.c 24 May 2002 19:01:59 -0000 2.149 +++ Objects/unicodeobject.c 29 May 2002 20:44:05 -0000 @@ -489,8 +489,8 @@ const char *errors) { PyObject *buffer = NULL, *unicode; - - if (encoding == NULL) + + if (encoding == NULL) encoding = PyUnicode_GetDefaultEncoding(); /* Shortcuts for common default encodings */ @@ -641,6 +641,89 @@ return -1; } +/* error handling callback helper: + build arguments, call the callback and check the arguments, + if no exception occurred, copy the replacement to the output + and adjust various state variables.
+ return 0 on success, -1 on error +*/ + +static +int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, + const char *encoding, const char *reason, + const char *input, int insize, int *startinpos, int *endinpos, PyObject **inputObject, const char **inptr, + PyObject **output, int *outpos, Py_UNICODE **outptr) +{ + static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple"; + + PyObject *args; + PyObject *restuple = NULL; + PyObject *repunicode = NULL; + int outsize = PyUnicode_GET_SIZE(*output); + int requiredsize; + int newpos; + Py_UNICODE *repptr; + int repsize; + int res = -1; + + if (*errorHandler == NULL) { + *errorHandler = PyCodec_LookupUnicodeDecodeErrorHandler(errors); + if (*errorHandler == NULL) + goto onError; + } + + if (*inputObject == NULL) { + *inputObject = PyString_FromStringAndSize(input, insize); + if (*inputObject == NULL) + goto onError; + } + + /* we don't need a state => use None */ + args = Py_BuildValue("sOiisO", encoding, *inputObject, *startinpos, *endinpos, reason, Py_None); + if (args == NULL) + goto onError; + restuple = PyEval_CallObject(*errorHandler, args); + Py_DECREF(args); + if (restuple == NULL) + goto onError; + if (!PyTuple_Check(restuple)) { + PyErr_Format(PyExc_TypeError, &argparse[4]); + goto onError; + } + if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) + goto onError; + if (newpos<0) + newpos = 0; + else if (newpos>insize) + newpos = insize; + + /* need more space? 
(at least enough for what we + have+the replacement+the rest of the string (starting + at the new input position), so we won't have to check space + when there are no errors in the rest of the string) */ + repptr = PyUnicode_AS_UNICODE(repunicode); + repsize = PyUnicode_GET_SIZE(repunicode); + requiredsize = *outpos + repsize + insize-newpos; + if (requiredsize > outsize) { + if (requiredsize<2*outsize) + requiredsize = 2*outsize; + if (PyUnicode_Resize(output, requiredsize)) + goto onError; + *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; + } + *endinpos = newpos; + *inptr = input + newpos; + Py_UNICODE_COPY(*outptr, repptr, repsize); + *outptr += repsize; + *outpos += repsize; + /* we made it! */ + res = 0; + + onError: + Py_XDECREF(restuple); + return res; +} + /* --- UTF-7 Codec -------------------------------------------------------- */ /* see RFC2152 for details */ @@ -699,40 +782,14 @@ } \ } \ -static -int utf7_decoding_error(Py_UNICODE **dest, - const char *errors, - const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "UTF-7 decoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - if (dest != NULL) { - **dest = Py_UNICODE_REPLACEMENT_CHARACTER; - (*dest)++; - } - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "UTF-7 decoding error; unknown error handling code: %.400s", - errors); - return -1; - } -} - PyObject *PyUnicode_DecodeUTF7(const char *s, int size, const char *errors) { + const char *starts = s; + int startinpos; + int endinpos; + int outpos; const char *e; PyUnicodeObject *unicode; Py_UNICODE *p; @@ -740,7 +797,9 @@ int inShift = 0; unsigned int bitsleft = 0; unsigned long charsleft = 0; - int surrogate = 0; + int surrogate = 0; + PyObject *errorHandler = NULL; + PyObject *inputObject = NULL; unicode = _PyUnicode_New(size); if (!unicode) @@ -752,7 +811,9 @@ e = s 
+ size; while (s < e) { - Py_UNICODE ch = *s; + Py_UNICODE ch; + restart: + ch = *s; if (inShift) { if ((ch == '-') || !B64CHAR(ch)) { @@ -797,6 +858,7 @@ } } else if ( ch == '+' ) { + startinpos = s-starts; s++; if (s < e && *s == '-') { s++; @@ -818,21 +880,39 @@ } continue; utf7Error: - if (utf7_decoding_error(&p, errors, errmsg)) - goto onError; + outpos = p-PyUnicode_AS_UNICODE(unicode); + endinpos = s-starts; + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "utf7", errmsg, + starts, size, &startinpos, &endinpos, &inputObject, &s, + (PyObject **)&unicode, &outpos, &p)) + goto onError; } if (inShift) { - if (utf7_decoding_error(&p, errors, "unterminated shift sequence")) + outpos = p-PyUnicode_AS_UNICODE(unicode); + endinpos = size; + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "utf7", "unterminated shift sequence", + starts, size, &startinpos, &endinpos, &inputObject, &s, + (PyObject **)&unicode, &outpos, &p)) goto onError; + if (s < e) + goto restart; } - if (_PyUnicode_Resize(&unicode, p - unicode->str)) + if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode))) goto onError; + Py_XDECREF(errorHandler); + Py_XDECREF(inputObject); return (PyObject *)unicode; onError: + Py_XDECREF(errorHandler); + Py_XDECREF(inputObject); Py_DECREF(unicode); return NULL; } @@ -962,46 +1042,21 @@ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 }; -static -int utf8_decoding_error(const char **source, - Py_UNICODE **dest, - const char *errors, - const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "UTF-8 decoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - (*source)++; - return 0; - } - else if (strcmp(errors,"replace") == 0) { - (*source)++; - **dest = Py_UNICODE_REPLACEMENT_CHARACTER; - (*dest)++; - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "UTF-8 decoding error; unknown error handling code: %.400s", 
- errors); - return -1; - } -} - PyObject *PyUnicode_DecodeUTF8(const char *s, int size, const char *errors) { + const char *starts = s; int n; + int startinpos; + int endinpos; + int outpos; const char *e; PyUnicodeObject *unicode; Py_UNICODE *p; const char *errmsg = ""; + PyObject *errorHandler = NULL; + PyObject *inputObject = NULL; /* Note: size will always be longer than the resulting Unicode character count */ @@ -1028,6 +1083,8 @@ if (s + n > e) { errmsg = "unexpected end of data"; + startinpos = s-starts; + endinpos = size; goto utf8Error; } @@ -1035,19 +1092,27 @@ case 0: errmsg = "unexpected code byte"; + startinpos = s-starts; + endinpos = startinpos+1; goto utf8Error; case 1: errmsg = "internal error"; + startinpos = s-starts; + endinpos = startinpos+1; goto utf8Error; case 2: if ((s[1] & 0xc0) != 0x80) { errmsg = "invalid data"; + startinpos = s-starts; + endinpos = startinpos+2; goto utf8Error; } ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); if (ch < 0x80) { + startinpos = s-starts; + endinpos = startinpos+2; errmsg = "illegal encoding"; goto utf8Error; } @@ -1059,6 +1124,8 @@ if ((s[1] & 0xc0) != 0x80 || (s[2] & 0xc0) != 0x80) { errmsg = "invalid data"; + startinpos = s-starts; + endinpos = startinpos+3; goto utf8Error; } ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); @@ -1071,6 +1138,8 @@ unit. 
*/ errmsg = "illegal encoding"; + startinpos = s-starts; + endinpos = startinpos+3; goto utf8Error; } else @@ -1082,6 +1151,8 @@ (s[2] & 0xc0) != 0x80 || (s[3] & 0xc0) != 0x80) { errmsg = "invalid data"; + startinpos = s-starts; + endinpos = startinpos+4; goto utf8Error; } ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + @@ -1093,6 +1164,8 @@ UTF-16 */ { errmsg = "illegal encoding"; + startinpos = s-starts; + endinpos = startinpos+4; goto utf8Error; } #ifdef Py_UNICODE_WIDE @@ -1114,23 +1187,34 @@ default: /* Other sizes are only needed for UCS-4 */ errmsg = "unsupported Unicode code range"; + startinpos = s-starts; + endinpos = startinpos+n; goto utf8Error; } s += n; continue; utf8Error: - if (utf8_decoding_error(&s, &p, errors, errmsg)) - goto onError; + outpos = p-PyUnicode_AS_UNICODE(unicode); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "utf8", errmsg, + starts, size, &startinpos, &endinpos, &inputObject, &s, + (PyObject **)&unicode, &outpos, &p)) + goto onError; } /* Adjust length */ if (_PyUnicode_Resize(&unicode, p - unicode->str)) goto onError; + Py_XDECREF(errorHandler); + Py_XDECREF(inputObject); return (PyObject *)unicode; onError: + Py_XDECREF(errorHandler); + Py_XDECREF(inputObject); Py_DECREF(unicode); return NULL; } @@ -1248,43 +1332,16 @@ /* --- UTF-16 Codec ------------------------------------------------------- */ -static -int utf16_decoding_error(Py_UNICODE **dest, - const char *errors, - const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "UTF-16 decoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - if (dest) { - **dest = Py_UNICODE_REPLACEMENT_CHARACTER; - (*dest)++; - } - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "UTF-16 decoding error; " - "unknown error handling code: %.400s", - errors); - return -1; - } -} - PyObject * 
PyUnicode_DecodeUTF16(const char *s, int size, const char *errors, int *byteorder) { + const char *starts = s; + int startinpos; + int endinpos; + int outpos; PyUnicodeObject *unicode; Py_UNICODE *p; const unsigned char *q, *e; @@ -1296,13 +1353,8 @@ #else int ihi = 0, ilo = 1; #endif - - /* size should be an even number */ - if (size & 1) { - if (utf16_decoding_error(NULL, errors, "truncated data")) - return NULL; - --size; /* else ignore the oddball byte */ - } + PyObject *errorHandler = NULL; + PyObject *inputObject = NULL; /* Note: size will always be longer than the resulting Unicode character count */ @@ -1359,7 +1411,18 @@ } while (q < e) { - Py_UNICODE ch = (q[ihi] << 8) | q[ilo]; + Py_UNICODE ch; + /* remaining bytes at the end? (size should be even) */ + if (e-q<2) { + errmsg = "truncated data"; + startinpos = ((const char *)q)-starts; + endinpos = ((const char *)e)-starts; + goto utf16Error; + /* The remaining input chars are ignored if the callback + chooses to skip the input */ + } + ch = (q[ihi] << 8) | q[ilo]; + q += 2; if (ch < 0xD800 || ch > 0xDFFF) { @@ -1370,6 +1433,8 @@ /* UTF-16 code pair: */ if (q >= e) { errmsg = "unexpected end of data"; + startinpos = (((const char *)q)-2)-starts; + endinpos = ((const char *)e)-starts; goto utf16Error; } if (0xD800 <= ch && ch <= 0xDBFF) { @@ -1386,15 +1451,24 @@ } else { errmsg = "illegal UTF-16 surrogate"; + startinpos = (((const char *)q)-4)-starts; + endinpos = startinpos+2; goto utf16Error; } } errmsg = "illegal encoding"; + startinpos = (((const char *)q)-2)-starts; + endinpos = startinpos+2; /* Fall through to report the error */ utf16Error: - if (utf16_decoding_error(&p, errors, errmsg)) + outpos = p-PyUnicode_AS_UNICODE(unicode); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "utf16", errmsg, + starts, size, &startinpos, &endinpos, &inputObject, (const char **)&q, + (PyObject **)&unicode, &outpos, &p)) goto onError; } @@ -1405,10 +1479,14 @@ if (_PyUnicode_Resize(&unicode, p - 
unicode->str)) goto onError; + Py_XDECREF(errorHandler); + Py_XDECREF(inputObject); return (PyObject *)unicode; onError: Py_DECREF(unicode); + Py_XDECREF(errorHandler); + Py_XDECREF(inputObject); return NULL; } @@ -1489,70 +1567,50 @@ /* --- Unicode Escape Codec ----------------------------------------------- */ -static -int unicodeescape_decoding_error(Py_UNICODE **x, - const char *errors, - const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "Unicode-Escape decoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - **x = Py_UNICODE_REPLACEMENT_CHARACTER; - (*x)++; - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "Unicode-Escape decoding error; " - "unknown error handling code: %.400s", - errors); - return -1; - } -} - static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, int size, const char *errors) { + const char *starts = s; + int startinpos; + int endinpos; + int outpos; + int i; PyUnicodeObject *v; - Py_UNICODE *p, *buf; + Py_UNICODE *p; const char *end; char* message; Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ + PyObject *errorHandler = NULL; + PyObject *inputObject = NULL; /* Escaped strings will always be longer than the resulting Unicode string, so we start with size here and then reduce the - length after conversion to the true value. */ + length after conversion to the true value. 
+ (but if the error callback returns a long replacement string + we'll have to allocate more space) */ v = _PyUnicode_New(size); if (v == NULL) goto onError; if (size == 0) return (PyObject *)v; - p = buf = PyUnicode_AS_UNICODE(v); + p = PyUnicode_AS_UNICODE(v); end = s + size; while (s < end) { unsigned char c; Py_UNICODE x; - int i, digits; + int digits; /* Non-escape characters are interpreted as Unicode ordinals */ if (*s != '\\') { *p++ = (unsigned char) *s++; continue; } - + startinpos = s-starts; /* \ - Escapes */ s++; switch (*s++) { @@ -1601,14 +1659,28 @@ message = "truncated \\UXXXXXXXX escape"; hexescape: chr = 0; - for (i = 0; i < digits; i++) { + outpos = p-PyUnicode_AS_UNICODE(v); + if (s+digits>end) { + endinpos = size; + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "unicodeescape", "end of string in escape sequence", + starts, size, &startinpos, &endinpos, &inputObject, &s, + (PyObject **)&v, &outpos, &p)) + goto onError; + goto nextByte; + } + for (i = 0; i < digits; ++i) { c = (unsigned char) s[i]; if (!isxdigit(c)) { - if (unicodeescape_decoding_error(&p, errors, message)) + endinpos = (s+i+1)-starts; + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "unicodeescape", message, + starts, size, &startinpos, &endinpos, &inputObject, &s, + (PyObject **)&v, &outpos, &p)) goto onError; - chr = 0xffffffff; - i++; - break; + goto nextByte; } chr = (chr<<4) & ~0xF; if (c >= '0' && c <= '9') @@ -1620,9 +1692,9 @@ } s += i; if (chr == 0xffffffff) - /* _decoding_error will have already written into the - target buffer. */ - break; + /* _decoding_error will have already written into the + target buffer. 
*/ + break; store: /* when we get here, chr is a 32-bit unicode character */ if (chr <= 0xffff) @@ -1639,10 +1711,13 @@ *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); #endif } else { - if (unicodeescape_decoding_error( - &p, errors, - "illegal Unicode character") - ) + endinpos = s-starts; + outpos = p-PyUnicode_AS_UNICODE(v); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "unicodeescape", "illegal Unicode character", + starts, size, &startinpos, &endinpos, &inputObject, &s, + (PyObject **)&v, &outpos, &p)) goto onError; } break; @@ -1678,13 +1753,28 @@ goto store; } } - if (unicodeescape_decoding_error(&p, errors, message)) + /* s--; */ + endinpos = s-starts; + outpos = p-PyUnicode_AS_UNICODE(v); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "unicodeescape", message, + starts, size, &startinpos, &endinpos, &inputObject, &s, + (PyObject **)&v, &outpos, &p)) goto onError; break; default: if (s > end) { - if (unicodeescape_decoding_error(&p, errors, "\\ at end of string")) + message = "\\ at end of string"; + s--; + endinpos = s-starts; + outpos = p-PyUnicode_AS_UNICODE(v); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "unicodeescape", message, + starts, size, &startinpos, &endinpos, &inputObject, &s, + (PyObject **)&v, &outpos, &p)) goto onError; } else { @@ -1693,9 +1783,11 @@ } break; } + nextByte: + ; } - if (_PyUnicode_Resize(&v, (int)(p - buf))) - goto onError; + if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) + goto onError; return (PyObject *)v; ucnhashError: @@ -1703,10 +1795,14 @@ PyExc_UnicodeError, "\\N escapes not supported (can't load unicodedata module)" ); + Py_XDECREF(errorHandler); + Py_XDECREF(inputObject); return NULL; onError: Py_XDECREF(v); + Py_XDECREF(errorHandler); + Py_XDECREF(inputObject); return NULL; } @@ -1870,20 +1966,27 @@ int size, const char *errors) { + const char *starts = s; + int startinpos; + int endinpos; + int outpos; PyUnicodeObject *v; - Py_UNICODE 
*p, *buf; + Py_UNICODE *p; const char *end; const char *bs; + PyObject *errorHandler = NULL; + PyObject *inputObject = NULL; /* Escaped strings will always be longer than the resulting Unicode string, so we start with size here and then reduce the - length after conversion to the true value. */ + length after conversion to the true value. (But decoding error + handler might have to resize the string) */ v = _PyUnicode_New(size); if (v == NULL) goto onError; if (size == 0) return (PyObject *)v; - p = buf = PyUnicode_AS_UNICODE(v); + p = PyUnicode_AS_UNICODE(v); end = s + size; while (s < end) { unsigned char c; @@ -1895,6 +1998,7 @@ *p++ = (unsigned char)*s++; continue; } + startinpos = s-starts; /* \u-escapes are only interpreted iff the number of leading backslashes if odd */ @@ -1913,15 +2017,18 @@ s++; /* \uXXXX with 4 hex digits */ - for (x = 0, i = 0; i < 4; i++) { - c = (unsigned char)s[i]; + outpos = p-PyUnicode_AS_UNICODE(v); + for (x = 0, i = 0; i < 4; ++i, ++s) { + c = (unsigned char)*s; if (!isxdigit(c)) { - if (unicodeescape_decoding_error(&p, errors, - "truncated \\uXXXX")) + endinpos = s-starts; + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "rawunicodeescape", "truncated \\uXXXX", + starts, size, &startinpos, &endinpos, &inputObject, &s, + (PyObject **)&v, &outpos, &p)) goto onError; - x = 0xffffffff; - i++; - break; + goto nextByte; } x = (x<<4) & ~0xF; if (c >= '0' && c <= '9') @@ -1931,16 +2038,20 @@ else x += 10 + c - 'A'; } - s += i; - if (x != 0xffffffff) - *p++ = x; + *p++ = x; + nextByte: + ; } - if (_PyUnicode_Resize(&v, (int)(p - buf))) + if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) goto onError; + Py_XDECREF(errorHandler); + Py_XDECREF(inputObject); return (PyObject *)v; onError: Py_XDECREF(v); + Py_XDECREF(errorHandler); + Py_XDECREF(inputObject); return NULL; } @@ -2020,71 +2131,237 @@ return NULL; } -static -int latin1_encoding_error(const Py_UNICODE **source, - char **dest, - const char *errors, - 
const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "Latin-1 encoding error: %.400s", - details); - return -1; +/* error handling callback helper: + build arguments, call the callback and check the arguments, + put the result into newpos and return the replacement string, which + has to be freed by the caller +*/ + +static PyObject *unicode_encode_call_errorhandler(const char *errors, PyObject **errorHandler, + const char *encoding, const char *reason, + const Py_UNICODE *unicode, int size, PyObject **unicodeObject, int startpos, int endpos, + int *newpos) +{ + static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple"; + + PyObject *args; + PyObject *restuple; + PyObject *resunicode; + + if (*errorHandler == NULL) { + *errorHandler = PyCodec_LookupUnicodeEncodeErrorHandler(errors); + if (*errorHandler == NULL) + return NULL; } - else if (strcmp(errors,"ignore") == 0) { - return 0; + + if (*unicodeObject == NULL) { + *unicodeObject = PyUnicode_FromUnicode(unicode, size); + if (*unicodeObject == NULL) + return NULL; } - else if (strcmp(errors,"replace") == 0) { - **dest = '?'; - (*dest)++; - return 0; + + /* we don't need a state */ + args = Py_BuildValue("sOiisO", encoding, *unicodeObject, startpos, endpos, reason, Py_None); + if (args == NULL) + return NULL; + restuple = PyEval_CallObject(*errorHandler, args); + Py_DECREF(args); + if (restuple == NULL) + return NULL; + if (!PyTuple_Check(restuple)) { + PyErr_Format(PyExc_TypeError, &argparse[4]); + Py_DECREF(restuple); + return NULL; } - else { - PyErr_Format(PyExc_ValueError, - "Latin-1 encoding error; " - "unknown error handling code: %.400s", - errors); - return -1; + if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &resunicode, newpos)) { + Py_DECREF(restuple); + return NULL; } + if (*newpos<0) + *newpos = 0; + else if (*newpos>size) + *newpos = size; + Py_INCREF(resunicode); + Py_DECREF(restuple); + 
return resunicode; } -PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, +static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, int size, - const char *errors) + const char *errors, + int limit) { - PyObject *repr; - char *s, *start; - - repr = PyString_FromStringAndSize(NULL, size); - if (repr == NULL) - return NULL; + /* output object */ + PyObject *res; + /* object version of input */ + PyObject *unicodeObject = NULL; + /* pointers to the beginning and end+1 of input */ + const Py_UNICODE *startp = p; + const Py_UNICODE *endp = p + size; + /* pointer to the beginning of the unencodable characters */ + /* const Py_UNICODE *badp = NULL; */ + /* pointer into the output */ + char *str; + /* current output position */ + int respos = 0; + int ressize; + char *encoding = (limit == 256) ? "latin-1" : "ascii"; + char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; + PyObject *errorHandler = NULL; + /* the following variable is used for caching string comparisons + * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ + int known_errorHandler = -1; + + /* allocate enough for a simple encoding without + replacements, if we need more, we'll resize */ + res = PyString_FromStringAndSize(NULL, size); + if (res == NULL) + goto onError; if (size == 0) - return repr; - - s = PyString_AS_STRING(repr); - start = s; - while (size-- > 0) { - Py_UNICODE ch = *p++; - if (ch >= 256) { - if (latin1_encoding_error(&p, &s, errors, - "ordinal not in range(256)")) - goto onError; + return res; + str = PyString_AS_STRING(res); + ressize = size; + + while (p=limit)) + ++collend; + /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ + if (known_errorHandler==-1) { + if ((errors==NULL) || (!strcmp(errors, "strict"))) + known_errorHandler = 1; + else if (!strcmp(errors, "replace")) + known_errorHandler = 2; + else if (!strcmp(errors, "ignore")) + known_errorHandler = 3; + else if (!strcmp(errors, "xmlcharrefreplace")) + known_errorHandler = 4; + else + known_errorHandler = 0; + } + switch (known_errorHandler) { + case 1: /* strict */ + PyCodec_RaiseUnicodeEncodeError(encoding, startp, collstart-startp, collend-startp, reason); + goto onError; + case 2: /* replace */ + while (collstart++ ressize) { + if (requiredsize<2*ressize) + requiredsize = 2*ressize; + if (_PyString_Resize(&res, requiredsize)) + goto onError; + str = PyString_AS_STRING(res) + respos; + ressize = requiredsize; + } + /* generate replacement (temporarily (mis)uses p) */ + for (p = collstart; p < collend; ++p) { + str += sprintf(str, "&#%d;", (int)*p); + } + p = collend; + break; + default: + repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, + encoding, reason, startp, size, &unicodeObject, collstart-startp, collend-startp, &newpos); + if (repunicode == NULL) + goto onError; + /* need more space? 
(at least enough for what we + have+the replacement+the rest of the string, so + we won't have to check space for encodable characters) */ + respos = str-PyString_AS_STRING(res); + repsize = PyUnicode_GET_SIZE(repunicode); + requiredsize = respos+repsize+(endp-collend); + if (requiredsize > ressize) { + if (requiredsize<2*ressize) + requiredsize = 2*ressize; + if (_PyString_Resize(&res, requiredsize)) { + Py_DECREF(repunicode); + goto onError; + } + str = PyString_AS_STRING(res) + respos; + ressize = requiredsize; + } + /* check if there is anything unencodable in the replacement + and copy it to the output */ + for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { + c = *uni2; + if (c >= limit) { + PyCodec_RaiseUnicodeEncodeError(encoding, startp, unicodepos, unicodepos+1, reason); + Py_DECREF(repunicode); + goto onError; + } + *str = (char)c; + } + p = startp + newpos; + Py_DECREF(repunicode); + } } - else - *s++ = (char)ch; } - /* Resize if error handling skipped some characters */ - if (s - start < PyString_GET_SIZE(repr)) - _PyString_Resize(&repr, s - start); - return repr; + /* Resize if we allocated to much */ + respos = str-PyString_AS_STRING(res); + if (respos 0) { - register unsigned char c; - - c = (unsigned char)*s++; - if (c < 128) + e = s + size; + while (s < e) { + register unsigned char c = (unsigned char)*s; + if (c < 128) { *p++ = c; - else if (ascii_decoding_error(&s, &p, errors, - "ordinal not in range(128)")) + ++s; + } + else { + startinpos = s-starts; + endinpos = startinpos + 1; + outpos = p-PyUnicode_AS_UNICODE(v); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "ascii", "ordinal not in range(128)", + starts, size, &startinpos, &endinpos, &inputObject, &s, + (PyObject **)&v, &outpos, &p)) goto onError; + } } if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) goto onError; + Py_XDECREF(errorHandler); + Py_XDECREF(inputObject); return 
(PyObject *)v; onError: Py_XDECREF(v); + Py_XDECREF(errorHandler); + Py_XDECREF(inputObject); return NULL; } -static -int ascii_encoding_error(const Py_UNICODE **source, - char **dest, - const char *errors, - const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "ASCII encoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - **dest = '?'; - (*dest)++; - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "ASCII encoding error; " - "unknown error handling code: %.400s", - errors); - return -1; - } -} - PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, int size, const char *errors) { - PyObject *repr; - char *s, *start; - - repr = PyString_FromStringAndSize(NULL, size); - if (repr == NULL) - return NULL; - if (size == 0) - return repr; - - s = PyString_AS_STRING(repr); - start = s; - while (size-- > 0) { - Py_UNICODE ch = *p++; - if (ch >= 128) { - if (ascii_encoding_error(&p, &s, errors, - "ordinal not in range(128)")) - goto onError; - } - else - *s++ = (char)ch; - } - /* Resize if error handling skipped some characters */ - if (s - start < PyString_GET_SIZE(repr)) - _PyString_Resize(&repr, s - start); - return repr; - - onError: - Py_DECREF(repr); - return NULL; + return unicode_encode_ucs1(p, size, errors, 128); } PyObject *PyUnicode_AsASCIIString(PyObject *unicode) @@ -2309,44 +2518,21 @@ /* --- Character Mapping Codec -------------------------------------------- */ -static -int charmap_decoding_error(const char **source, - Py_UNICODE **dest, - const char *errors, - const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "charmap decoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - **dest = 
Py_UNICODE_REPLACEMENT_CHARACTER; - (*dest)++; - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "charmap decoding error; " - "unknown error handling code: %.400s", - errors); - return -1; - } -} - PyObject *PyUnicode_DecodeCharmap(const char *s, int size, PyObject *mapping, const char *errors) { + const char *starts = s; + int startinpos; + int endinpos; + int outpos; + const char *e; PyUnicodeObject *v; Py_UNICODE *p; int extrachars = 0; + PyObject *errorHandler = NULL; + PyObject *inputObject = NULL; /* Default to Latin-1 */ if (mapping == NULL) @@ -2358,8 +2544,9 @@ if (size == 0) return (PyObject *)v; p = PyUnicode_AS_UNICODE(v); - while (size-- > 0) { - unsigned char ch = *s++; + e = s + size; + while (s < e) { + unsigned char ch = *s; PyObject *w, *x; /* Get mapping (char ordinal -> integer, Unicode char or None) */ @@ -2391,11 +2578,18 @@ } else if (x == Py_None) { /* undefined mapping */ - if (charmap_decoding_error(&s, &p, errors, - "character maps to ")) { + outpos = p-PyUnicode_AS_UNICODE(v); + startinpos = s-starts; + endinpos = startinpos+1; + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "charmap", "character maps to ", + starts, size, &startinpos, &endinpos, &inputObject, &s, + (PyObject **)&v, &outpos, &p)) { Py_DECREF(x); goto onError; } + continue; } else if (PyUnicode_Check(x)) { int targetsize = PyUnicode_GET_SIZE(x); @@ -2435,45 +2629,229 @@ goto onError; } Py_DECREF(x); + ++s; } if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) goto onError; + Py_XDECREF(errorHandler); + Py_XDECREF(inputObject); return (PyObject *)v; onError: + Py_XDECREF(errorHandler); + Py_XDECREF(inputObject); Py_XDECREF(v); return NULL; } -static -int charmap_encoding_error(const Py_UNICODE **source, - char **dest, - const char *errors, - const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "charmap 
encoding error: %.400s", - details); - return -1; +/* Lookup the character c in the mapping. If the character + can't be found, Py_None is returned (or NULL, if another + error occurred). */ +static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) +{ + PyObject *w = PyInt_FromLong((long)c); + PyObject *x; + + if (w == NULL) + return NULL; + x = PyObject_GetItem(mapping, w); + Py_DECREF(w); + if (x == NULL) { + if (PyErr_ExceptionMatches(PyExc_LookupError)) { + /* No mapping found means: mapping is undefined. */ + PyErr_Clear(); + x = Py_None; + Py_INCREF(x); + return x; + } else + return NULL; } - else if (strcmp(errors,"ignore") == 0) { - return 0; + else if (PyInt_Check(x)) { + long value = PyInt_AS_LONG(x); + if (value < 0 || value > 255) { + PyErr_SetString(PyExc_TypeError, + "character mapping must be in range(256)"); + Py_DECREF(x); + return NULL; + } + return x; } - else if (strcmp(errors,"replace") == 0) { - **dest = '?'; - (*dest)++; - return 0; + else if (PyString_Check(x)) + return x; + else { + /* wrong return value */ + PyErr_SetString(PyExc_TypeError, + "character mapping must return integer, None or str"); + Py_DECREF(x); + return NULL; + } +} + +/* lookup the character, put the result in the output string and adjust + various state variables. Return a new reference to the object that + was put in the output buffer, or Py_None, if the mapping was undefined + (in which case no character was written) or NULL, if a + reallocation error occurred.
The called must decref the result */ +static +PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping, + PyObject **outobj, int *outpos) +{ + char *outstart; + int outsize; + int requiredsize; + const char *repchars; + char repchar; + int repsize; + PyObject *rep = charmapencode_lookup(c, mapping); + + if (rep==NULL) + return NULL; + else if (rep==Py_None) + return rep; + else if (PyInt_Check(rep)) { + repchar = (char)PyInt_AS_LONG(rep); + repchars = &repchar; + repsize = 1; } else { - PyErr_Format(PyExc_ValueError, - "charmap encoding error; " - "unknown error handling code: %.400s", - errors); - return -1; + repsize = PyString_GET_SIZE(rep); + repchars = PyString_AS_STRING(rep); + } + outstart = PyString_AS_STRING(*outobj); + outsize = PyString_GET_SIZE(*outobj); + requiredsize = *outpos+repsize; + + if (outsize0; ++uni2) { + x = charmapencode_output(*uni2, mapping, res, respos); + if (x==NULL) { + Py_DECREF(repunicode); + return -1; + } + else if (x==Py_None) { + Py_DECREF(repunicode); + Py_DECREF(x); + PyCodec_RaiseUnicodeEncodeError(encoding, p, + collstartpos, collendpos, reason); + return -1; + } + Py_DECREF(x); + } + *inpos = newpos; + Py_DECREF(repunicode); } + return 0; } PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, @@ -2481,101 +2859,63 @@ PyObject *mapping, const char *errors) { - PyObject *v; - char *s; - int extrachars = 0; + /* output object */ + PyObject *res = NULL; + /* object version of input */ + PyObject *unicodeObject = NULL; + /* current input position */ + int inpos = 0; + /* current output position */ + int respos = 0; + PyObject *errorHandler = NULL; + /* the following variable is used for caching string comparisons + * -1=not initialized, 0=unknown, 1=strict, 2=replace, + * 3=ignore, 4=xmlcharrefreplace */ + int known_errorHandler = -1; /* Default to Latin-1 */ if (mapping == NULL) return PyUnicode_EncodeLatin1(p, size, errors); - v = PyString_FromStringAndSize(NULL, size); - if (v == NULL) - return NULL; + /* allocate 
enough for a simple encoding without + replacements, if we need more, we'll resize */ + res = PyString_FromStringAndSize(NULL, size); + if (res == NULL) + goto onError; if (size == 0) - return v; - s = PyString_AS_STRING(v); - while (size-- > 0) { - Py_UNICODE ch = *p++; - PyObject *w, *x; + return res; - /* Get mapping (Unicode ordinal -> string char, integer or None) */ - w = PyInt_FromLong((long)ch); - if (w == NULL) + while (inpos adjust input position */ + ++inpos; + Py_DECREF(x); + } - /* Apply mapping */ - if (PyInt_Check(x)) { - long value = PyInt_AS_LONG(x); - if (value < 0 || value > 255) { - PyErr_SetString(PyExc_TypeError, - "character mapping must be in range(256)"); - Py_DECREF(x); - goto onError; - } - *s++ = (char)value; - } - else if (x == Py_None) { - /* undefined mapping */ - if (charmap_encoding_error(&p, &s, errors, - "character maps to ")) { - Py_DECREF(x); - goto onError; - } - } - else if (PyString_Check(x)) { - int targetsize = PyString_GET_SIZE(x); - - if (targetsize == 1) - /* 1-1 mapping */ - *s++ = *PyString_AS_STRING(x); - - else if (targetsize > 1) { - /* 1-n mapping */ - if (targetsize > extrachars) { - /* resize first */ - int oldpos = (int)(s - PyString_AS_STRING(v)); - int needed = (targetsize - extrachars) + \ - (targetsize << 2); - extrachars += needed; - if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) { - Py_DECREF(x); - goto onError; - } - s = PyString_AS_STRING(v) + oldpos; - } - memcpy(s, PyString_AS_STRING(x), targetsize); - s += targetsize; - extrachars -= targetsize; - } - /* 1-0 mapping: skip the character */ - } - else { - /* wrong return value */ - PyErr_SetString(PyExc_TypeError, - "character mapping must return integer, None or unicode"); - Py_DECREF(x); + /* Resize if we allocated to much */ + if (respos max) { + PyErr_Format(PyExc_TypeError, + "character mapping must be in range(0x%lx)", max+1); + Py_DECREF(x); + return -1; + } + *result = x; + return 0; + } + else if (PyUnicode_Check(x)) { + *result = x; 
return 0; } else { - PyErr_Format(PyExc_ValueError, - "translate error; " - "unknown error handling code: %.400s", - errors); + /* wrong return value */ + PyErr_SetString(PyExc_TypeError, + "character mapping must return integer, None or unicode"); return -1; } } +/* ensure that *outobj is at least requiredsize characters long, +if not reallocate and adjust various state variables. +Return 0 on success, -1 on error */ +static +int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize, + int requiredsize) +{ + if (requiredsize > *outsize) { + /* remember old output position */ + int outpos = *outp-PyUnicode_AS_UNICODE(*outobj); + /* exponentially overallocate to minimize reallocations */ + if (requiredsize < 2 * *outsize) + requiredsize = 2 * *outsize; + if (_PyUnicode_Resize(outobj, requiredsize)) + return -1; + *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; + *outsize = requiredsize; + } + return 0; +} +/* lookup the character, put the result in the output string and adjust + various state variables. Return a new reference to the object that + was put in the output buffer in *result, or Py_None, if the mapping was + undefined (in which case no character was written). + The called must decref result. + Return 0 on success, -1 on error. 
*/ +static +int charmaptranslate_output(Py_UNICODE c, PyObject *mapping, + PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res) +{ + if (charmaptranslate_lookup(c, mapping, res)) + return -1; + if (*res==NULL) { + /* not found => default to 1:1 mapping */ + *(*outp)++ = (Py_UNICODE)c; + } + else if (*res==Py_None) + ; + else if (PyInt_Check(*res)) { + /* no overflow check, because we know that the space is enough */ + *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res); + } + else if (PyUnicode_Check(*res)) { + int repsize = PyUnicode_GET_SIZE(*res); + if (repsize==1) { + /* no overflow check, because we know that the space is enough */ + *(*outp)++ = *PyUnicode_AS_UNICODE(*res); + } + else if (repsize!=0) { + /* more than one character */ + int requiredsize = *outsize + repsize - 1; + if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize)) + return -1; + memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); + *outp += repsize; + } + } + else + return -1; + return 0; +} -PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s, +PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, int size, PyObject *mapping, const char *errors) { - PyUnicodeObject *v; - Py_UNICODE *p; - + /* output object */ + PyObject *res = NULL; + /* object version of input */ + PyObject *unicodeObject = NULL; + /* pointers to the beginning and end+1 of input */ + const Py_UNICODE *startp = p; + const Py_UNICODE *endp = p + size; + /* pointer into the output */ + Py_UNICODE *str; + /* current output position */ + int respos = 0; + int ressize; + char *encoding = "charmap"; + char *reason = "character maps to "; + PyObject *errorHandler = NULL; + /* the following variable is used for caching string comparisons + * -1=not initialized, 0=unknown, 1=strict, 2=replace, + * 3=ignore, 4=xmlcharrefreplace */ + int known_errorHandler = -1; + if (mapping == NULL) { PyErr_BadArgument(); return NULL; } - - /* Output will never be longer than input */ - v = 
_PyUnicode_New(size); - if (v == NULL) - goto onError; - if (size == 0) - goto done; - p = PyUnicode_AS_UNICODE(v); - while (size-- > 0) { - Py_UNICODE ch = *s++; - PyObject *w, *x; - /* Get mapping */ - w = PyInt_FromLong(ch); - if (w == NULL) - goto onError; - x = PyObject_GetItem(mapping, w); - Py_DECREF(w); - if (x == NULL) { - if (PyErr_ExceptionMatches(PyExc_LookupError)) { - /* No mapping found: default to 1-1 mapping */ - PyErr_Clear(); - *p++ = ch; - continue; - } + /* allocate enough for a simple 1:1 translation without + replacements, if we need more, we'll resize */ + res = PyUnicode_FromUnicode(NULL, size); + if (res == NULL) + goto onError; + if (size == 0) + return res; + str = PyUnicode_AS_UNICODE(res); + ressize = size; + + while (p")) { - Py_DECREF(x); - goto onError; - } - } - else if (PyUnicode_Check(x)) { - if (PyUnicode_GET_SIZE(x) != 1) { - /* 1-n mapping */ - PyErr_SetString(PyExc_NotImplementedError, - "1-n mappings are currently not implemented"); - Py_DECREF(x); - goto onError; + if (x!=Py_None) /* it worked => adjust input pointer */ + ++p; + else { /* untranslatable character */ + PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ + int repsize; + int newpos; + Py_UNICODE *uni2; + /* startpos for collecting untranslatable chars */ + const Py_UNICODE *collstart = p; + const Py_UNICODE *collend = p+1; + const Py_UNICODE *coll; + + Py_XDECREF(x); + /* find all untranslatable characters */ + while (collend < endp) { + if (charmaptranslate_lookup(*collend, mapping, &x)) + goto onError; + Py_XDECREF(x); + if (x!=Py_None) + break; + ++collend; + } + /* cache callback name lookup + * (if not done yet, i.e. 
it's the first error) */ + if (known_errorHandler==-1) { + if ((errors==NULL) || (!strcmp(errors, "strict"))) + known_errorHandler = 1; + else if (!strcmp(errors, "replace")) + known_errorHandler = 2; + else if (!strcmp(errors, "ignore")) + known_errorHandler = 3; + else if (!strcmp(errors, "xmlcharrefreplace")) + known_errorHandler = 4; + else + known_errorHandler = 0; + } + switch (known_errorHandler) { + case 1: /* strict */ + PyCodec_RaiseUnicodeEncodeError(encoding, startp, collstart-startp, collend-startp, reason); + goto onError; + case 2: /* replace */ + /* No need to check for space, this is a 1:1 replacement */ + for (coll = collstart; coll0; ++uni2) + *str++ = *uni2; + p = startp + newpos; + Py_DECREF(repunicode); } - *p++ = *PyUnicode_AS_UNICODE(x); } - else { - /* wrong return value */ - PyErr_SetString(PyExc_TypeError, - "translate mapping must return integer, None or unicode"); - Py_DECREF(x); - goto onError; - } - Py_DECREF(x); } - if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) - if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) + /* Resize if we allocated to much */ + respos = str-PyUnicode_AS_UNICODE(res); + if (respos= 0) { *output++ = '0' + decimal; + ++p; continue; } if (0 < ch && ch < 256) { *output++ = (char)ch; + ++p; continue; } - /* All other characters are considered invalid */ - if (errors == NULL || strcmp(errors, "strict") == 0) { - PyErr_SetString(PyExc_ValueError, - "invalid decimal Unicode string"); - goto onError; + /* All other characters are considered unencodable */ + collstart = p; + collend = p+1; + while (collend < end) { + if ((0 < *collend && *collend < 256) || + !Py_UNICODE_ISSPACE(*collend) || + Py_UNICODE_TODECIMAL(*collend)) + break; } - else if (strcmp(errors, "ignore") == 0) - continue; - else if (strcmp(errors, "replace") == 0) { - *output++ = '?'; - continue; + /* cache callback name lookup + * (if not done yet, i.e. 
it's the first error) */ + if (known_errorHandler==-1) { + if ((errors==NULL) || (!strcmp(errors, "strict"))) + known_errorHandler = 1; + else if (!strcmp(errors, "replace")) + known_errorHandler = 2; + else if (!strcmp(errors, "ignore")) + known_errorHandler = 3; + else if (!strcmp(errors, "xmlcharrefreplace")) + known_errorHandler = 4; + else + known_errorHandler = 0; + } + switch (known_errorHandler) { + case 1: /* strict */ + PyCodec_RaiseUnicodeEncodeError(encoding, s, collstart-s, collend-s, reason); + goto onError; + case 2: /* replace */ + for (p = collstart; p < collend; ++p) + *output++ = '?'; + /* fall through */ + case 3: /* ignore */ + p = collend; + break; + case 4: /* xmlcharrefreplace */ + /* generate replacement (temporarily (mis)uses p) */ + for (p = collstart; p < collend; ++p) + output += sprintf(output, "&#%d;", (int)*p); + p = collend; + break; + default: + repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, + encoding, reason, s, length, &unicodeObject, + collstart-s, collend-s, &newpos); + if (repunicode == NULL) + goto onError; + /* generate replacement */ + repsize = PyUnicode_GET_SIZE(repunicode); + for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { + Py_UNICODE ch = *uni2; + if (Py_UNICODE_ISSPACE(ch)) + *output++ = ' '; + else { + decimal = Py_UNICODE_TODECIMAL(ch); + if (decimal >= 0) + *output++ = '0' + decimal; + else if (0 < ch && ch < 256) + *output++ = (char)ch; + else { + Py_DECREF(repunicode); + PyCodec_RaiseUnicodeEncodeError(encoding, s, + collstart-s, collend-s, reason); + goto onError; + } + } + } + p = s + newpos; + Py_DECREF(repunicode); } } /* 0-terminate the output string */ *output++ = '\0'; + Py_XDECREF(unicodeObject); + Py_XDECREF(errorHandler); return 0; onError: + Py_XDECREF(unicodeObject); + Py_XDECREF(errorHandler); return -1; } @@ -3865,7 +4428,9 @@ Return an encoded string version of S. Default encoding is the current\n\ default string encoding. 
errors may be given to set a different error\n\ handling scheme. Default is 'strict' meaning that encoding errors raise\n\ -a ValueError. Other possible values are 'ignore' and 'replace'."; +a ValueError. Other possible values are 'ignore', 'replace' and\n\ +'xmlcharrefreplace' as well as any other name registered with\n\ +codecs.register_unicodeencodeerrorhandler."; static PyObject * unicode_encode(PyUnicodeObject *self, PyObject *args) Index: Python/codecs.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Python/codecs.c,v retrieving revision 2.13 diff -u -r2.13 codecs.c --- Python/codecs.c 26 Sep 2000 05:46:01 -0000 2.13 +++ Python/codecs.c 29 May 2002 20:44:05 -0000 @@ -416,12 +416,487 @@ return NULL; } +static PyObject *_PyCodec_UnicodeEncodeErrorHandlerRegistry; + +/* Register the error handling callback function error under the name name; + this function will be called by the encoder when it encounters + an unencodable character and doesn't know the callback name, + as name is specified as the error parameter in the call to the encode function. + Return 0 on success, -1 on error */ +int PyCodec_RegisterUnicodeEncodeErrorHandler(const char *name, PyObject *error) +{ + if (!PyCallable_Check(error)) { + PyErr_SetString(PyExc_TypeError, + "handler must be callable"); + return -1; + } + return PyDict_SetItemString( + _PyCodec_UnicodeEncodeErrorHandlerRegistry, (char *)name, error); +} + +/* Lookup the error handling callback function registered under the name name. + As a special case NULL can be passed which means "strict".
*/ +PyObject *PyCodec_LookupUnicodeEncodeErrorHandler(const char *name) +{ + PyObject *handler = NULL; + + if (name==NULL) + name = "strict"; + handler = PyDict_GetItemString( + _PyCodec_UnicodeEncodeErrorHandlerRegistry, (char *)name); + if (!handler) + PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name); + else + Py_INCREF(handler); + return handler; +} + +void PyCodec_RaiseUnicodeEncodeError( + const char *encoding, const Py_UNICODE *str, int startpos, int endpos, + const char *reason) +{ + if (endpos==startpos+1) { + PyErr_Format(PyExc_UnicodeError, + "'%.400s' codec can't encode character '\\u%x' in position %d: %.400s", + encoding, (int)str[startpos], startpos, reason); + } + else { + PyErr_Format(PyExc_UnicodeError, + "'%.400s' codec can't encode characters in position %d-%d: %.400s", + encoding, startpos, endpos-1, reason); + } +} + +PyObject *PyCodec_RaiseUnicodeEncodeErrors(PyObject *self, PyObject *args) +{ + char *encoding; + Py_UNICODE *unicode; + int size; + int startpos; + int endpos; + const char *reason; + PyObject *state; + + if (PyArg_ParseTuple(args, "su#iisO:raise_unicodeencode_errors", + &encoding, &unicode, &size, &startpos, &endpos, &reason, &state)) + PyCodec_RaiseUnicodeEncodeError(encoding, unicode, startpos, endpos, reason); + return NULL; +} + + +PyObject *PyCodec_IgnoreUnicodeEncodeErrors(PyObject *self, PyObject *args) +{ + PyObject *encoding; + PyObject *unicode; + int startpos; + int endpos; + PyObject *reason; + PyObject *state; + + if (!PyArg_ParseTuple(args, "OOiiOO:ignore_unicodeencode_errors", + &encoding, &unicode, &startpos, &endpos, &reason, &state)) + return NULL; + /* ouch: passing NULL, 0, pos gives None instead of u'' */ + return Py_BuildValue("(u#i)", &endpos, 0, endpos); +} + + +PyObject *PyCodec_ReplaceUnicodeEncodeErrors(PyObject *self, PyObject *args) +{ + PyObject *encoding; + PyObject *unicode; + int startpos; + int endpos; + PyObject *reason; + PyObject *state; + PyObject *res; + PyObject 
*restuple; + Py_UNICODE *p; + Py_UNICODE *end; + + if (!PyArg_ParseTuple(args, "OOiiOO:replace_unicodeencode_errors", + &encoding, &unicode, &startpos, &endpos, &reason, &state)) + return NULL; + + res = PyUnicode_FromUnicode(NULL, endpos-startpos); + if (res == NULL) + return NULL; + for (p = PyUnicode_AS_UNICODE(res), end = p + PyUnicode_GET_SIZE(res); + p0) { + *outp++ = '0' + c/base; + c %= base; + base /= 10; + } + *outp++ = ';'; + } + + restuple = Py_BuildValue("(Oi)", res, endpos); + if (restuple==NULL) { + Py_DECREF(res); + return NULL; + } + return restuple; +}
+
+/* Maps a 4-bit value (0..15) to its lowercase hexadecimal digit. */
+static Py_UNICODE hexdigits[] = {
+    '0', '1', '2', '3', '4', '5', '6', '7',
+    '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
+};
+
+/* Encode error handler that replaces every character in
+   unicode[startpos:endpos] with a backslash escape:
+       \UXXXXXXXX  for code points >= 0x10000
+       \uXXXX      for code points >= 0x100
+       \xXX        for everything else
+   args is the tuple (encoding, unicode, startpos, endpos, reason, state).
+   Returns a new tuple (replacement, endpos), or NULL with an exception
+   set on failure. */
+PyObject *PyCodec_BackslashReplaceUnicodeEncodeErrors(
+    PyObject *self, PyObject *args)
+{
+    PyObject *encoding;
+    Py_UNICODE *unicode;
+    int size;
+    int startpos;
+    int endpos;
+    PyObject *reason;
+    PyObject *state;
+    Py_UNICODE *inp;
+    Py_UNICODE *outp;
+    PyObject *res;
+    PyObject *restuple;
+    int ressize;
+
+    if (!PyArg_ParseTuple(args, "Ou#iiOO:backslashreplace_unicodeencode_errors",
+        &encoding, &unicode, &size, &startpos, &endpos, &reason, &state))
+        return NULL;
+
+    /* The loops below read unicode[startpos..endpos-1]; refuse a
+       non-empty range that does not lie inside the parsed buffer
+       instead of reading out of bounds. */
+    if (startpos < endpos && (startpos < 0 || endpos > size)) {
+        PyErr_SetString(PyExc_IndexError, "position out of range");
+        return NULL;
+    }
+
+    /* First pass: compute the length of the replacement string
+       (backslash + type letter + 8, 4 or 2 hex digits). */
+    for (inp = unicode+startpos, ressize = 0; inp < unicode+endpos; ++inp) {
+        if (*inp >= 0x00010000)
+            ressize += 1+1+8;
+        else if (*inp >= 0x100) {
+            ressize += 1+1+4;
+        }
+        else
+            ressize += 1+1+2;
+    }
+    res = PyUnicode_FromUnicode(NULL, ressize);
+    if (res==NULL)
+        return NULL;
+
+    /* Second pass: write the escapes.  Each branch emits only the
+       leading digits of its format; the two lowest digits are shared
+       by all three formats and written after the if/else chain. */
+    for (inp = unicode+startpos, outp = PyUnicode_AS_UNICODE(res);
+        inp < unicode+endpos; ++inp) {
+        Py_UNICODE c = *inp;
+        *outp++ = '\\';
+        if (c >= 0x00010000) {
+            /* Can only be taken if Py_UNICODE is wider than 16 bits. */
+            *outp++ = 'U';
+            *outp++ = hexdigits[(c>>28)&0xf];
+            *outp++ = hexdigits[(c>>24)&0xf];
+            *outp++ = hexdigits[(c>>20)&0xf];
+            *outp++ = hexdigits[(c>>16)&0xf];
+            *outp++ = hexdigits[(c>>12)&0xf];
+            *outp++ = hexdigits[(c>>8)&0xf];
+        }
+        else if (c >= 0x100) {
+            *outp++ = 'u';
+            *outp++ = hexdigits[(c>>12)&0xf];
+            *outp++ = hexdigits[(c>>8)&0xf];
+        }
+        else
+            *outp++ = 'x';
+        *outp++ = hexdigits[(c>>4)&0xf];
+        *outp++ = hexdigits[c&0xf];
+    }
+
+    /* "O" makes the tuple take its own reference to res, so our
+       reference must be dropped on success as well as on failure;
+       releasing it only on failure leaked res on every successful call. */
+    restuple = Py_BuildValue("(Oi)", res, endpos);
+    Py_DECREF(res);
+    return restuple;
+}
+
+/* Dictionary mapping handler names to decoding error callbacks.
+   Created and populated by _PyCodecRegistry_Init(). */
+static PyObject *_PyCodec_UnicodeDecodeErrorHandlerRegistry;
+
+/* Register the callable error as decoding error handler under name.
+   Returns -1 with TypeError set if error is not callable; otherwise
+   returns the result of storing it in the registry dictionary. */
+int PyCodec_RegisterUnicodeDecodeErrorHandler(const char *name, PyObject *error)
+{
+    if (!PyCallable_Check(error)) {
+        PyErr_SetString(PyExc_TypeError, "handler must be callable");
+        return -1;
+    }
+    return PyDict_SetItemString(
+        _PyCodec_UnicodeDecodeErrorHandlerRegistry, (char *)name, error);
+}
+
+/* Look up the decoding error handler registered under name; a NULL
+   name selects "strict".  Returns a new reference, or NULL with
+   LookupError set if no handler is registered under that name. */
+PyObject *PyCodec_LookupUnicodeDecodeErrorHandler(const char *name)
+{
+    PyObject *handler = NULL;
+
+    if (name==NULL)
+        name = "strict";
+    handler = PyDict_GetItemString(
+        _PyCodec_UnicodeDecodeErrorHandlerRegistry, (char *)name);
+    if (!handler)
+        PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
+    else
+        Py_INCREF(handler);
+    return handler;
+}
+
+/* Set a UnicodeError describing a decoding failure in str.  A range
+   of exactly one byte reports the value of that byte; any other
+   range reports only the positions startpos..endpos-1.
+   The caller must guarantee that str[startpos] is readable when
+   endpos == startpos+1; no length is available here to check it. */
+void PyCodec_RaiseUnicodeDecodeError(
+    const char *encoding, const char *str, int startpos, int endpos,
+    const char *reason)
+{
+    if (endpos==startpos+1) {
+        PyErr_Format(PyExc_UnicodeError,
+            "'%.400s' codec can't decode byte 0x%x in position %d: %.400s",
+            encoding, ((int)str[startpos])&0xff, startpos, reason);
+    }
+    else {
+        PyErr_Format(PyExc_UnicodeError,
+            "'%.400s' codec can't decode bytes in position %d-%d: %.400s",
+            encoding, startpos, endpos-1, reason);
+    }
+}
+
+
+/* Decode error handler that raises an exception.
+   args is the tuple (encoding, str, startpos, endpos, reason, state).
+   Always returns NULL with an exception set. */
+PyObject *PyCodec_RaiseUnicodeDecodeErrors(PyObject *self, PyObject *args)
+{
+    char *encoding;
+    char *str;
+    int size;
+    int startpos;
+    int endpos;
+    const char *reason;
+    PyObject *state;
+
+    if (!PyArg_ParseTuple(args, "ss#iisO:raise_unicodedecode_errors",
+        &encoding, &str, &size, &startpos, &endpos, &reason, &state))
+        return NULL;
+
+    /* The single-byte message reads str[startpos]; make sure that
+       byte lies inside the parsed buffer before it is accessed. */
+    if (endpos==startpos+1 && (startpos < 0 || startpos >= size)) {
+        PyErr_SetString(PyExc_IndexError, "position out of range");
+        return NULL;
+    }
+    PyCodec_RaiseUnicodeDecodeError(encoding, str, startpos, endpos, reason);
+    return NULL;
+}
+
+
+/* Decode error handler that skips the undecodable bytes.
+   args is the tuple (encoding, str, startpos, endpos, reason, state).
+   Returns a new tuple (u'', endpos), or NULL on failure. */
+PyObject *PyCodec_IgnoreUnicodeDecodeErrors(PyObject *self, PyObject *args)
+{
+    PyObject *encoding;
+    PyObject *str;
+    int startpos;
+    int endpos;
+    PyObject *reason;
+    PyObject *state;
+    /* never read: the length passed to "u#" below is 0 */
+    Py_UNICODE empty = 0;
+
+    if (!PyArg_ParseTuple(args, "OOiiOO:ignore_unicodedecode_errors",
+        &encoding, &str, &startpos, &endpos, &reason, &state))
+        return NULL;
+    /* ouch: passing NULL, 0, pos gives None instead of u'', so a
+       non-NULL pointer of the right type is passed with length 0 */
+    return Py_BuildValue("(u#i)", &empty, 0, endpos);
+}
+
+
+/* Decode error handler that substitutes a single
+   Py_UNICODE_REPLACEMENT_CHARACTER for the undecodable bytes.
+   args is the tuple (encoding, str, startpos, endpos, reason, state).
+   Returns a new tuple (replacement, endpos), or NULL on failure. */
+PyObject *PyCodec_ReplaceUnicodeDecodeErrors(PyObject *self, PyObject *args)
+{
+    PyObject *encoding;
+    PyObject *str;
+    int startpos;
+    int endpos;
+    PyObject *reason;
+    PyObject *state;
+    Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER;
+
+    if (!PyArg_ParseTuple(args, "OOiiOO:replace_unicodedecode_errors",
+        &encoding, &str, &startpos, &endpos, &reason, &state))
+        return NULL;
+    return Py_BuildValue("(u#i)", &res, 1, endpos);
+}
+
 void _PyCodecRegistry_Init(void)
 {
+    /* Built-in error handlers.  Entries 0..4 are registered as encode
+       handlers and entries 5..7 as decode handlers by the loops below;
+       keep the loop bounds in sync when changing this table. */
+    static struct {
+        char *name;
+        PyMethodDef def;
+    } methods[] =
+    {
+        {
+            "strict",
+            {
+                "raise_unicodeencode_errors",
+                PyCodec_RaiseUnicodeEncodeErrors,
+                METH_VARARGS
+            }
+        },
+        {
+            "ignore",
+            {
+                "ignore_unicodeencode_errors",
+                PyCodec_IgnoreUnicodeEncodeErrors,
+                METH_VARARGS
+            }
+        },
+        {
+            "replace",
+            {
+                "replace_unicodeencode_errors",
+                PyCodec_ReplaceUnicodeEncodeErrors,
+                METH_VARARGS
+            }
+        },
+        {
+            "xmlcharrefreplace",
+            {
+                "xmlcharrefreplace_unicodeencode_errors",
+                PyCodec_XMLCharRefReplaceUnicodeEncodeErrors,
+                METH_VARARGS
+            }
+        },
+        {
+            "backslashreplace",
+            {
+                "backslashreplace_unicodeencode_errors",
+                PyCodec_BackslashReplaceUnicodeEncodeErrors,
+                METH_VARARGS
+            }
+        },
+        {
+            "strict",
+            {
+                "raise_unicodedecode_errors",
+                PyCodec_RaiseUnicodeDecodeErrors,
+                METH_VARARGS
+            }
+        },
+        {
+            "ignore",
+            {
+                "ignore_unicodedecode_errors",
+                PyCodec_IgnoreUnicodeDecodeErrors,
+                METH_VARARGS
+            }
+        },
+        {
+            "replace",
+            {
+                "replace_unicodedecode_errors",
+                PyCodec_ReplaceUnicodeDecodeErrors,
+                METH_VARARGS
+            }
+        }
+    };
     if (_PyCodec_SearchPath == NULL)
         _PyCodec_SearchPath = PyList_New(0);
     if (_PyCodec_SearchCache == NULL)
         _PyCodec_SearchCache = PyDict_New();
+    if (_PyCodec_UnicodeEncodeErrorHandlerRegistry == NULL) {
+        int i;
+        _PyCodec_UnicodeEncodeErrorHandlerRegistry = PyDict_New();
+
+        /* A missing registry would be passed as NULL to the dictionary
+           calls in the register/lookup functions, so treat it like
+           every other initialization failure here. */
+        if (_PyCodec_UnicodeEncodeErrorHandlerRegistry == NULL)
+            Py_FatalError("can't initialize codec registry");
+        for (i = 0; i < 5; ++i) {
+            PyObject *func = PyCFunction_New(&methods[i].def, NULL);
+            int res;
+            if (!func)
+                Py_FatalError("can't initialize codec registry");
+            res = PyCodec_RegisterUnicodeEncodeErrorHandler(methods[i].name, func);
+            /* the registry dictionary holds its own reference now */
+            Py_DECREF(func);
+            if (res)
+                Py_FatalError("can't initialize codec registry");
+        }
+    }
+    if (_PyCodec_UnicodeDecodeErrorHandlerRegistry == NULL) {
+        int i;
+        _PyCodec_UnicodeDecodeErrorHandlerRegistry = PyDict_New();
+
+        /* Same reasoning as for the encode registry above. */
+        if (_PyCodec_UnicodeDecodeErrorHandlerRegistry == NULL)
+            Py_FatalError("can't initialize codec registry");
+        for (i = 5; i < 8; ++i) {
+            PyObject *func = PyCFunction_New(&methods[i].def, NULL);
+            int res;
+            if (!func)
+                Py_FatalError("can't initialize codec registry");
+            res = PyCodec_RegisterUnicodeDecodeErrorHandler(methods[i].name, func);
+            /* the registry dictionary holds its own reference now */
+            Py_DECREF(func);
+            if (res)
+                Py_FatalError("can't initialize codec registry");
+        }
+    }
     if (_PyCodec_SearchPath == NULL ||
         _PyCodec_SearchCache == NULL)
         Py_FatalError("can't initialize codec registry");
@@ -433,4 +908,8 @@
     _PyCodec_SearchPath = NULL;
     Py_XDECREF(_PyCodec_SearchCache);
     _PyCodec_SearchCache = NULL;
+    Py_XDECREF(_PyCodec_UnicodeEncodeErrorHandlerRegistry);
+    _PyCodec_UnicodeEncodeErrorHandlerRegistry = NULL;
+    Py_XDECREF(_PyCodec_UnicodeDecodeErrorHandlerRegistry);
+    _PyCodec_UnicodeDecodeErrorHandlerRegistry = NULL;
 }