Index: Include/codecs.h =================================================================== RCS file: /cvsroot/python/python/dist/src/Include/codecs.h,v retrieving revision 2.3 diff -u -c -5 -r2.3 codecs.h *** Include/codecs.h 3 Aug 2000 16:24:24 -0000 2.3 --- Include/codecs.h 24 Jul 2002 18:52:18 -0000 *************** *** 115,123 **** const char *encoding, PyObject *stream, const char *errors ); #ifdef __cplusplus } #endif ! #endif /* !Py_CODECREGISTRY_H */ --- 115,158 ---- const char *encoding, PyObject *stream, const char *errors ); + /* Unicode encoding error handling callback registry API */ + + /* Register the error handling callback function error under the name + name. This function will be called by the codec when it encounters + unencodable characters/undecodable bytes and doesn't know the + callback name, when name is specified as the error parameter + in the call to the encode/decode function. + Return 0 on success, -1 on error */ + extern DL_IMPORT(int) PyCodec_RegisterError( + const char *name, + PyObject *error + ); + + /* Lookup the error handling callback function registered under the + name error. As a special case NULL can be passed, in which case + the error handling callback for strict encoding will be returned. */ + extern DL_IMPORT(PyObject *) PyCodec_LookupError( + const char *name + ); + + /* raise exc as an exception */ + extern DL_IMPORT(PyObject *) PyCodec_StrictErrors(PyObject *exc); + + /* ignore the unicode error, skipping the faulty input */ + extern DL_IMPORT(PyObject *) PyCodec_IgnoreErrors(PyObject *exc); + + /* replace the unicode error with ? 
or U+FFFD */ + extern DL_IMPORT(PyObject *) PyCodec_ReplaceErrors(PyObject *exc); + + /* replace the unicode encode error with XML character references */ + extern DL_IMPORT(PyObject *) PyCodec_XMLCharRefReplaceErrors(PyObject *exc); + + /* replace the unicode encode error with backslash escapes (\x, \u and \U) */ + extern DL_IMPORT(PyObject *) PyCodec_BackslashReplaceErrors(PyObject *exc); + #ifdef __cplusplus } #endif ! #endif Index: Include/pyerrors.h =================================================================== RCS file: /cvsroot/python/python/dist/src/Include/pyerrors.h,v retrieving revision 2.54 diff -u -c -5 -r2.54 pyerrors.h *** Include/pyerrors.h 29 May 2002 15:54:54 -0000 2.54 --- Include/pyerrors.h 24 Jul 2002 18:52:18 -0000 *************** *** 52,61 **** --- 52,64 ---- extern DL_IMPORT(PyObject *) PyExc_SystemError; extern DL_IMPORT(PyObject *) PyExc_SystemExit; extern DL_IMPORT(PyObject *) PyExc_TypeError; extern DL_IMPORT(PyObject *) PyExc_UnboundLocalError; extern DL_IMPORT(PyObject *) PyExc_UnicodeError; + extern DL_IMPORT(PyObject *) PyExc_UnicodeEncodeError; + extern DL_IMPORT(PyObject *) PyExc_UnicodeDecodeError; + extern DL_IMPORT(PyObject *) PyExc_UnicodeTranslateError; extern DL_IMPORT(PyObject *) PyExc_ValueError; extern DL_IMPORT(PyObject *) PyExc_ZeroDivisionError; #ifdef MS_WINDOWS extern DL_IMPORT(PyObject *) PyExc_WindowsError; #endif *************** *** 107,116 **** --- 110,183 ---- extern DL_IMPORT(void) PyErr_SetInterrupt(void); /* Support for adding program text to SyntaxErrors */ extern DL_IMPORT(void) PyErr_SyntaxLocation(char *, int); extern DL_IMPORT(PyObject *) PyErr_ProgramText(char *, int); + + /* The following functions are used to create and modify unicode + exceptions from C */ + /* create a UnicodeDecodeError object */ + extern DL_IMPORT(PyObject *) PyUnicodeDecodeError_Create( + const char *, const char *, int, int, int, const char *); + + /* create a UnicodeEncodeError object */ + extern DL_IMPORT(PyObject *) 
PyUnicodeEncodeError_Create( + const char *, const Py_UNICODE *, int, int, int, const char *); + + /* create a UnicodeTranslateError object */ + extern DL_IMPORT(PyObject *) PyUnicodeTranslateError_Create( + const Py_UNICODE *, int, int, int, const char *); + + /* get the encoding attribute */ + extern DL_IMPORT(PyObject *) PyUnicodeEncodeError_GetEncoding(PyObject *); + extern DL_IMPORT(PyObject *) PyUnicodeDecodeError_GetEncoding(PyObject *); + extern DL_IMPORT(PyObject *) PyUnicodeTranslateError_GetEncoding(PyObject *); + + /* get the object attribute */ + extern DL_IMPORT(PyObject *) PyUnicodeEncodeError_GetObject(PyObject *); + extern DL_IMPORT(PyObject *) PyUnicodeDecodeError_GetObject(PyObject *); + extern DL_IMPORT(PyObject *) PyUnicodeTranslateError_GetObject(PyObject *); + + /* get the value of the start attribute (the int * may not be NULL) + return -1 on success, 0 on failure */ + extern DL_IMPORT(int) PyUnicodeEncodeError_GetStart(PyObject *, int *); + extern DL_IMPORT(int) PyUnicodeDecodeError_GetStart(PyObject *, int *); + extern DL_IMPORT(int) PyUnicodeTranslateError_GetStart(PyObject *, int *); + + /* assign a new value to the start attribute + return -1 on success, 0 on failure */ + extern DL_IMPORT(int) PyUnicodeEncodeError_SetStart(PyObject *, int); + extern DL_IMPORT(int) PyUnicodeDecodeError_SetStart(PyObject *, int); + extern DL_IMPORT(int) PyUnicodeTranslateError_SetStart(PyObject *, int); + + /* get the value of the end attribute (the int *may not be NULL) + return -1 on success, 0 on failure */ + extern DL_IMPORT(int) PyUnicodeEncodeError_GetEnd(PyObject *, int *); + extern DL_IMPORT(int) PyUnicodeDecodeError_GetEnd(PyObject *, int *); + extern DL_IMPORT(int) PyUnicodeTranslateError_GetEnd(PyObject *, int *); + + /* assign a new value to the end attribute + return -1 on success, 0 on failure */ + extern DL_IMPORT(int) PyUnicodeEncodeError_SetEnd(PyObject *, int); + extern DL_IMPORT(int) PyUnicodeDecodeError_SetEnd(PyObject *, int); + 
extern DL_IMPORT(int) PyUnicodeTranslateError_SetEnd(PyObject *, int); + + /* get the value of the reason attribute + return -1 on success, 0 on failure */ + extern DL_IMPORT(PyObject *) PyUnicodeEncodeError_GetReason(PyObject *); + extern DL_IMPORT(PyObject *) PyUnicodeDecodeError_GetReason(PyObject *); + extern DL_IMPORT(PyObject *) PyUnicodeTranslateError_GetReason(PyObject *); + + /* assign a new value to the reason attribute + return -1 on success, 0 on failure */ + extern DL_IMPORT(int) PyUnicodeEncodeError_SetReason( + PyObject *, const char *); + extern DL_IMPORT(int) PyUnicodeDecodeError_SetReason( + PyObject *, const char *); + extern DL_IMPORT(int) PyUnicodeTranslateError_SetReason( + PyObject *, const char *); + /* These APIs aren't really part of the error implementation, but often needed to format error messages; the native C lib APIs are not available on all platforms, which is why we provide emulations for those platforms in Python/mysnprintf.c, Index: Lib/codecs.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/codecs.py,v retrieving revision 1.26 diff -u -c -5 -r1.26 codecs.py *** Lib/codecs.py 4 Jun 2002 15:16:29 -0000 1.26 --- Lib/codecs.py 24 Jul 2002 18:52:19 -0000 *************** *** 18,28 **** 'Failed to load the builtin codecs: %s' % why __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE", "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE", ! "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE"] ### Constants # # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) --- 18,31 ---- 'Failed to load the builtin codecs: %s' % why __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE", "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE", ! "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE", ! 
"strict_errors", "ignore_errors", "replace_errors", ! "xmlcharrefreplace_errors", ! "register_error", "lookup_error"] ### Constants # # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) Index: Modules/_codecsmodule.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Modules/_codecsmodule.c,v retrieving revision 2.12 diff -u -c -5 -r2.12 _codecsmodule.c *** Modules/_codecsmodule.c 30 Jun 2002 15:26:09 -0000 2.12 --- Modules/_codecsmodule.c 24 Jul 2002 18:52:21 -0000 *************** *** 662,671 **** --- 662,725 ---- } #endif /* MS_WINDOWS */ #endif /* Py_USING_UNICODE */ + /* --- Error handler registry --------------------------------------------- */ + + static PyObject *register_error(PyObject *self, PyObject *args) + { + const char *name; + PyObject *handler; + + if (!PyArg_ParseTuple(args, "sO:register_error", + &name, &handler)) + return NULL; + if (PyCodec_RegisterError(name, handler)) + return NULL; + Py_INCREF(Py_None); + return Py_None; + } + + static PyObject *lookup_error(PyObject *self, PyObject *args) + { + const char *name; + + if (!PyArg_ParseTuple(args, "s:lookup_error", + &name)) + return NULL; + return PyCodec_LookupError(name); + } + + static PyObject *strict_errors(PyObject *self, PyObject *exc) + { + return PyCodec_StrictErrors(exc); + } + + + static PyObject *ignore_errors(PyObject *self, PyObject *exc) + { + return PyCodec_IgnoreErrors(exc); + } + + + static PyObject *replace_errors(PyObject *self, PyObject *exc) + { + return PyCodec_ReplaceErrors(exc); + } + + + static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc) + { + return PyCodec_XMLCharRefReplaceErrors(exc); + } + + + static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc) + { + return PyCodec_BackslashReplaceErrors(exc); + } /* --- Module API --------------------------------------------------------- */ static PyMethodDef _codecs_functions[] = { {"register", codecregister, 
METH_VARARGS}, {"lookup", codeclookup, METH_VARARGS}, *************** *** 698,707 **** --- 752,768 ---- #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) {"mbcs_encode", mbcs_encode, METH_VARARGS}, {"mbcs_decode", mbcs_decode, METH_VARARGS}, #endif #endif /* Py_USING_UNICODE */ + {"register_error", register_error, METH_VARARGS}, + {"lookup_error", lookup_error, METH_VARARGS}, + {"strict_errors", strict_errors, METH_O}, + {"ignore_errors", ignore_errors, METH_O}, + {"replace_errors", replace_errors, METH_O}, + {"xmlcharrefreplace_errors",xmlcharrefreplace_errors, METH_O}, + {"backslashreplace_errors", backslashreplace_errors, METH_O}, {NULL, NULL} /* sentinel */ }; DL_EXPORT(void) init_codecs(void) Index: Objects/stringobject.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Objects/stringobject.c,v retrieving revision 2.170 diff -u -c -5 -r2.170 stringobject.c *** Objects/stringobject.c 17 Jul 2002 16:30:38 -0000 2.170 --- Objects/stringobject.c 24 Jul 2002 18:52:22 -0000 *************** *** 2260,2270 **** "S.encode([encoding[,errors]]) -> object\n\ \n\ Encodes S using the codec registered for encoding. encoding defaults\n\ to the default encoding. errors may be given to set a different error\n\ handling scheme. Default is 'strict' meaning that encoding errors raise\n\ ! a ValueError. Other possible values are 'ignore' and 'replace'."); static PyObject * string_encode(PyStringObject *self, PyObject *args) { char *encoding = NULL; --- 2260,2272 ---- "S.encode([encoding[,errors]]) -> object\n\ \n\ Encodes S using the codec registered for encoding. encoding defaults\n\ to the default encoding. errors may be given to set a different error\n\ handling scheme. Default is 'strict' meaning that encoding errors raise\n\ ! a UnicodeError. Other possible values are 'ignore', 'replace' and\n\ ! 'xmlcharrefreplace' as well as any other name registered with\n\ ! 
codecs.register_error."); static PyObject * string_encode(PyStringObject *self, PyObject *args) { char *encoding = NULL; *************** *** 2279,2289 **** "S.decode([encoding[,errors]]) -> object\n\ \n\ Decodes S using the codec registered for encoding. encoding defaults\n\ to the default encoding. errors may be given to set a different error\n\ handling scheme. Default is 'strict' meaning that encoding errors raise\n\ ! a ValueError. Other possible values are 'ignore' and 'replace'."); static PyObject * string_decode(PyStringObject *self, PyObject *args) { char *encoding = NULL; --- 2281,2292 ---- "S.decode([encoding[,errors]]) -> object\n\ \n\ Decodes S using the codec registered for encoding. encoding defaults\n\ to the default encoding. errors may be given to set a different error\n\ handling scheme. Default is 'strict' meaning that encoding errors raise\n\ ! a UnicodeError. Other possible values are 'ignore' and 'replace' as well\n\ ! as any other name registerd with codecs.register_error."); static PyObject * string_decode(PyStringObject *self, PyObject *args) { char *encoding = NULL; Index: Objects/unicodeobject.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Objects/unicodeobject.c,v retrieving revision 2.156 diff -u -c -5 -r2.156 unicodeobject.c *** Objects/unicodeobject.c 17 Jul 2002 16:30:38 -0000 2.156 --- Objects/unicodeobject.c 24 Jul 2002 18:52:24 -0000 *************** *** 487,498 **** int size, const char *encoding, const char *errors) { PyObject *buffer = NULL, *unicode; ! ! if (encoding == NULL) encoding = PyUnicode_GetDefaultEncoding(); /* Shortcuts for common default encodings */ if (strcmp(encoding, "utf-8") == 0) return PyUnicode_DecodeUTF8(s, size, errors); --- 487,498 ---- int size, const char *encoding, const char *errors) { PyObject *buffer = NULL, *unicode; ! ! 
if (encoding == NULL) encoding = PyUnicode_GetDefaultEncoding(); /* Shortcuts for common default encodings */ if (strcmp(encoding, "utf-8") == 0) return PyUnicode_DecodeUTF8(s, size, errors); *************** *** 639,648 **** --- 639,734 ---- onError: return -1; } + /* error handling callback helper: + build arguments, call the callback and check the arguments, + if no exception occurred, copy the replacement to the output + and adjust various state variables. + return 0 on success, -1 on error + */ + + static + int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, + const char *encoding, const char *reason, + const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr, + PyObject **output, int *outpos, Py_UNICODE **outptr) + { + static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple"; + + PyObject *restuple = NULL; + PyObject *repunicode = NULL; + int outsize = PyUnicode_GET_SIZE(*output); + int requiredsize; + int newpos; + Py_UNICODE *repptr; + int repsize; + int res = -1; + + if (*errorHandler == NULL) { + *errorHandler = PyCodec_LookupError(errors); + if (*errorHandler == NULL) + goto onError; + } + + if (*exceptionObject == NULL) { + *exceptionObject = PyUnicodeDecodeError_Create( + encoding, input, insize, *startinpos, *endinpos, reason); + if (*exceptionObject == NULL) + goto onError; + } + else { + if (!PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) + goto onError; + if (!PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) + goto onError; + if (!PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) + goto onError; + } + + restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); + if (restuple == NULL) + goto onError; + if (!PyTuple_Check(restuple)) { + PyErr_Format(PyExc_TypeError, &argparse[4]); + goto onError; + } + if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) + goto 
onError; + if (newpos<0) + newpos = 0; + else if (newpos>insize) + newpos = insize; + + /* need more space? (at least enough for what we + have+the replacement+the rest of the string (starting + at the new input position), so we won't have to check space + when there are no errors in the rest of the string) */ + repptr = PyUnicode_AS_UNICODE(repunicode); + repsize = PyUnicode_GET_SIZE(repunicode); + requiredsize = *outpos + repsize + insize-newpos; + if (requiredsize > outsize) { + if (requiredsize<2*outsize) + requiredsize = 2*outsize; + if (PyUnicode_Resize(output, requiredsize)) + goto onError; + *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; + } + *endinpos = newpos; + *inptr = input + newpos; + Py_UNICODE_COPY(*outptr, repptr, repsize); + *outptr += repsize; + *outpos += repsize; + /* we made it! */ + res = 0; + + onError: + Py_XDECREF(restuple); + return res; + } + /* --- UTF-7 Codec -------------------------------------------------------- */ /* see RFC2152 for details */ static *************** *** 697,748 **** } else { \ *out++ = outCh; \ } \ } \ - static - int utf7_decoding_error(Py_UNICODE **dest, - const char *errors, - const char *details) - { - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "UTF-7 decoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - if (dest != NULL) { - **dest = Py_UNICODE_REPLACEMENT_CHARACTER; - (*dest)++; - } - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "UTF-7 decoding error; unknown error handling code: %.400s", - errors); - return -1; - } - } - PyObject *PyUnicode_DecodeUTF7(const char *s, int size, const char *errors) { const char *e; PyUnicodeObject *unicode; Py_UNICODE *p; const char *errmsg = ""; int inShift = 0; unsigned int bitsleft = 0; unsigned long charsleft = 0; ! 
int surrogate = 0; unicode = _PyUnicode_New(size); if (!unicode) return NULL; if (size == 0) --- 783,810 ---- } else { \ *out++ = outCh; \ } \ } \ PyObject *PyUnicode_DecodeUTF7(const char *s, int size, const char *errors) { + const char *starts = s; + int startinpos; + int endinpos; + int outpos; const char *e; PyUnicodeObject *unicode; Py_UNICODE *p; const char *errmsg = ""; int inShift = 0; unsigned int bitsleft = 0; unsigned long charsleft = 0; ! int surrogate = 0; ! PyObject *errorHandler = NULL; ! PyObject *exc = NULL; unicode = _PyUnicode_New(size); if (!unicode) return NULL; if (size == 0) *************** *** 750,760 **** p = unicode->str; e = s + size; while (s < e) { ! Py_UNICODE ch = *s; if (inShift) { if ((ch == '-') || !B64CHAR(ch)) { inShift = 0; s++; --- 812,824 ---- p = unicode->str; e = s + size; while (s < e) { ! Py_UNICODE ch; ! restart: ! ch = *s; if (inShift) { if ((ch == '-') || !B64CHAR(ch)) { inShift = 0; s++; *************** *** 795,804 **** --- 859,869 ---- s++; /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); } } else if ( ch == '+' ) { + startinpos = s-starts; s++; if (s < e && *s == '-') { s++; *p++ = '+'; } else *************** *** 816,840 **** *p++ = ch; s++; } continue; utf7Error: ! if (utf7_decoding_error(&p, errors, errmsg)) ! goto onError; } if (inShift) { ! if (utf7_decoding_error(&p, errors, "unterminated shift sequence")) goto onError; } ! if (_PyUnicode_Resize(&unicode, p - unicode->str)) goto onError; return (PyObject *)unicode; onError: Py_DECREF(unicode); return NULL; } --- 881,923 ---- *p++ = ch; s++; } continue; utf7Error: ! outpos = p-PyUnicode_AS_UNICODE(unicode); ! endinpos = s-starts; ! if (unicode_decode_call_errorhandler( ! errors, &errorHandler, ! "utf7", errmsg, ! starts, size, &startinpos, &endinpos, &exc, &s, ! (PyObject **)&unicode, &outpos, &p)) ! goto onError; } if (inShift) { ! outpos = p-PyUnicode_AS_UNICODE(unicode); ! endinpos = size; ! 
if (unicode_decode_call_errorhandler( ! errors, &errorHandler, ! "utf7", "unterminated shift sequence", ! starts, size, &startinpos, &endinpos, &exc, &s, ! (PyObject **)&unicode, &outpos, &p)) goto onError; + if (s < e) + goto restart; } ! if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode))) goto onError; + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return (PyObject *)unicode; onError: + Py_XDECREF(errorHandler); + Py_XDECREF(exc); Py_DECREF(unicode); return NULL; } *************** *** 960,1009 **** 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 }; - static - int utf8_decoding_error(const char **source, - Py_UNICODE **dest, - const char *errors, - const char *details) - { - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "UTF-8 decoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - (*source)++; - return 0; - } - else if (strcmp(errors,"replace") == 0) { - (*source)++; - **dest = Py_UNICODE_REPLACEMENT_CHARACTER; - (*dest)++; - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "UTF-8 decoding error; unknown error handling code: %.400s", - errors); - return -1; - } - } - PyObject *PyUnicode_DecodeUTF8(const char *s, int size, const char *errors) { int n; const char *e; PyUnicodeObject *unicode; Py_UNICODE *p; const char *errmsg = ""; /* Note: size will always be longer than the resulting Unicode character count */ unicode = _PyUnicode_New(size); if (!unicode) --- 1043,1067 ---- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 }; PyObject *PyUnicode_DecodeUTF8(const char *s, int size, const char *errors) { + const char *starts = s; int n; + int startinpos; + int endinpos; + int outpos; const char *e; PyUnicodeObject *unicode; Py_UNICODE *p; const char *errmsg = ""; 
+ PyObject *errorHandler = NULL; + PyObject *exc = NULL; /* Note: size will always be longer than the resulting Unicode character count */ unicode = _PyUnicode_New(size); if (!unicode) *************** *** 1026,1055 **** --- 1084,1123 ---- n = utf8_code_length[ch]; if (s + n > e) { errmsg = "unexpected end of data"; + startinpos = s-starts; + endinpos = size; goto utf8Error; } switch (n) { case 0: errmsg = "unexpected code byte"; + startinpos = s-starts; + endinpos = startinpos+1; goto utf8Error; case 1: errmsg = "internal error"; + startinpos = s-starts; + endinpos = startinpos+1; goto utf8Error; case 2: if ((s[1] & 0xc0) != 0x80) { errmsg = "invalid data"; + startinpos = s-starts; + endinpos = startinpos+2; goto utf8Error; } ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); if (ch < 0x80) { + startinpos = s-starts; + endinpos = startinpos+2; errmsg = "illegal encoding"; goto utf8Error; } else *p++ = (Py_UNICODE)ch; *************** *** 1057,1066 **** --- 1125,1136 ---- case 3: if ((s[1] & 0xc0) != 0x80 || (s[2] & 0xc0) != 0x80) { errmsg = "invalid data"; + startinpos = s-starts; + endinpos = startinpos+3; goto utf8Error; } ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); if (ch < 0x0800) { /* Note: UTF-8 encodings of surrogates are considered *************** *** 1069,1078 **** --- 1139,1150 ---- XXX For wide builds (UCS-4) we should probably try to recombine the surrogates into a single code unit. 
*/ errmsg = "illegal encoding"; + startinpos = s-starts; + endinpos = startinpos+3; goto utf8Error; } else *p++ = (Py_UNICODE)ch; break; *************** *** 1080,1089 **** --- 1152,1163 ---- case 4: if ((s[1] & 0xc0) != 0x80 || (s[2] & 0xc0) != 0x80 || (s[3] & 0xc0) != 0x80) { errmsg = "invalid data"; + startinpos = s-starts; + endinpos = startinpos+4; goto utf8Error; } ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); /* validate and convert to UTF-16 */ *************** *** 1091,1100 **** --- 1165,1176 ---- byte encoding */ || (ch > 0x10ffff)) /* maximum value allowed for UTF-16 */ { errmsg = "illegal encoding"; + startinpos = s-starts; + endinpos = startinpos+4; goto utf8Error; } #ifdef Py_UNICODE_WIDE *p++ = (Py_UNICODE)ch; #else *************** *** 1112,1138 **** break; default: /* Other sizes are only needed for UCS-4 */ errmsg = "unsupported Unicode code range"; goto utf8Error; } s += n; continue; utf8Error: ! if (utf8_decoding_error(&s, &p, errors, errmsg)) ! goto onError; } /* Adjust length */ if (_PyUnicode_Resize(&unicode, p - unicode->str)) goto onError; return (PyObject *)unicode; onError: Py_DECREF(unicode); return NULL; } /* Allocation strategy: if the string is short, convert into a stack buffer --- 1188,1225 ---- break; default: /* Other sizes are only needed for UCS-4 */ errmsg = "unsupported Unicode code range"; + startinpos = s-starts; + endinpos = startinpos+n; goto utf8Error; } s += n; continue; utf8Error: ! outpos = p-PyUnicode_AS_UNICODE(unicode); ! if (unicode_decode_call_errorhandler( ! errors, &errorHandler, ! "utf8", errmsg, ! starts, size, &startinpos, &endinpos, &exc, &s, ! (PyObject **)&unicode, &outpos, &p)) ! 
goto onError; } /* Adjust length */ if (_PyUnicode_Resize(&unicode, p - unicode->str)) goto onError; + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return (PyObject *)unicode; onError: + Py_XDECREF(errorHandler); + Py_XDECREF(exc); Py_DECREF(unicode); return NULL; } /* Allocation strategy: if the string is short, convert into a stack buffer *************** *** 1246,1292 **** NULL); } /* --- UTF-16 Codec ------------------------------------------------------- */ - static - int utf16_decoding_error(Py_UNICODE **dest, - const char *errors, - const char *details) - { - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "UTF-16 decoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - if (dest) { - **dest = Py_UNICODE_REPLACEMENT_CHARACTER; - (*dest)++; - } - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "UTF-16 decoding error; " - "unknown error handling code: %.400s", - errors); - return -1; - } - } - PyObject * PyUnicode_DecodeUTF16(const char *s, int size, const char *errors, int *byteorder) { PyUnicodeObject *unicode; Py_UNICODE *p; const unsigned char *q, *e; int bo = 0; /* assume native ordering by default */ const char *errmsg = ""; --- 1333,1352 ---- NULL); } /* --- UTF-16 Codec ------------------------------------------------------- */ PyObject * PyUnicode_DecodeUTF16(const char *s, int size, const char *errors, int *byteorder) { + const char *starts = s; + int startinpos; + int endinpos; + int outpos; PyUnicodeObject *unicode; Py_UNICODE *p; const unsigned char *q, *e; int bo = 0; /* assume native ordering by default */ const char *errmsg = ""; *************** *** 1294,1310 **** #ifdef BYTEORDER_IS_LITTLE_ENDIAN int ihi = 1, ilo = 0; #else int ihi = 0, ilo = 1; #endif ! ! /* size should be an even number */ ! if (size & 1) { ! if (utf16_decoding_error(NULL, errors, "truncated data")) ! return NULL; ! 
--size; /* else ignore the oddball byte */ ! } /* Note: size will always be longer than the resulting Unicode character count */ unicode = _PyUnicode_New(size); if (!unicode) --- 1354,1365 ---- #ifdef BYTEORDER_IS_LITTLE_ENDIAN int ihi = 1, ilo = 0; #else int ihi = 0, ilo = 1; #endif ! PyObject *errorHandler = NULL; ! PyObject *exc = NULL; /* Note: size will always be longer than the resulting Unicode character count */ unicode = _PyUnicode_New(size); if (!unicode) *************** *** 1357,1377 **** ihi = 0; ilo = 1; } while (q < e) { ! Py_UNICODE ch = (q[ihi] << 8) | q[ilo]; q += 2; if (ch < 0xD800 || ch > 0xDFFF) { *p++ = ch; continue; } /* UTF-16 code pair: */ if (q >= e) { errmsg = "unexpected end of data"; goto utf16Error; } if (0xD800 <= ch && ch <= 0xDBFF) { Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; q += 2; --- 1412,1445 ---- ihi = 0; ilo = 1; } while (q < e) { ! Py_UNICODE ch; ! /* remaining bytes at the end? (size should be even) */ ! if (e-q<2) { ! errmsg = "truncated data"; ! startinpos = ((const char *)q)-starts; ! endinpos = ((const char *)e)-starts; ! goto utf16Error; ! /* The remaining input chars are ignored if the callback ! chooses to skip the input */ ! } ! ch = (q[ihi] << 8) | q[ilo]; ! q += 2; if (ch < 0xD800 || ch > 0xDFFF) { *p++ = ch; continue; } /* UTF-16 code pair: */ if (q >= e) { errmsg = "unexpected end of data"; + startinpos = (((const char *)q)-2)-starts; + endinpos = ((const char *)e)-starts; goto utf16Error; } if (0xD800 <= ch && ch <= 0xDBFF) { Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; q += 2; *************** *** 1384,1416 **** #endif continue; } else { errmsg = "illegal UTF-16 surrogate"; goto utf16Error; } } errmsg = "illegal encoding"; /* Fall through to report the error */ utf16Error: ! 
if (utf16_decoding_error(&p, errors, errmsg)) goto onError; } if (byteorder) *byteorder = bo; /* Adjust length */ if (_PyUnicode_Resize(&unicode, p - unicode->str)) goto onError; return (PyObject *)unicode; onError: Py_DECREF(unicode); return NULL; } PyObject * PyUnicode_EncodeUTF16(const Py_UNICODE *s, --- 1452,1497 ---- #endif continue; } else { errmsg = "illegal UTF-16 surrogate"; + startinpos = (((const char *)q)-4)-starts; + endinpos = startinpos+2; goto utf16Error; } } errmsg = "illegal encoding"; + startinpos = (((const char *)q)-2)-starts; + endinpos = startinpos+2; /* Fall through to report the error */ utf16Error: ! outpos = p-PyUnicode_AS_UNICODE(unicode); ! if (unicode_decode_call_errorhandler( ! errors, &errorHandler, ! "utf16", errmsg, ! starts, size, &startinpos, &endinpos, &exc, (const char **)&q, ! (PyObject **)&unicode, &outpos, &p)) goto onError; } if (byteorder) *byteorder = bo; /* Adjust length */ if (_PyUnicode_Resize(&unicode, p - unicode->str)) goto onError; + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return (PyObject *)unicode; onError: Py_DECREF(unicode); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return NULL; } PyObject * PyUnicode_EncodeUTF16(const Py_UNICODE *s, *************** *** 1487,1560 **** 0); } /* --- Unicode Escape Codec ----------------------------------------------- */ - static - int unicodeescape_decoding_error(Py_UNICODE **x, - const char *errors, - const char *details) - { - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "Unicode-Escape decoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - **x = Py_UNICODE_REPLACEMENT_CHARACTER; - (*x)++; - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "Unicode-Escape decoding error; " - "unknown error handling code: %.400s", - errors); - return -1; - } - } - static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; PyObject 
*PyUnicode_DecodeUnicodeEscape(const char *s, int size, const char *errors) { PyUnicodeObject *v; ! Py_UNICODE *p, *buf; const char *end; char* message; Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ /* Escaped strings will always be longer than the resulting Unicode string, so we start with size here and then reduce the ! length after conversion to the true value. */ v = _PyUnicode_New(size); if (v == NULL) goto onError; if (size == 0) return (PyObject *)v; ! p = buf = PyUnicode_AS_UNICODE(v); end = s + size; while (s < end) { unsigned char c; Py_UNICODE x; ! int i, digits; /* Non-escape characters are interpreted as Unicode ordinals */ if (*s != '\\') { *p++ = (unsigned char) *s++; continue; } ! /* \ - Escapes */ s++; switch (*s++) { /* \x escapes */ --- 1568,1621 ---- 0); } /* --- Unicode Escape Codec ----------------------------------------------- */ static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, int size, const char *errors) { + const char *starts = s; + int startinpos; + int endinpos; + int outpos; + int i; PyUnicodeObject *v; ! Py_UNICODE *p; const char *end; char* message; Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ + PyObject *errorHandler = NULL; + PyObject *exc = NULL; /* Escaped strings will always be longer than the resulting Unicode string, so we start with size here and then reduce the ! length after conversion to the true value. ! (but if the error callback returns a long replacement string ! we'll have to allocate more space) */ v = _PyUnicode_New(size); if (v == NULL) goto onError; if (size == 0) return (PyObject *)v; ! p = PyUnicode_AS_UNICODE(v); end = s + size; while (s < end) { unsigned char c; Py_UNICODE x; ! int digits; /* Non-escape characters are interpreted as Unicode ordinals */ if (*s != '\\') { *p++ = (unsigned char) *s++; continue; } ! 
startinpos = s-starts; /* \ - Escapes */ s++; switch (*s++) { /* \x escapes */ *************** *** 1599,1616 **** case 'U': digits = 8; message = "truncated \\UXXXXXXXX escape"; hexescape: chr = 0; ! for (i = 0; i < digits; i++) { c = (unsigned char) s[i]; if (!isxdigit(c)) { ! if (unicodeescape_decoding_error(&p, errors, message)) goto onError; ! chr = 0xffffffff; ! i++; ! break; } chr = (chr<<4) & ~0xF; if (c >= '0' && c <= '9') chr += c - '0'; else if (c >= 'a' && c <= 'f') --- 1660,1691 ---- case 'U': digits = 8; message = "truncated \\UXXXXXXXX escape"; hexescape: chr = 0; ! outpos = p-PyUnicode_AS_UNICODE(v); ! if (s+digits>end) { ! endinpos = size; ! if (unicode_decode_call_errorhandler( ! errors, &errorHandler, ! "unicodeescape", "end of string in escape sequence", ! starts, size, &startinpos, &endinpos, &exc, &s, ! (PyObject **)&v, &outpos, &p)) ! goto onError; ! goto nextByte; ! } ! for (i = 0; i < digits; ++i) { c = (unsigned char) s[i]; if (!isxdigit(c)) { ! endinpos = (s+i+1)-starts; ! if (unicode_decode_call_errorhandler( ! errors, &errorHandler, ! "unicodeescape", message, ! starts, size, &startinpos, &endinpos, &exc, &s, ! (PyObject **)&v, &outpos, &p)) goto onError; ! goto nextByte; } chr = (chr<<4) & ~0xF; if (c >= '0' && c <= '9') chr += c - '0'; else if (c >= 'a' && c <= 'f') *************** *** 1618,1630 **** else chr += 10 + c - 'A'; } s += i; if (chr == 0xffffffff) ! /* _decoding_error will have already written into the ! target buffer. */ ! break; store: /* when we get here, chr is a 32-bit unicode character */ if (chr <= 0xffff) /* UCS-2 character */ *p++ = (Py_UNICODE) chr; --- 1693,1705 ---- else chr += 10 + c - 'A'; } s += i; if (chr == 0xffffffff) ! /* _decoding_error will have already written into the ! target buffer. */ ! 
break; store: /* when we get here, chr is a 32-bit unicode character */ if (chr <= 0xffff) /* UCS-2 character */ *p++ = (Py_UNICODE) chr; *************** *** 1637,1650 **** chr -= 0x10000L; *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); #endif } else { ! if (unicodeescape_decoding_error( ! &p, errors, ! "illegal Unicode character") ! ) goto onError; } break; /* \N{name} */ --- 1712,1728 ---- chr -= 0x10000L; *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); #endif } else { ! endinpos = s-starts; ! outpos = p-PyUnicode_AS_UNICODE(v); ! if (unicode_decode_call_errorhandler( ! errors, &errorHandler, ! "unicodeescape", "illegal Unicode character", ! starts, size, &startinpos, &endinpos, &exc, &s, ! (PyObject **)&v, &outpos, &p)) goto onError; } break; /* \N{name} */ *************** *** 1676,1714 **** s++; if (ucnhash_CAPI->getcode(start, s-start-1, &chr)) goto store; } } ! if (unicodeescape_decoding_error(&p, errors, message)) goto onError; break; default: if (s > end) { ! if (unicodeescape_decoding_error(&p, errors, "\\ at end of string")) goto onError; } else { *p++ = '\\'; *p++ = (unsigned char)s[-1]; } break; } } ! if (_PyUnicode_Resize(&v, (int)(p - buf))) ! goto onError; return (PyObject *)v; ucnhashError: PyErr_SetString( PyExc_UnicodeError, "\\N escapes not supported (can't load unicodedata module)" ); return NULL; onError: Py_XDECREF(v); return NULL; } /* Return a Unicode-Escape string version of the Unicode object. --- 1754,1813 ---- s++; if (ucnhash_CAPI->getcode(start, s-start-1, &chr)) goto store; } } ! /* s--; */ ! endinpos = s-starts; ! outpos = p-PyUnicode_AS_UNICODE(v); ! if (unicode_decode_call_errorhandler( ! errors, &errorHandler, ! "unicodeescape", message, ! starts, size, &startinpos, &endinpos, &exc, &s, ! (PyObject **)&v, &outpos, &p)) goto onError; break; default: if (s > end) { ! message = "\\ at end of string"; ! s--; ! endinpos = s-starts; ! 
outpos = p-PyUnicode_AS_UNICODE(v); ! if (unicode_decode_call_errorhandler( ! errors, &errorHandler, ! "unicodeescape", message, ! starts, size, &startinpos, &endinpos, &exc, &s, ! (PyObject **)&v, &outpos, &p)) goto onError; } else { *p++ = '\\'; *p++ = (unsigned char)s[-1]; } break; } + nextByte: + ; } ! if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) ! goto onError; return (PyObject *)v; ucnhashError: PyErr_SetString( PyExc_UnicodeError, "\\N escapes not supported (can't load unicodedata module)" ); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return NULL; onError: Py_XDECREF(v); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return NULL; } /* Return a Unicode-Escape string version of the Unicode object. *************** *** 1868,1891 **** PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, int size, const char *errors) { PyUnicodeObject *v; ! Py_UNICODE *p, *buf; const char *end; const char *bs; /* Escaped strings will always be longer than the resulting Unicode string, so we start with size here and then reduce the ! length after conversion to the true value. */ v = _PyUnicode_New(size); if (v == NULL) goto onError; if (size == 0) return (PyObject *)v; ! p = buf = PyUnicode_AS_UNICODE(v); end = s + size; while (s < end) { unsigned char c; Py_UCS4 x; int i; --- 1967,1997 ---- PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, int size, const char *errors) { + const char *starts = s; + int startinpos; + int endinpos; + int outpos; PyUnicodeObject *v; ! Py_UNICODE *p; const char *end; const char *bs; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; /* Escaped strings will always be longer than the resulting Unicode string, so we start with size here and then reduce the ! length after conversion to the true value. (But decoding error ! handler might have to resize the string) */ v = _PyUnicode_New(size); if (v == NULL) goto onError; if (size == 0) return (PyObject *)v; ! 
p = PyUnicode_AS_UNICODE(v); end = s + size; while (s < end) { unsigned char c; Py_UCS4 x; int i; *************** *** 1893,1902 **** --- 1999,2009 ---- /* Non-escape characters are interpreted as Unicode ordinals */ if (*s != '\\') { *p++ = (unsigned char)*s++; continue; } + startinpos = s-starts; /* \u-escapes are only interpreted iff the number of leading backslashes if odd */ bs = s; for (;s < end;) { *************** *** 1911,1948 **** } p--; s++; /* \uXXXX with 4 hex digits */ ! for (x = 0, i = 0; i < 4; i++) { ! c = (unsigned char)s[i]; if (!isxdigit(c)) { ! if (unicodeescape_decoding_error(&p, errors, ! "truncated \\uXXXX")) goto onError; ! x = 0xffffffff; ! i++; ! break; } x = (x<<4) & ~0xF; if (c >= '0' && c <= '9') x += c - '0'; else if (c >= 'a' && c <= 'f') x += 10 + c - 'a'; else x += 10 + c - 'A'; } ! s += i; ! if (x != 0xffffffff) ! *p++ = x; } ! if (_PyUnicode_Resize(&v, (int)(p - buf))) goto onError; return (PyObject *)v; onError: Py_XDECREF(v); return NULL; } PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, int size) --- 2018,2062 ---- } p--; s++; /* \uXXXX with 4 hex digits */ ! outpos = p-PyUnicode_AS_UNICODE(v); ! for (x = 0, i = 0; i < 4; ++i, ++s) { ! c = (unsigned char)*s; if (!isxdigit(c)) { ! endinpos = s-starts; ! if (unicode_decode_call_errorhandler( ! errors, &errorHandler, ! "rawunicodeescape", "truncated \\uXXXX", ! starts, size, &startinpos, &endinpos, &exc, &s, ! (PyObject **)&v, &outpos, &p)) goto onError; ! goto nextByte; } x = (x<<4) & ~0xF; if (c >= '0' && c <= '9') x += c - '0'; else if (c >= 'a' && c <= 'f') x += 10 + c - 'a'; else x += 10 + c - 'A'; } ! *p++ = x; ! nextByte: ! ; } ! 
if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) goto onError; + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return (PyObject *)v; onError: Py_XDECREF(v); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return NULL; } PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, int size) *************** *** 2018,2092 **** onError: Py_XDECREF(v); return NULL; } ! static ! int latin1_encoding_error(const Py_UNICODE **source, ! char **dest, ! const char *errors, ! const char *details) ! { ! if ((errors == NULL) || ! (strcmp(errors,"strict") == 0)) { ! PyErr_Format(PyExc_UnicodeError, ! "Latin-1 encoding error: %.400s", ! details); ! return -1; } ! else if (strcmp(errors,"ignore") == 0) { ! return 0; } ! else if (strcmp(errors,"replace") == 0) { ! **dest = '?'; ! (*dest)++; ! return 0; } ! else { ! PyErr_Format(PyExc_ValueError, ! "Latin-1 encoding error; " ! "unknown error handling code: %.400s", ! errors); ! return -1; } } ! PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, int size, ! const char *errors) { ! PyObject *repr; ! char *s, *start; ! ! repr = PyString_FromStringAndSize(NULL, size); ! if (repr == NULL) ! return NULL; if (size == 0) ! return repr; ! ! s = PyString_AS_STRING(repr); ! start = s; ! while (size-- > 0) { ! Py_UNICODE ch = *p++; ! if (ch >= 256) { ! if (latin1_encoding_error(&p, &s, errors, ! "ordinal not in range(256)")) ! goto onError; } - else - *s++ = (char)ch; } ! /* Resize if error handling skipped some characters */ ! if (s - start < PyString_GET_SIZE(repr)) ! _PyString_Resize(&repr, s - start); ! return repr; ! onError: ! Py_DECREF(repr); return NULL; } PyObject *PyUnicode_AsLatin1String(PyObject *unicode) { if (!PyUnicode_Check(unicode)) { PyErr_BadArgument(); return NULL; --- 2132,2406 ---- onError: Py_XDECREF(v); return NULL; } ! /* create or adjust a UnicodeEncodeError */ ! static void make_encode_exception(PyObject **exceptionObject, ! const char *encoding, ! const Py_UNICODE *unicode, int size, ! 
int startpos, int endpos, ! const char *reason) ! { ! if (*exceptionObject == NULL) { ! *exceptionObject = PyUnicodeEncodeError_Create( ! encoding, unicode, size, startpos, endpos, reason); } ! else { ! if (!PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) ! goto onError; ! if (!PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) ! goto onError; ! if (!PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) ! goto onError; ! return; ! onError: ! Py_DECREF(*exceptionObject); ! *exceptionObject = NULL; } ! } ! ! /* raises a UnicodeEncodeError */ ! static void raise_encode_exception(PyObject **exceptionObject, ! const char *encoding, ! const Py_UNICODE *unicode, int size, ! int startpos, int endpos, ! const char *reason) ! { ! make_encode_exception(exceptionObject, ! encoding, unicode, size, startpos, endpos, reason); ! if (*exceptionObject != NULL) ! PyCodec_StrictErrors(*exceptionObject); ! } ! ! /* error handling callback helper: ! build arguments, call the callback and check the arguments, ! put the result into newpos and return the replacement string, which ! has to be freed by the caller */ ! static PyObject *unicode_encode_call_errorhandler(const char *errors, ! PyObject **errorHandler, ! const char *encoding, const char *reason, ! const Py_UNICODE *unicode, int size, PyObject **exceptionObject, ! int startpos, int endpos, ! int *newpos) ! { ! static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple"; ! ! PyObject *restuple; ! PyObject *resunicode; ! ! if (*errorHandler == NULL) { ! *errorHandler = PyCodec_LookupError(errors); ! if (*errorHandler == NULL) ! return NULL; ! } ! ! make_encode_exception(exceptionObject, ! encoding, unicode, size, startpos, endpos, reason); ! if (*exceptionObject == NULL) ! return NULL; ! ! restuple = PyObject_CallFunctionObjArgs( ! *errorHandler, *exceptionObject, NULL); ! if (restuple == NULL) ! return NULL; ! if (!PyTuple_Check(restuple)) { ! 
PyErr_Format(PyExc_TypeError, &argparse[4]); ! Py_DECREF(restuple); ! return NULL; } ! if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, ! &resunicode, newpos)) { ! Py_DECREF(restuple); ! return NULL; } + if (*newpos<0) + *newpos = 0; + else if (*newpos>size) + *newpos = size; + Py_INCREF(resunicode); + Py_DECREF(restuple); + return resunicode; } ! static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, int size, ! const char *errors, ! int limit) { ! /* output object */ ! PyObject *res; ! /* pointers to the beginning and end+1 of input */ ! const Py_UNICODE *startp = p; ! const Py_UNICODE *endp = p + size; ! /* pointer to the beginning of the unencodable characters */ ! /* const Py_UNICODE *badp = NULL; */ ! /* pointer into the output */ ! char *str; ! /* current output position */ ! int respos = 0; ! int ressize; ! char *encoding = (limit == 256) ? "latin-1" : "ascii"; ! char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; ! PyObject *errorHandler = NULL; ! PyObject *exc = NULL; ! /* the following variable is used for caching string comparisons ! * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ ! int known_errorHandler = -1; ! ! /* allocate enough for a simple encoding without ! replacements, if we need more, we'll resize */ ! res = PyString_FromStringAndSize(NULL, size); ! if (res == NULL) ! goto onError; if (size == 0) ! return res; ! str = PyString_AS_STRING(res); ! ressize = size; ! ! while (p=limit)) ! ++collend; ! /* cache callback name lookup (if not done yet, i.e. it's the first error) */ ! if (known_errorHandler==-1) { ! if ((errors==NULL) || (!strcmp(errors, "strict"))) ! known_errorHandler = 1; ! else if (!strcmp(errors, "replace")) ! known_errorHandler = 2; ! else if (!strcmp(errors, "ignore")) ! known_errorHandler = 3; ! else if (!strcmp(errors, "xmlcharrefreplace")) ! known_errorHandler = 4; ! else ! known_errorHandler = 0; ! } ! switch (known_errorHandler) { ! 
case 1: /* strict */ ! raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); ! goto onError; ! case 2: /* replace */ ! while (collstart++ ressize) { ! if (requiredsize<2*ressize) ! requiredsize = 2*ressize; ! if (_PyString_Resize(&res, requiredsize)) ! goto onError; ! str = PyString_AS_STRING(res) + respos; ! ressize = requiredsize; ! } ! /* generate replacement (temporarily (mis)uses p) */ ! for (p = collstart; p < collend; ++p) { ! str += sprintf(str, "&#%d;", (int)*p); ! } ! p = collend; ! break; ! default: ! repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, ! encoding, reason, startp, size, &exc, ! collstart-startp, collend-startp, &newpos); ! if (repunicode == NULL) ! goto onError; ! /* need more space? (at least enough for what we ! have+the replacement+the rest of the string, so ! we won't have to check space for encodable characters) */ ! respos = str-PyString_AS_STRING(res); ! repsize = PyUnicode_GET_SIZE(repunicode); ! requiredsize = respos+repsize+(endp-collend); ! if (requiredsize > ressize) { ! if (requiredsize<2*ressize) ! requiredsize = 2*ressize; ! if (_PyString_Resize(&res, requiredsize)) { ! Py_DECREF(repunicode); ! goto onError; ! } ! str = PyString_AS_STRING(res) + respos; ! ressize = requiredsize; ! } ! /* check if there is anything unencodable in the replacement ! and copy it to the output */ ! for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { ! c = *uni2; ! if (c >= limit) { ! raise_encode_exception(&exc, encoding, startp, size, ! unicodepos, unicodepos+1, reason); ! Py_DECREF(repunicode); ! goto onError; ! } ! *str = (char)c; ! } ! p = startp + newpos; ! Py_DECREF(repunicode); ! } } } ! /* Resize if we allocated to much */ ! respos = str-PyString_AS_STRING(res); ! if (respos 0) { ! register unsigned char c; ! ! c = (unsigned char)*s++; ! if (c < 128) *p++ = c; ! else if (ascii_decoding_error(&s, &p, errors, ! 
"ordinal not in range(128)")) goto onError; } if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) goto onError; return (PyObject *)v; onError: Py_XDECREF(v); return NULL; } - static - int ascii_encoding_error(const Py_UNICODE **source, - char **dest, - const char *errors, - const char *details) - { - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "ASCII encoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - **dest = '?'; - (*dest)++; - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "ASCII encoding error; " - "unknown error handling code: %.400s", - errors); - return -1; - } - } - PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, int size, const char *errors) { ! PyObject *repr; ! char *s, *start; ! ! repr = PyString_FromStringAndSize(NULL, size); ! if (repr == NULL) ! return NULL; ! if (size == 0) ! return repr; ! ! s = PyString_AS_STRING(repr); ! start = s; ! while (size-- > 0) { ! Py_UNICODE ch = *p++; ! if (ch >= 128) { ! if (ascii_encoding_error(&p, &s, errors, ! "ordinal not in range(128)")) ! goto onError; ! } ! else ! *s++ = (char)ch; ! } ! /* Resize if error handling skipped some characters */ ! if (s - start < PyString_GET_SIZE(repr)) ! _PyString_Resize(&repr, s - start); ! return repr; ! ! onError: ! Py_DECREF(repr); ! return NULL; } PyObject *PyUnicode_AsASCIIString(PyObject *unicode) { if (!PyUnicode_Check(unicode)) { --- 2436,2483 ---- if (v == NULL) goto onError; if (size == 0) return (PyObject *)v; p = PyUnicode_AS_UNICODE(v); ! e = s + size; ! while (s < e) { ! register unsigned char c = (unsigned char)*s; ! if (c < 128) { *p++ = c; ! ++s; ! } ! else { ! startinpos = s-starts; ! endinpos = startinpos + 1; ! outpos = p-PyUnicode_AS_UNICODE(v); ! if (unicode_decode_call_errorhandler( ! errors, &errorHandler, ! 
"ascii", "ordinal not in range(128)", ! starts, size, &startinpos, &endinpos, &exc, &s, ! (PyObject **)&v, &outpos, &p)) goto onError; + } } if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) goto onError; + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return (PyObject *)v; onError: Py_XDECREF(v); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return NULL; } PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, int size, const char *errors) { ! return unicode_encode_ucs1(p, size, errors, 128); } PyObject *PyUnicode_AsASCIIString(PyObject *unicode) { if (!PyUnicode_Check(unicode)) { *************** *** 2307,2354 **** #endif /* MS_WINDOWS */ /* --- Character Mapping Codec -------------------------------------------- */ - static - int charmap_decoding_error(const char **source, - Py_UNICODE **dest, - const char *errors, - const char *details) - { - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "charmap decoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - **dest = Py_UNICODE_REPLACEMENT_CHARACTER; - (*dest)++; - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "charmap decoding error; " - "unknown error handling code: %.400s", - errors); - return -1; - } - } - PyObject *PyUnicode_DecodeCharmap(const char *s, int size, PyObject *mapping, const char *errors) { PyUnicodeObject *v; Py_UNICODE *p; int extrachars = 0; /* Default to Latin-1 */ if (mapping == NULL) return PyUnicode_DecodeLatin1(s, size, errors); --- 2553,2577 ---- #endif /* MS_WINDOWS */ /* --- Character Mapping Codec -------------------------------------------- */ PyObject *PyUnicode_DecodeCharmap(const char *s, int size, PyObject *mapping, const char *errors) { + const char *starts = s; + int startinpos; + int endinpos; + int outpos; + const char *e; PyUnicodeObject *v; 
Py_UNICODE *p; int extrachars = 0; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; /* Default to Latin-1 */ if (mapping == NULL) return PyUnicode_DecodeLatin1(s, size, errors); *************** *** 2356,2367 **** if (v == NULL) goto onError; if (size == 0) return (PyObject *)v; p = PyUnicode_AS_UNICODE(v); ! while (size-- > 0) { ! unsigned char ch = *s++; PyObject *w, *x; /* Get mapping (char ordinal -> integer, Unicode char or None) */ w = PyInt_FromLong((long)ch); if (w == NULL) --- 2579,2591 ---- if (v == NULL) goto onError; if (size == 0) return (PyObject *)v; p = PyUnicode_AS_UNICODE(v); ! e = s + size; ! while (s < e) { ! unsigned char ch = *s; PyObject *w, *x; /* Get mapping (char ordinal -> integer, Unicode char or None) */ w = PyInt_FromLong((long)ch); if (w == NULL) *************** *** 2389,2403 **** } *p++ = (Py_UNICODE)value; } else if (x == Py_None) { /* undefined mapping */ ! if (charmap_decoding_error(&s, &p, errors, ! "character maps to ")) { Py_DECREF(x); goto onError; } } else if (PyUnicode_Check(x)) { int targetsize = PyUnicode_GET_SIZE(x); if (targetsize == 1) --- 2613,2634 ---- } *p++ = (Py_UNICODE)value; } else if (x == Py_None) { /* undefined mapping */ ! outpos = p-PyUnicode_AS_UNICODE(v); ! startinpos = s-starts; ! endinpos = startinpos+1; ! if (unicode_decode_call_errorhandler( ! errors, &errorHandler, ! "charmap", "character maps to ", ! starts, size, &startinpos, &endinpos, &exc, &s, ! (PyObject **)&v, &outpos, &p)) { Py_DECREF(x); goto onError; } + continue; } else if (PyUnicode_Check(x)) { int targetsize = PyUnicode_GET_SIZE(x); if (targetsize == 1) *************** *** 2433,2583 **** "character mapping must return integer, None or unicode"); Py_DECREF(x); goto onError; } Py_DECREF(x); } if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) goto onError; return (PyObject *)v; onError: Py_XDECREF(v); return NULL; } ! static ! 
int charmap_encoding_error(const Py_UNICODE **source, ! char **dest, ! const char *errors, ! const char *details) ! { ! if ((errors == NULL) || ! (strcmp(errors,"strict") == 0)) { ! PyErr_Format(PyExc_UnicodeError, ! "charmap encoding error: %.400s", ! details); ! return -1; } ! else if (strcmp(errors,"ignore") == 0) { ! return 0; } ! else if (strcmp(errors,"replace") == 0) { ! **dest = '?'; ! (*dest)++; ! return 0; } else { ! PyErr_Format(PyExc_ValueError, ! "charmap encoding error; " ! "unknown error handling code: %.400s", ! errors); ! return -1; } } PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, int size, PyObject *mapping, const char *errors) { ! PyObject *v; ! char *s; ! int extrachars = 0; /* Default to Latin-1 */ if (mapping == NULL) return PyUnicode_EncodeLatin1(p, size, errors); ! v = PyString_FromStringAndSize(NULL, size); ! if (v == NULL) ! return NULL; if (size == 0) ! return v; ! s = PyString_AS_STRING(v); ! while (size-- > 0) { ! Py_UNICODE ch = *p++; ! PyObject *w, *x; ! /* Get mapping (Unicode ordinal -> string char, integer or None) */ ! w = PyInt_FromLong((long)ch); ! if (w == NULL) goto onError; ! x = PyObject_GetItem(mapping, w); ! Py_DECREF(w); ! if (x == NULL) { ! if (PyErr_ExceptionMatches(PyExc_LookupError)) { ! /* No mapping found means: mapping is undefined. */ ! PyErr_Clear(); ! x = Py_None; ! Py_INCREF(x); ! } else ! goto onError; ! } ! ! /* Apply mapping */ ! if (PyInt_Check(x)) { ! long value = PyInt_AS_LONG(x); ! if (value < 0 || value > 255) { ! PyErr_SetString(PyExc_TypeError, ! "character mapping must be in range(256)"); ! Py_DECREF(x); ! goto onError; ! } ! *s++ = (char)value; ! } ! else if (x == Py_None) { ! /* undefined mapping */ ! if (charmap_encoding_error(&p, &s, errors, ! "character maps to ")) { ! Py_DECREF(x); goto onError; - } } ! else if (PyString_Check(x)) { ! int targetsize = PyString_GET_SIZE(x); ! ! if (targetsize == 1) ! /* 1-1 mapping */ ! *s++ = *PyString_AS_STRING(x); ! else if (targetsize > 1) { ! 
/* 1-n mapping */ ! if (targetsize > extrachars) { ! /* resize first */ ! int oldpos = (int)(s - PyString_AS_STRING(v)); ! int needed = (targetsize - extrachars) + \ ! (targetsize << 2); ! extrachars += needed; ! if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) { ! Py_DECREF(x); ! goto onError; ! } ! s = PyString_AS_STRING(v) + oldpos; ! } ! memcpy(s, PyString_AS_STRING(x), targetsize); ! s += targetsize; ! extrachars -= targetsize; ! } ! /* 1-0 mapping: skip the character */ ! } ! else { ! /* wrong return value */ ! PyErr_SetString(PyExc_TypeError, ! "character mapping must return integer, None or unicode"); ! Py_DECREF(x); goto onError; - } - Py_DECREF(x); } ! if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v)) ! _PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))); ! return v; ! onError: ! Py_XDECREF(v); return NULL; } PyObject *PyUnicode_AsCharmapString(PyObject *unicode, PyObject *mapping) --- 2664,2963 ---- "character mapping must return integer, None or unicode"); Py_DECREF(x); goto onError; } Py_DECREF(x); + ++s; } if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) goto onError; + Py_XDECREF(errorHandler); + Py_XDECREF(exc); return (PyObject *)v; onError: + Py_XDECREF(errorHandler); + Py_XDECREF(exc); Py_XDECREF(v); return NULL; } ! /* Lookup the character ch in the mapping. If the character ! can't be found, Py_None is returned (or NULL, if another ! error occured). */ ! static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) ! { ! PyObject *w = PyInt_FromLong((long)c); ! PyObject *x; ! ! if (w == NULL) ! return NULL; ! x = PyObject_GetItem(mapping, w); ! Py_DECREF(w); ! if (x == NULL) { ! if (PyErr_ExceptionMatches(PyExc_LookupError)) { ! /* No mapping found means: mapping is undefined. */ ! PyErr_Clear(); ! x = Py_None; ! Py_INCREF(x); ! return x; ! } else ! return NULL; } ! else if (PyInt_Check(x)) { ! long value = PyInt_AS_LONG(x); ! 
if (value < 0 || value > 255) { ! PyErr_SetString(PyExc_TypeError, ! "character mapping must be in range(256)"); ! Py_DECREF(x); ! return NULL; ! } ! return x; } ! else if (PyString_Check(x)) ! return x; ! else { ! /* wrong return value */ ! PyErr_SetString(PyExc_TypeError, ! "character mapping must return integer, None or str"); ! Py_DECREF(x); ! return NULL; } + } + + /* lookup the character, put the result in the output string and adjust + various state variables. Reallocate the output string if not enough + space is available. Return a new reference to the object that + was put in the output buffer, or Py_None, if the mapping was undefined + (in which case no character was written) or NULL, if a + reallocation error ocurred. The called must decref the result */ + static + PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping, + PyObject **outobj, int *outpos) + { + PyObject *rep = charmapencode_lookup(c, mapping); + + if (rep==NULL) + return NULL; + else if (rep==Py_None) + return rep; else { ! char *outstart = PyString_AS_STRING(*outobj); ! int outsize = PyString_GET_SIZE(*outobj); ! if (PyInt_Check(rep)) { ! int requiredsize = *outpos+1; ! if (outsize0; ++uni2) { + x = charmapencode_output(*uni2, mapping, res, respos); + if (x==NULL) { + Py_DECREF(repunicode); + return -1; + } + else if (x==Py_None) { + Py_DECREF(repunicode); + Py_DECREF(x); + raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); + return -1; + } + Py_DECREF(x); + } + *inpos = newpos; + Py_DECREF(repunicode); + } + return 0; } PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, int size, PyObject *mapping, const char *errors) { ! /* output object */ ! PyObject *res = NULL; ! /* current input position */ ! int inpos = 0; ! /* current output position */ ! int respos = 0; ! PyObject *errorHandler = NULL; ! PyObject *exc = NULL; ! /* the following variable is used for caching string comparisons ! 
* -1=not initialized, 0=unknown, 1=strict, 2=replace, ! * 3=ignore, 4=xmlcharrefreplace */ ! int known_errorHandler = -1; /* Default to Latin-1 */ if (mapping == NULL) return PyUnicode_EncodeLatin1(p, size, errors); ! /* allocate enough for a simple encoding without ! replacements, if we need more, we'll resize */ ! res = PyString_FromStringAndSize(NULL, size); ! if (res == NULL) ! goto onError; if (size == 0) ! return res; ! while (inpos adjust input position */ ! ++inpos; ! Py_DECREF(x); ! } ! /* Resize if we allocated to much */ ! if (respos 0) { - Py_UNICODE ch = *s++; - PyObject *w, *x; ! /* Get mapping */ ! w = PyInt_FromLong(ch); ! if (w == NULL) ! goto onError; ! x = PyObject_GetItem(mapping, w); ! Py_DECREF(w); ! if (x == NULL) { ! if (PyErr_ExceptionMatches(PyExc_LookupError)) { ! /* No mapping found: default to 1-1 mapping */ ! PyErr_Clear(); ! *p++ = ch; ! continue; ! } goto onError; } ! ! /* Apply mapping */ ! if (PyInt_Check(x)) ! *p++ = (Py_UNICODE)PyInt_AS_LONG(x); ! else if (x == Py_None) { ! /* undefined mapping */ ! if (translate_error(&s, &p, errors, ! "character maps to ")) { ! Py_DECREF(x); ! goto onError; ! } ! } ! else if (PyUnicode_Check(x)) { ! if (PyUnicode_GET_SIZE(x) != 1) { ! /* 1-n mapping */ ! PyErr_SetString(PyExc_NotImplementedError, ! "1-n mappings are currently not implemented"); ! Py_DECREF(x); ! goto onError; } - *p++ = *PyUnicode_AS_UNICODE(x); } - else { - /* wrong return value */ - PyErr_SetString(PyExc_TypeError, - "translate mapping must return integer, None or unicode"); - Py_DECREF(x); - goto onError; - } - Py_DECREF(x); } ! if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) ! if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) goto onError; ! done: ! return (PyObject *)v; ! ! onError: ! 
Py_XDECREF(v); return NULL; } PyObject *PyUnicode_Translate(PyObject *str, PyObject *mapping, --- 2970,3317 ---- PyUnicode_GET_SIZE(unicode), mapping, NULL); } + /* create or adjust a UnicodeTranslateError */ + static void make_translate_exception(PyObject **exceptionObject, + const Py_UNICODE *unicode, int size, + int startpos, int endpos, + const char *reason) + { + if (*exceptionObject == NULL) { + *exceptionObject = PyUnicodeTranslateError_Create( + unicode, size, startpos, endpos, reason); + } + else { + if (!PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) + goto onError; + if (!PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) + goto onError; + if (!PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) + goto onError; + return; + onError: + Py_DECREF(*exceptionObject); + *exceptionObject = NULL; + } + } + + /* raises a UnicodeTranslateError */ + static void raise_translate_exception(PyObject **exceptionObject, + const Py_UNICODE *unicode, int size, + int startpos, int endpos, + const char *reason) + { + make_translate_exception(exceptionObject, + unicode, size, startpos, endpos, reason); + if (*exceptionObject != NULL) + PyCodec_StrictErrors(*exceptionObject); + } + + /* error handling callback helper: + build arguments, call the callback and check the arguments, + put the result into newpos and return the replacement string, which + has to be freed by the caller */ + static PyObject *unicode_translate_call_errorhandler(const char *errors, + PyObject **errorHandler, + const char *reason, + const Py_UNICODE *unicode, int size, PyObject **exceptionObject, + int startpos, int endpos, + int *newpos) + { + static char *argparse = "O!i;translating error handler must return (unicode, int) tuple"; + + PyObject *restuple; + PyObject *resunicode; + + if (*errorHandler == NULL) { + *errorHandler = PyCodec_LookupError(errors); + if (*errorHandler == NULL) + return NULL; + } + + make_translate_exception(exceptionObject, + unicode, size, 
startpos, endpos, reason); + if (*exceptionObject == NULL) + return NULL; + + restuple = PyObject_CallFunctionObjArgs( + *errorHandler, *exceptionObject, NULL); + if (restuple == NULL) + return NULL; + if (!PyTuple_Check(restuple)) { + PyErr_Format(PyExc_TypeError, &argparse[4]); + Py_DECREF(restuple); + return NULL; + } + if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, + &resunicode, newpos)) { + Py_DECREF(restuple); + return NULL; + } + if (*newpos<0) + *newpos = 0; + else if (*newpos>size) + *newpos = size; + Py_INCREF(resunicode); + Py_DECREF(restuple); + return resunicode; + } + + /* Lookup the character ch in the mapping and put the result in result, + which must be decrefed by the caller. + Return 0 on success, -1 on error */ static ! int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) ! { ! PyObject *w = PyInt_FromLong((long)c); ! PyObject *x; ! ! if (w == NULL) ! return -1; ! x = PyObject_GetItem(mapping, w); ! Py_DECREF(w); ! if (x == NULL) { ! if (PyErr_ExceptionMatches(PyExc_LookupError)) { ! /* No mapping found means: use 1:1 mapping. */ ! PyErr_Clear(); ! *result = NULL; ! return 0; ! } else ! return -1; } ! else if (x == Py_None) { ! *result = x; return 0; } ! else if (PyInt_Check(x)) { ! long value = PyInt_AS_LONG(x); ! long max = PyUnicode_GetMax(); ! if (value < 0 || value > max) { ! PyErr_Format(PyExc_TypeError, ! "character mapping must be in range(0x%lx)", max+1); ! Py_DECREF(x); ! return -1; ! } ! *result = x; ! return 0; ! } ! else if (PyUnicode_Check(x)) { ! *result = x; return 0; } else { ! /* wrong return value */ ! PyErr_SetString(PyExc_TypeError, ! "character mapping must return integer, None or unicode"); return -1; } } + /* ensure that *outobj is at least requiredsize characters long, + if not reallocate and adjust various state variables. 
+ Return 0 on success, -1 on error */ + static + int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize, + int requiredsize) + { + if (requiredsize > *outsize) { + /* remember old output position */ + int outpos = *outp-PyUnicode_AS_UNICODE(*outobj); + /* exponentially overallocate to minimize reallocations */ + if (requiredsize < 2 * *outsize) + requiredsize = 2 * *outsize; + if (_PyUnicode_Resize(outobj, requiredsize)) + return -1; + *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; + *outsize = requiredsize; + } + return 0; + } + /* lookup the character, put the result in the output string and adjust + various state variables. Return a new reference to the object that + was put in the output buffer in *result, or Py_None, if the mapping was + undefined (in which case no character was written). + The called must decref result. + Return 0 on success, -1 on error. */ + static + int charmaptranslate_output(Py_UNICODE c, PyObject *mapping, + PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res) + { + if (charmaptranslate_lookup(c, mapping, res)) + return -1; + if (*res==NULL) { + /* not found => default to 1:1 mapping */ + *(*outp)++ = (Py_UNICODE)c; + } + else if (*res==Py_None) + ; + else if (PyInt_Check(*res)) { + /* no overflow check, because we know that the space is enough */ + *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res); + } + else if (PyUnicode_Check(*res)) { + int repsize = PyUnicode_GET_SIZE(*res); + if (repsize==1) { + /* no overflow check, because we know that the space is enough */ + *(*outp)++ = *PyUnicode_AS_UNICODE(*res); + } + else if (repsize!=0) { + /* more than one character */ + int requiredsize = *outsize + repsize - 1; + if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize)) + return -1; + memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); + *outp += repsize; + } + } + else + return -1; + return 0; + } ! 
PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, int size, PyObject *mapping, const char *errors) { ! /* output object */ ! PyObject *res = NULL; ! /* pointers to the beginning and end+1 of input */ ! const Py_UNICODE *startp = p; ! const Py_UNICODE *endp = p + size; ! /* pointer into the output */ ! Py_UNICODE *str; ! /* current output position */ ! int respos = 0; ! int ressize; ! char *reason = "character maps to "; ! PyObject *errorHandler = NULL; ! PyObject *exc = NULL; ! /* the following variable is used for caching string comparisons ! * -1=not initialized, 0=unknown, 1=strict, 2=replace, ! * 3=ignore, 4=xmlcharrefreplace */ ! int known_errorHandler = -1; ! if (mapping == NULL) { PyErr_BadArgument(); return NULL; } ! /* allocate enough for a simple 1:1 translation without ! replacements, if we need more, we'll resize */ ! res = PyUnicode_FromUnicode(NULL, size); ! if (res == NULL) ! goto onError; ! if (size == 0) ! return res; ! str = PyUnicode_AS_UNICODE(res); ! ressize = size; ! ! while (p adjust input pointer */ ! ++p; ! else { /* untranslatable character */ ! PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ ! int repsize; ! int newpos; ! Py_UNICODE *uni2; ! /* startpos for collecting untranslatable chars */ ! const Py_UNICODE *collstart = p; ! const Py_UNICODE *collend = p+1; ! const Py_UNICODE *coll; ! ! Py_XDECREF(x); ! /* find all untranslatable characters */ ! while (collend < endp) { ! if (charmaptranslate_lookup(*collend, mapping, &x)) ! goto onError; ! Py_XDECREF(x); ! if (x!=Py_None) ! break; ! ++collend; ! } ! /* cache callback name lookup ! * (if not done yet, i.e. it's the first error) */ ! if (known_errorHandler==-1) { ! if ((errors==NULL) || (!strcmp(errors, "strict"))) ! known_errorHandler = 1; ! else if (!strcmp(errors, "replace")) ! known_errorHandler = 2; ! else if (!strcmp(errors, "ignore")) ! known_errorHandler = 3; ! else if (!strcmp(errors, "xmlcharrefreplace")) ! known_errorHandler = 4; ! else ! 
known_errorHandler = 0; ! } ! switch (known_errorHandler) { ! case 1: /* strict */ ! raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); ! goto onError; ! case 2: /* replace */ ! /* No need to check for space, this is a 1:1 replacement */ ! for (coll = collstart; coll0; ++uni2) ! *str++ = *uni2; ! p = startp + newpos; ! Py_DECREF(repunicode); } } } ! /* Resize if we allocated to much */ ! respos = str-PyUnicode_AS_UNICODE(res); ! if (respos= 0) { *output++ = '0' + decimal; continue; } if (0 < ch && ch < 256) { *output++ = (char)ch; continue; } ! /* All other characters are considered invalid */ ! if (errors == NULL || strcmp(errors, "strict") == 0) { ! PyErr_SetString(PyExc_ValueError, ! "invalid decimal Unicode string"); ! goto onError; } ! else if (strcmp(errors, "ignore") == 0) ! continue; ! else if (strcmp(errors, "replace") == 0) { ! *output++ = '?'; ! continue; } } /* 0-terminate the output string */ *output++ = '\0'; return 0; onError: return -1; } /* --- Helpers ------------------------------------------------------------ */ --- 3340,3469 ---- int length, char *output, const char *errors) { Py_UNICODE *p, *end; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + const char *encoding = "decimal"; + const char *reason = "invalid decimal Unicode string"; + /* the following variable is used for caching string comparisons + * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ + int known_errorHandler = -1; if (output == NULL) { PyErr_BadArgument(); return -1; } p = s; end = s + length; while (p < end) { ! 
register Py_UNICODE ch = *p; int decimal; + PyObject *repunicode; + int repsize; + int newpos; + Py_UNICODE *uni2; + Py_UNICODE *collstart; + Py_UNICODE *collend; if (Py_UNICODE_ISSPACE(ch)) { *output++ = ' '; + ++p; continue; } decimal = Py_UNICODE_TODECIMAL(ch); if (decimal >= 0) { *output++ = '0' + decimal; + ++p; continue; } if (0 < ch && ch < 256) { *output++ = (char)ch; + ++p; continue; } ! /* All other characters are considered unencodable */ ! collstart = p; ! collend = p+1; ! while (collend < end) { ! if ((0 < *collend && *collend < 256) || ! !Py_UNICODE_ISSPACE(*collend) || ! Py_UNICODE_TODECIMAL(*collend)) ! break; } ! /* cache callback name lookup ! * (if not done yet, i.e. it's the first error) */ ! if (known_errorHandler==-1) { ! if ((errors==NULL) || (!strcmp(errors, "strict"))) ! known_errorHandler = 1; ! else if (!strcmp(errors, "replace")) ! known_errorHandler = 2; ! else if (!strcmp(errors, "ignore")) ! known_errorHandler = 3; ! else if (!strcmp(errors, "xmlcharrefreplace")) ! known_errorHandler = 4; ! else ! known_errorHandler = 0; ! } ! switch (known_errorHandler) { ! case 1: /* strict */ ! raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); ! goto onError; ! case 2: /* replace */ ! for (p = collstart; p < collend; ++p) ! *output++ = '?'; ! /* fall through */ ! case 3: /* ignore */ ! p = collend; ! break; ! case 4: /* xmlcharrefreplace */ ! /* generate replacement (temporarily (mis)uses p) */ ! for (p = collstart; p < collend; ++p) ! output += sprintf(output, "&#%d;", (int)*p); ! p = collend; ! break; ! default: ! repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, ! encoding, reason, s, length, &exc, ! collstart-s, collend-s, &newpos); ! if (repunicode == NULL) ! goto onError; ! /* generate replacement */ ! repsize = PyUnicode_GET_SIZE(repunicode); ! for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { ! Py_UNICODE ch = *uni2; ! if (Py_UNICODE_ISSPACE(ch)) ! *output++ = ' '; ! 
else { ! decimal = Py_UNICODE_TODECIMAL(ch); ! if (decimal >= 0) ! *output++ = '0' + decimal; ! else if (0 < ch && ch < 256) ! *output++ = (char)ch; ! else { ! Py_DECREF(repunicode); ! raise_encode_exception(&exc, encoding, ! s, length, collstart-s, collend-s, reason); ! goto onError; ! } ! } ! } ! p = s + newpos; ! Py_DECREF(repunicode); } } /* 0-terminate the output string */ *output++ = '\0'; + Py_XDECREF(exc); + Py_XDECREF(errorHandler); return 0; onError: + Py_XDECREF(exc); + Py_XDECREF(errorHandler); return -1; } /* --- Helpers ------------------------------------------------------------ */ *************** *** 3863,3873 **** "S.encode([encoding[,errors]]) -> string\n\ \n\ Return an encoded string version of S. Default encoding is the current\n\ default string encoding. errors may be given to set a different error\n\ handling scheme. Default is 'strict' meaning that encoding errors raise\n\ ! a ValueError. Other possible values are 'ignore' and 'replace'."); static PyObject * unicode_encode(PyUnicodeObject *self, PyObject *args) { char *encoding = NULL; --- 4549,4561 ---- "S.encode([encoding[,errors]]) -> string\n\ \n\ Return an encoded string version of S. Default encoding is the current\n\ default string encoding. errors may be given to set a different error\n\ handling scheme. Default is 'strict' meaning that encoding errors raise\n\ ! a UnicodeError. Other possible values are 'ignore', 'replace' and\n\ ! 'xmlcharrefreplace' as well as any other name registered with\n\ ! 
codecs.register_error."); static PyObject * unicode_encode(PyUnicodeObject *self, PyObject *args) { char *encoding = NULL; Index: Python/codecs.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Python/codecs.c,v retrieving revision 2.14 diff -u -c -5 -r2.14 codecs.c *** Python/codecs.c 18 Jul 2002 23:06:17 -0000 2.14 --- Python/codecs.c 24 Jul 2002 18:52:24 -0000 *************** *** 420,435 **** --- 420,832 ---- Py_XDECREF(decoder); Py_XDECREF(result); return NULL; } + static PyObject *_PyCodec_ErrorRegistry; + + /* Register the error handling callback function error under the name + name. This function will be called by the codec when it encounters + an unencodable characters/undecodable bytes and doesn't know the + callback name, when name is specified as the error parameter + in the call to the encode/decode function. + Return 0 on success, -1 on error */ + int PyCodec_RegisterError(const char *name, PyObject *error) + { + if (!PyCallable_Check(error)) { + PyErr_SetString(PyExc_TypeError, "handler must be callable"); + return -1; + } + return PyDict_SetItemString( _PyCodec_ErrorRegistry, (char *)name, error); + } + + /* Lookup the error handling callback function registered under the + name error. As a special case NULL can be passed, in which case + the error handling callback for strict encoding will be returned. 
*/ + PyObject *PyCodec_LookupError(const char *name) + { + PyObject *handler = NULL; + + if (name==NULL) + name = "strict"; + handler = PyDict_GetItemString(_PyCodec_ErrorRegistry, (char *)name); + if (!handler) + PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name); + else + Py_INCREF(handler); + return handler; + } + + static void wrong_exception_type(PyObject *exc) + { + PyObject *type = PyObject_GetAttrString(exc, "__class__"); + if (type != NULL) { + PyObject *name = PyObject_GetAttrString(type, "__name__"); + Py_DECREF(type); + if (name != NULL) { + PyObject *string = PyObject_Str(name); + Py_DECREF(name); + PyErr_Format(PyExc_TypeError, "don't know how to handle %.400s in error callback", + PyString_AS_STRING(string)); + Py_DECREF(string); + } + } + } + + PyObject *PyCodec_StrictErrors(PyObject *exc) + { + if (PyInstance_Check(exc)) + PyErr_SetObject((PyObject*)((PyInstanceObject*)exc)->in_class, + exc); + else + PyErr_SetString(PyExc_TypeError, "codec must pass exception instance"); + return NULL; + } + + + PyObject *PyCodec_IgnoreErrors(PyObject *exc) + { + int end; + if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { + if (!PyUnicodeEncodeError_GetEnd(exc, &end)) + return NULL; + } + else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { + if (!PyUnicodeDecodeError_GetEnd(exc, &end)) + return NULL; + } + else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) { + if (!PyUnicodeTranslateError_GetEnd(exc, &end)) + return NULL; + } + else { + wrong_exception_type(exc); + return NULL; + } + /* ouch: passing NULL, 0, pos gives None instead of u'' */ + return Py_BuildValue("(u#i)", &end, 0, end); + } + + + PyObject *PyCodec_ReplaceErrors(PyObject *exc) + { + PyObject *restuple; + int start; + int end; + int i; + + if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { + PyObject *res; + Py_UNICODE *p; + if (!PyUnicodeEncodeError_GetStart(exc, &start)) + return NULL; + if (!PyUnicodeEncodeError_GetEnd(exc, &end)) 
+ return NULL; + res = PyUnicode_FromUnicode(NULL, end-start); + if (res == NULL) + return NULL; + for (p = PyUnicode_AS_UNICODE(res), i = start; + i0) { + *outp++ = '0' + c/base; + c %= base; + base /= 10; + } + *outp++ = ';'; + } + restuple = Py_BuildValue("(Oi)", res, end); + Py_DECREF(res); + Py_DECREF(object); + return restuple; + } + else { + wrong_exception_type(exc); + return NULL; + } + } + + static Py_UNICODE hexdigits[] = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' + }; + + PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) + { + if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { + PyObject *restuple; + PyObject *object; + int start; + int end; + PyObject *res; + Py_UNICODE *p; + Py_UNICODE *startp; + Py_UNICODE *outp; + int ressize; + if (!PyUnicodeEncodeError_GetStart(exc, &start)) + return NULL; + if (!PyUnicodeEncodeError_GetEnd(exc, &end)) + return NULL; + if (!(object = PyUnicodeEncodeError_GetObject(exc))) + return NULL; + startp = PyUnicode_AS_UNICODE(object); + for (p = startp+start, ressize = 0; p < startp+end; ++p) { + if (*p >= 0x00010000) + ressize += 1+1+8; + else if (*p >= 0x100) { + ressize += 1+1+4; + } + else + ressize += 1+1+2; + } + res = PyUnicode_FromUnicode(NULL, ressize); + if (res==NULL) + return NULL; + for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); + p < startp+end; ++p) { + Py_UNICODE c = *p; + *outp++ = '\\'; + if (c >= 0x00010000) { + *outp++ = 'U'; + *outp++ = hexdigits[(c>>28)&0xf]; + *outp++ = hexdigits[(c>>24)&0xf]; + *outp++ = hexdigits[(c>>20)&0xf]; + *outp++ = hexdigits[(c>>16)&0xf]; + *outp++ = hexdigits[(c>>12)&0xf]; + *outp++ = hexdigits[(c>>8)&0xf]; + } + else if (c >= 0x100) { + *outp++ = 'u'; + *outp++ = hexdigits[(c>>12)&0xf]; + *outp++ = hexdigits[(c>>8)&0xf]; + } + else + *outp++ = 'x'; + *outp++ = hexdigits[(c>>4)&0xf]; + *outp++ = hexdigits[c&0xf]; + } + + restuple = Py_BuildValue("(Oi)", res, end); + Py_DECREF(res); + Py_DECREF(object); + 
return restuple; + } + else { + wrong_exception_type(exc); + return NULL; + } + } + + static PyObject *strict_errors(PyObject *self, PyObject *exc) + { + return PyCodec_StrictErrors(exc); + } + + + static PyObject *ignore_errors(PyObject *self, PyObject *exc) + { + return PyCodec_IgnoreErrors(exc); + } + + + static PyObject *replace_errors(PyObject *self, PyObject *exc) + { + return PyCodec_ReplaceErrors(exc); + } + + + static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc) + { + return PyCodec_XMLCharRefReplaceErrors(exc); + } + + + static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc) + { + return PyCodec_BackslashReplaceErrors(exc); + } + + void _PyCodecRegistry_Init(void) { + static struct { + char *name; + PyMethodDef def; + } methods[] = + { + { + "strict", + { + "strict_errors", + strict_errors, + METH_O + } + }, + { + "ignore", + { + "ignore_errors", + ignore_errors, + METH_O + } + }, + { + "replace", + { + "replace_errors", + replace_errors, + METH_O + } + }, + { + "xmlcharrefreplace", + { + "xmlcharrefreplace_errors", + xmlcharrefreplace_errors, + METH_O + } + }, + { + "backslashreplace", + { + "backslashreplace_errors", + backslashreplace_errors, + METH_O + } + } + }; if (_PyCodec_SearchPath == NULL) _PyCodec_SearchPath = PyList_New(0); if (_PyCodec_SearchCache == NULL) _PyCodec_SearchCache = PyDict_New(); + if (_PyCodec_ErrorRegistry == NULL) { + int i; + _PyCodec_ErrorRegistry = PyDict_New(); + + if (_PyCodec_ErrorRegistry) { + for (i = 0; i < 5; ++i) { + PyObject *func = PyCFunction_New(&methods[i].def, NULL); + int res; + if (!func) + Py_FatalError("can't initialize codec error registry"); + res = PyCodec_RegisterError(methods[i].name, func); + Py_DECREF(func); + if (res) + Py_FatalError("can't initialize codec error registry"); + } + } + } if (_PyCodec_SearchPath == NULL || _PyCodec_SearchCache == NULL) Py_FatalError("can't initialize codec registry"); } *************** *** 437,442 **** --- 834,841 ---- { 
Py_XDECREF(_PyCodec_SearchPath); _PyCodec_SearchPath = NULL; Py_XDECREF(_PyCodec_SearchCache); _PyCodec_SearchCache = NULL; + Py_XDECREF(_PyCodec_ErrorRegistry); + _PyCodec_ErrorRegistry = NULL; } Index: Python/exceptions.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Python/exceptions.c,v retrieving revision 1.32 diff -u -c -5 -r1.32 exceptions.c *** Python/exceptions.c 13 Jun 2002 20:33:02 -0000 1.32 --- Python/exceptions.c 24 Jul 2002 18:52:25 -0000 *************** *** 98,107 **** --- 98,111 ---- | | +-- FloatingPointError\n\ | |\n\ | +-- ValueError\n\ | | |\n\ | | +-- UnicodeError\n\ + | | |\n\ + | | +-- UnicodeEncodeError\n\ + | | +-- UnicodeDecodeError\n\ + | | +-- UnicodeTranslateError\n\ | |\n\ | +-- ReferenceError\n\ | +-- SystemError\n\ | +-- MemoryError\n\ |\n\ *************** *** 837,846 **** --- 841,1434 ---- {"__str__", SyntaxError__str__, METH_VARARGS}, {NULL, NULL} }; + static + int get_int(PyObject *exc, const char *name, int *value) + { + PyObject *attr = PyObject_GetAttrString(exc, (char *)name); + + if (!attr) + return 0; + if (!PyInt_Check(attr)) { + PyErr_Format(PyExc_TypeError, "%s attribute must be int", name); + Py_DECREF(attr); + return 0; + } + *value = PyInt_AS_LONG(attr); + Py_DECREF(attr); + return -1; + } + + + static + int set_int(PyObject *exc, const char *name, int value) + { + PyObject *obj = PyInt_FromLong(value); + int result; + + if (!obj) + return 0; + result = PyObject_SetAttrString(exc, (char *)name, obj); + Py_DECREF(obj); + return result ? 
0 : -1; + } + + + static + PyObject *get_string(PyObject *exc, const char *name) + { + PyObject *attr = PyObject_GetAttrString(exc, (char *)name); + + if (!attr) + return NULL; + if (!PyString_Check(attr)) { + PyErr_Format(PyExc_TypeError, "%s attribute must be str", name); + Py_DECREF(attr); + return NULL; + } + return attr; + } + + + static + int set_string(PyObject *exc, const char *name, const char *value) + { + PyObject *obj = PyString_FromString(value); + int result; + + if (!obj) + return 0; + result = PyObject_SetAttrString(exc, (char *)name, obj); + Py_DECREF(obj); + return result ? 0 : -1; + } + + + static + PyObject *get_unicode(PyObject *exc, const char *name) + { + PyObject *attr = PyObject_GetAttrString(exc, (char *)name); + + if (!attr) + return NULL; + if (!PyUnicode_Check(attr)) { + PyErr_Format(PyExc_TypeError, "%s attribute must be unicode", name); + Py_DECREF(attr); + return NULL; + } + return attr; + } + + PyObject * PyUnicodeEncodeError_GetEncoding(PyObject *exc) + { + return get_string(exc, "encoding"); + } + + PyObject * PyUnicodeDecodeError_GetEncoding(PyObject *exc) + { + return get_string(exc, "encoding"); + } + + PyObject * PyUnicodeTranslateError_GetEncoding(PyObject *exc) + { + return get_string(exc, "encoding"); + } + + PyObject *PyUnicodeEncodeError_GetObject(PyObject *exc) + { + return get_unicode(exc, "object"); + } + + PyObject *PyUnicodeDecodeError_GetObject(PyObject *exc) + { + return get_string(exc, "object"); + } + + PyObject *PyUnicodeTranslateError_GetObject(PyObject *exc) + { + return get_unicode(exc, "object"); + } + + int PyUnicodeEncodeError_GetStart(PyObject *exc, int *start) + { + if (get_int(exc, "start", start)) { + PyObject *object = PyUnicodeEncodeError_GetObject(exc); + int size; + if (!object) + return 0; + size = PyUnicode_GET_SIZE(object); + if (*start<0) + *start = 0; + if (*start>=size) + *start = size-1; + Py_DECREF(object); + return -1; + } + return 0; + } + + + int PyUnicodeDecodeError_GetStart(PyObject 
*exc, int *start) + { + if (get_int(exc, "start", start)) { + PyObject *object = PyUnicodeDecodeError_GetObject(exc); + int size; + if (!object) + return 0; + size = PyString_GET_SIZE(object); + if (*start<0) + *start = 0; + if (*start>=size) + *start = size-1; + Py_DECREF(object); + return -1; + } + return 0; + } + + + int PyUnicodeTranslateError_GetStart(PyObject *exc, int *start) + { + return PyUnicodeEncodeError_GetStart(exc, start); + } + + + int PyUnicodeEncodeError_SetStart(PyObject *exc, int start) + { + return set_int(exc, "start", start); + } + + + int PyUnicodeDecodeError_SetStart(PyObject *exc, int start) + { + return set_int(exc, "start", start); + } + + + int PyUnicodeTranslateError_SetStart(PyObject *exc, int start) + { + return set_int(exc, "start", start); + } + + + int PyUnicodeEncodeError_GetEnd(PyObject *exc, int *end) + { + if (get_int(exc, "end", end)) { + PyObject *object = PyUnicodeEncodeError_GetObject(exc); + int size; + if (!object) + return 0; + size = PyUnicode_GET_SIZE(object); + if (*end<1) + *end = 1; + if (*end>size) + *end = size; + Py_DECREF(object); + return -1; + } + return 0; + } + + + int PyUnicodeDecodeError_GetEnd(PyObject *exc, int *end) + { + if (get_int(exc, "end", end)) { + PyObject *object = PyUnicodeDecodeError_GetObject(exc); + int size; + if (!object) + return 0; + size = PyString_GET_SIZE(object); + if (*end<1) + *end = 1; + if (*end>size) + *end = size; + Py_DECREF(object); + return -1; + } + return 0; + } + + + int PyUnicodeTranslateError_GetEnd(PyObject *exc, int *start) + { + return PyUnicodeEncodeError_GetEnd(exc, start); + } + + + int PyUnicodeEncodeError_SetEnd(PyObject *exc, int end) + { + return set_int(exc, "end", end); + } + + + int PyUnicodeDecodeError_SetEnd(PyObject *exc, int end) + { + return set_int(exc, "end", end); + } + + + int PyUnicodeTranslateError_SetEnd(PyObject *exc, int end) + { + return set_int(exc, "end", end); + } + + + PyObject *PyUnicodeEncodeError_GetReason(PyObject *exc) + { + return 
get_string(exc, "reason"); + } + + + PyObject *PyUnicodeDecodeError_GetReason(PyObject *exc) + { + return get_string(exc, "reason"); + } + + + PyObject *PyUnicodeTranslateError_GetReason(PyObject *exc) + { + return get_string(exc, "reason"); + } + + + int PyUnicodeEncodeError_SetReason(PyObject *exc, const char *reason) + { + return set_string(exc, "reason", reason); + } + + + int PyUnicodeDecodeError_SetReason(PyObject *exc, const char *reason) + { + return set_string(exc, "reason", reason); + } + + + int PyUnicodeTranslateError_SetReason(PyObject *exc, const char *reason) + { + return set_string(exc, "reason", reason); + } + + + static PyObject * + UnicodeError__init__(PyObject *self, PyObject *args, PyTypeObject *objecttype) + { + PyObject *rtnval = NULL; + PyObject *encoding; + PyObject *object; + PyObject *start; + PyObject *end; + PyObject *reason; + + if (!(self = get_self(args))) + return NULL; + + if (!(args = PySequence_GetSlice(args, 1, PySequence_Size(args)))) + return NULL; + + if (!PyArg_ParseTuple(args, "O!O!O!O!O!", + &PyString_Type, &encoding, + objecttype, &object, + &PyInt_Type, &start, + &PyInt_Type, &end, + &PyString_Type, &reason)) + return NULL; + + if (PyObject_SetAttrString(self, "args", args)) + goto finally; + + if (PyObject_SetAttrString(self, "encoding", encoding)) + goto finally; + if (PyObject_SetAttrString(self, "object", object)) + goto finally; + if (PyObject_SetAttrString(self, "start", start)) + goto finally; + if (PyObject_SetAttrString(self, "end", end)) + goto finally; + if (PyObject_SetAttrString(self, "reason", reason)) + goto finally; + + Py_INCREF(Py_None); + rtnval = Py_None; + + finally: + Py_DECREF(args); + return rtnval; + } + + + static PyObject * + UnicodeEncodeError__init__(PyObject *self, PyObject *args) + { + return UnicodeError__init__(self, args, &PyUnicode_Type); + } + + static PyObject * + UnicodeEncodeError__str__(PyObject *self, PyObject *arg) + { + PyObject *encodingObj = NULL; + PyObject *objectObj = NULL; 
+ int length; + int start; + int end; + PyObject *reasonObj = NULL; + char buffer[1000]; + PyObject *result = NULL; + + self = arg; + + if (!(encodingObj = PyUnicodeEncodeError_GetEncoding(self))) + goto error; + + if (!(objectObj = PyUnicodeEncodeError_GetObject(self))) + goto error; + + length = PyUnicode_GET_SIZE(objectObj); + + if (!PyUnicodeEncodeError_GetStart(self, &start)) + goto error; + + if (!PyUnicodeEncodeError_GetEnd(self, &end)) + goto error; + + if (!(reasonObj = PyUnicodeEncodeError_GetReason(self))) + goto error; + + if (end==start+1) { + PyOS_snprintf(buffer, sizeof(buffer), + "'%.400s' codec can't encode character '\\u%x' in position %d: %.400s", + PyString_AS_STRING(encodingObj), + (int)PyUnicode_AS_UNICODE(objectObj)[start], + start, + PyString_AS_STRING(reasonObj) + ); + } + else { + PyOS_snprintf(buffer, sizeof(buffer), + "'%.400s' codec can't encode characters in position %d-%d: %.400s", + PyString_AS_STRING(encodingObj), + start, + end-1, + PyString_AS_STRING(reasonObj) + ); + } + result = PyString_FromString(buffer); + + error: + Py_XDECREF(reasonObj); + Py_XDECREF(objectObj); + Py_XDECREF(encodingObj); + return result; + } + + static PyMethodDef UnicodeEncodeError_methods[] = { + {"__init__", UnicodeEncodeError__init__, METH_VARARGS}, + {"__str__", UnicodeEncodeError__str__, METH_O}, + {NULL, NULL} + }; + + + PyObject * PyUnicodeEncodeError_Create( + const char *encoding, const Py_UNICODE *object, int length, + int start, int end, const char *reason) + { + return PyObject_CallFunction(PyExc_UnicodeEncodeError, "su#iis", + encoding, object, length, start, end, reason); + } + + + static PyObject * + UnicodeDecodeError__init__(PyObject *self, PyObject *args) + { + return UnicodeError__init__(self, args, &PyString_Type); + } + + static PyObject * + UnicodeDecodeError__str__(PyObject *self, PyObject *arg) + { + PyObject *encodingObj = NULL; + PyObject *objectObj = NULL; + int length; + int start; + int end; + PyObject *reasonObj = NULL; + 
char buffer[1000]; + PyObject *result = NULL; + + self = arg; + + if (!(encodingObj = PyUnicodeDecodeError_GetEncoding(self))) + goto error; + + if (!(objectObj = PyUnicodeDecodeError_GetObject(self))) + goto error; + + length = PyString_GET_SIZE(objectObj); + + if (!PyUnicodeDecodeError_GetStart(self, &start)) + goto error; + + if (!PyUnicodeDecodeError_GetEnd(self, &end)) + goto error; + + if (!(reasonObj = PyUnicodeDecodeError_GetReason(self))) + goto error; + + if (end==start+1) { + PyOS_snprintf(buffer, sizeof(buffer), + "'%.400s' codec can't decode byte 0x%x in position %d: %.400s", + PyString_AS_STRING(encodingObj), + ((int)PyString_AS_STRING(objectObj)[start])&0xff, + start, + PyString_AS_STRING(reasonObj) + ); + } + else { + PyOS_snprintf(buffer, sizeof(buffer), + "'%.400s' codec can't decode bytes in position %d-%d: %.400s", + PyString_AS_STRING(encodingObj), + start, + end-1, + PyString_AS_STRING(reasonObj) + ); + } + result = PyString_FromString(buffer); + + error: + Py_XDECREF(reasonObj); + Py_XDECREF(objectObj); + Py_XDECREF(encodingObj); + return result; + } + + static PyMethodDef UnicodeDecodeError_methods[] = { + {"__init__", UnicodeDecodeError__init__, METH_VARARGS}, + {"__str__", UnicodeDecodeError__str__, METH_O}, + {NULL, NULL} + }; + + + PyObject * PyUnicodeDecodeError_Create( + const char *encoding, const char *object, int length, + int start, int end, const char *reason) + { + return PyObject_CallFunction(PyExc_UnicodeDecodeError, "ss#iis", + encoding, object, length, start, end, reason); + } + + + static PyObject * + UnicodeTranslateError__init__(PyObject *self, PyObject *args) + { + PyObject *rtnval = NULL; + PyObject *object; + PyObject *start; + PyObject *end; + PyObject *reason; + + if (!(self = get_self(args))) + return NULL; + + if (!(args = PySequence_GetSlice(args, 1, PySequence_Size(args)))) + return NULL; + + if (!PyArg_ParseTuple(args, "O!O!O!O!", + &PyUnicode_Type, &object, + &PyInt_Type, &start, + &PyInt_Type, &end, + 
&PyString_Type, &reason)) + goto finally; + + if (PyObject_SetAttrString(self, "args", args)) + goto finally; + + if (PyObject_SetAttrString(self, "object", object)) + goto finally; + if (PyObject_SetAttrString(self, "start", start)) + goto finally; + if (PyObject_SetAttrString(self, "end", end)) + goto finally; + if (PyObject_SetAttrString(self, "reason", reason)) + goto finally; + + Py_INCREF(Py_None); + rtnval = Py_None; + + finally: + Py_DECREF(args); + return rtnval; + } + + + static PyObject * + UnicodeTranslateError__str__(PyObject *self, PyObject *arg) + { + PyObject *objectObj = NULL; + int length; + int start; + int end; + PyObject *reasonObj = NULL; + char buffer[1000]; + PyObject *result = NULL; + + self = arg; + + if (!(objectObj = PyUnicodeTranslateError_GetObject(self))) + goto error; + + length = PyUnicode_GET_SIZE(objectObj); + + if (!PyUnicodeTranslateError_GetStart(self, &start)) + goto error; + + if (!PyUnicodeTranslateError_GetEnd(self, &end)) + goto error; + + if (!(reasonObj = PyUnicodeTranslateError_GetReason(self))) + goto error; + + if (end==start+1) { + PyOS_snprintf(buffer, sizeof(buffer), + "can't translate character '\\u%x' in position %d: %.400s", + (int)PyUnicode_AS_UNICODE(objectObj)[start], + start, + PyString_AS_STRING(reasonObj) + ); + } + else { + PyOS_snprintf(buffer, sizeof(buffer), + "can't translate characters in position %d-%d: %.400s", + start, + end-1, + PyString_AS_STRING(reasonObj) + ); + } + result = PyString_FromString(buffer); + + error: + Py_XDECREF(reasonObj); + Py_XDECREF(objectObj); + return result; + } + + static PyMethodDef UnicodeTranslateError_methods[] = { + {"__init__", UnicodeTranslateError__init__, METH_VARARGS}, + {"__str__", UnicodeTranslateError__str__, METH_O}, + {NULL, NULL} + }; + + + PyObject * PyUnicodeTranslateError_Create( + const Py_UNICODE *object, int length, + int start, int end, const char *reason) + { + return PyObject_CallFunction(PyExc_UnicodeTranslateError, "u#iis", + object, length, 
start, end, reason); + } + + /* Exception doc strings */ PyDoc_STRVAR(AssertionError__doc__, "Assertion failed."); *************** *** 862,871 **** --- 1450,1465 ---- PyDoc_STRVAR(ValueError__doc__, "Inappropriate argument value (of correct type)."); PyDoc_STRVAR(UnicodeError__doc__, "Unicode related error."); + PyDoc_STRVAR(UnicodeEncodeError__doc__, "Unicode encoding error."); + + PyDoc_STRVAR(UnicodeDecodeError__doc__, "Unicode decoding error."); + + PyDoc_STRVAR(UnicodeTranslateError__doc__, "Unicode translation error."); + PyDoc_STRVAR(SystemError__doc__, "Internal error in the Python interpreter.\n\ \n\ Please report this to the Python maintainer, along with the traceback,\n\ the Python version, and the hardware/OS platform and version."); *************** *** 942,951 **** --- 1536,1548 ---- PyObject *PyExc_ReferenceError; PyObject *PyExc_SystemError; PyObject *PyExc_SystemExit; PyObject *PyExc_UnboundLocalError; PyObject *PyExc_UnicodeError; + PyObject *PyExc_UnicodeEncodeError; + PyObject *PyExc_UnicodeDecodeError; + PyObject *PyExc_UnicodeTranslateError; PyObject *PyExc_TypeError; PyObject *PyExc_ValueError; PyObject *PyExc_ZeroDivisionError; #ifdef MS_WINDOWS PyObject *PyExc_WindowsError; *************** *** 1027,1036 **** --- 1624,1639 ---- ZeroDivisionError__doc__}, {"FloatingPointError", &PyExc_FloatingPointError, &PyExc_ArithmeticError, FloatingPointError__doc__}, {"ValueError", &PyExc_ValueError, 0, ValueError__doc__}, {"UnicodeError", &PyExc_UnicodeError, &PyExc_ValueError, UnicodeError__doc__}, + {"UnicodeEncodeError", &PyExc_UnicodeEncodeError, &PyExc_UnicodeError, + UnicodeEncodeError__doc__, UnicodeEncodeError_methods}, + {"UnicodeDecodeError", &PyExc_UnicodeDecodeError, &PyExc_UnicodeError, + UnicodeDecodeError__doc__, UnicodeDecodeError_methods}, + {"UnicodeTranslateError", &PyExc_UnicodeTranslateError, &PyExc_UnicodeError, + UnicodeTranslateError__doc__, UnicodeTranslateError_methods}, {"ReferenceError", &PyExc_ReferenceError, 0, 
ReferenceError__doc__}, {"SystemError", &PyExc_SystemError, 0, SystemError__doc__}, {"MemoryError", &PyExc_MemoryError, 0, MemoryError__doc__}, /* Warning categories */ {"Warning", &PyExc_Warning, &PyExc_Exception, Warning__doc__},