? Include/.codecs.h.swp ? Include/.unicodeobject.h.swp ? Modules/._codecsmodule.c.swp ? Objects/.unicodeobject.c.swp ? Python/.codecs.c.swp Index: Include/codecs.h =================================================================== RCS file: /cvsroot/python/python/dist/src/Include/codecs.h,v retrieving revision 2.3 diff -u -r2.3 codecs.h --- Include/codecs.h 2000/08/03 16:24:24 2.3 +++ Include/codecs.h 2001/07/12 10:50:32 @@ -53,12 +53,18 @@ object is passed through the encoder function found for the given encoding using the error handling method defined by errors. errors - may be NULL to use the default method defined for the codec. + may be NULL to use the strict encoding. Raises a LookupError in case no encoder can be found. */ +extern DL_IMPORT(PyObject *) PyCodec_EncodeEx( + PyObject *object, + const char *encoding, + PyObject *errors + ); + extern DL_IMPORT(PyObject *) PyCodec_Encode( PyObject *object, const char *encoding, @@ -111,11 +117,52 @@ /* Get a StreamWriter factory function for the given encoding. */ +extern DL_IMPORT(PyObject *) PyCodec_StreamWriterEx( + const char *encoding, + PyObject *stream, + PyObject *errors + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject *) PyCodec_StreamWriter( const char *encoding, PyObject *stream, const char *errors ); + +/* Return a new reference to one of the builtin unicode + encode error handlers. 
error can be: + + * NULL, Py_None, "strict" or u"strict" for + codecs.raise_unicodeencode_errors + * "ignore" or u"ignore" for codecs.ignore_unicodeencode_errors + * "replace" or u"replace" for codecs.replace_unicodeencode_errors + * a callable which will be returned directly + + everything else will raise an exception */ +extern DL_IMPORT(PyObject *) PyCodec_UnicodeEncodeHandlerForObject(PyObject *error); + +/* Raises a Unicode exception */ +extern DL_IMPORT(void) PyCodec_RaiseUnicodeEncodeError(const char *encoding, Py_UNICODE c, int pos); + +/* Encode error handler that raises an exception */ +extern DL_IMPORT(PyObject *) PyCodec_RaiseUnicodeEncodeErrors(PyObject *self, PyObject *args); + +/* Encode error handler that returns an empty string and so ignores the + unencodable character */ +extern DL_IMPORT(PyObject *) PyCodec_IgnoreUnicodeEncodeErrors(PyObject *self, PyObject *args); + +/* Encode error handler that returns a Unicode replacement character + that will be used by the codec to replace the unencodable character */ +extern DL_IMPORT(PyObject *) PyCodec_ReplaceUnicodeEncodeErrors(PyObject *self, PyObject *args); + +/* Encode error handler that returns an XML character reference for the + unencodable character */ +extern DL_IMPORT(PyObject *) PyCodec_XMLCharRefReplaceUnicodeEncodeErrors(PyObject *self, PyObject *args); + +/* Encode error handler that returns an \u (or \U) escape sequence + for the unencodable character */ +extern DL_IMPORT(PyObject *) PyCodec_EscapeReplaceUnicodeEncodeErrors(PyObject *self, PyObject *args); #ifdef __cplusplus } Index: Include/unicodeobject.h =================================================================== RCS file: /cvsroot/python/python/dist/src/Include/unicodeobject.h,v retrieving revision 2.27 diff -u -r2.27 unicodeobject.h --- Include/unicodeobject.h 2001/06/27 22:08:26 2.27 +++ Include/unicodeobject.h 2001/07/12 10:50:33 @@ -419,9 +419,15 @@ const char *errors /* error handling */ ); -/* Encodes a Py_UNICODE buffer
of the given size and returns a - Python string object. */ +/* Encodes a Unicode object and returns a Python string object. */ +extern DL_IMPORT(PyObject*) PyUnicode_EncodeEx( + PyObject *unicode, /* Unicode object */ + const char *encoding, /* encoding */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_Encode( const Py_UNICODE *s, /* Unicode char buffer */ int size, /* number of Py_UNICODE chars to encode */ @@ -429,9 +435,12 @@ const char *errors /* error handling */ ); +#define PyUnicode_AsEncodedStringEx PyUnicode_EncodeEx + /* Encodes a Unicode object and returns the result as Python string object. */ +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString( PyObject *unicode, /* Unicode object */ const char *encoding, /* encoding */ @@ -450,6 +459,12 @@ PyObject *unicode /* Unicode object */ ); +extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8Ex( + PyObject *unicode, /* Unicode object */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8( const Py_UNICODE *data, /* Unicode char buffer */ int length, /* number of Py_UNICODE chars to encode */ @@ -517,6 +532,13 @@ */ +extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16Ex( + PyObject *unicode, /* Unicode object */ + PyObject *errors, /* error handling */ + int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16( const Py_UNICODE *data, /* Unicode char buffer */ int length, /* number of Py_UNICODE chars to encode */ @@ -536,6 +558,10 @@ PyObject *unicode /* Unicode object */ ); +extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscapeEx( + PyObject *unicode /* Unicode object */ + ); + extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape( const Py_UNICODE *data, /* Unicode char buffer */ int length /* Number of Py_UNICODE chars to encode */ @@ -549,7 +575,7 @@ const char *errors /* error handling */ 
); -extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString( +extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscapeEx( PyObject *unicode /* Unicode object */ ); @@ -558,6 +584,8 @@ int length /* Number of Py_UNICODE chars to encode */ ); +#define PyUnicode_AsRawUnicodeEscapeString PyUnicode_EncodeRawUnicodeEscapeEx + /* --- Latin-1 Codecs ----------------------------------------------------- Note: Latin-1 corresponds to the first 256 Unicode ordinals. @@ -574,6 +602,12 @@ PyObject *unicode /* Unicode object */ ); +extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1Ex( + PyObject *unicode, /* Unicode object */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1( const Py_UNICODE *data, /* Unicode char buffer */ int length, /* Number of Py_UNICODE chars to encode */ @@ -596,6 +630,12 @@ PyObject *unicode /* Unicode object */ ); +extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCIIEx( + PyObject *unicode, /* Unicode object */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII( const Py_UNICODE *data, /* Unicode char buffer */ int length, /* Number of Py_UNICODE chars to encode */ @@ -638,6 +678,14 @@ (unicode ordinal -> char ordinal) */ ); +extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmapEx( + PyObject *unicode, /* Unicode object */ + PyObject *mapping, /* character mapping + (unicode ordinal -> char ordinal) */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap( const Py_UNICODE *data, /* Unicode char buffer */ int length, /* Number of Py_UNICODE chars to encode */ @@ -680,6 +728,12 @@ PyObject *unicode /* Unicode object */ ); +extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCSEx( + const PyObject *unicode, /* Unicode object */ + const PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS( const Py_UNICODE
*data, /* Unicode char buffer */ int length, /* Number of Py_UNICODE chars to encode */ @@ -694,7 +748,8 @@ an output buffer using standard ASCII digit codes. The output buffer has to provide at least length+1 bytes of storage - area. The output string is 0-terminated. + area (more if longer replacement strings are generated). + The output string is 0-terminated. The encoder converts whitespace to ' ', decimal characters to their corresponding ASCII digit and all other Latin-1 characters except @@ -703,15 +758,23 @@ Error handling is defined by the errors argument: - NULL or "strict": raise a ValueError - "ignore": ignore the wrong characters (these are not copied to the - output buffer) - "replace": replaces illegal characters with '?' + NULL, None, "strict" or u"strict": raise a UnicodeError + "ignore" or u"ignore": ignore the wrong characters (these are + not copied to the output buffer) + "replace" or u"replace": replaces illegal characters with '?' + callable object: use what the object returns as replacement Returns 0 on success, -1 on failure.
*/ +extern DL_IMPORT(int) PyUnicode_EncodeDecimalEx( + PyObject *unicode, /* Unicode object */ + char *output, /* Output buffer; must have size >= length */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(int) PyUnicode_EncodeDecimal( Py_UNICODE *s, /* Unicode buffer */ int length, /* Number of Py_UNICODE chars to encode */ @@ -772,7 +835,7 @@ */ extern DL_IMPORT(PyObject *) PyUnicode_Translate( - PyObject *str, /* String */ + PyObject *str, /* String */ PyObject *table, /* Translate table */ const char *errors /* error handling */ ); Index: Lib/codecs.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/codecs.py,v retrieving revision 1.19 diff -u -r1.19 codecs.py --- Lib/codecs.py 2001/05/29 06:06:54 1.19 +++ Lib/codecs.py 2001/07/12 10:50:33 @@ -51,13 +51,17 @@ The .encode()/.decode() methods may implement different error handling schemes by providing the errors argument. These - string values are defined: + values are defined: - 'strict' - raise a ValueError error (or a subclass) + None or 'strict' - raise a UnicodeError error (or a subclass) 'ignore' - ignore the character and continue with the next 'replace' - replace with a suitable replacement character; - Python will use the official U+FFFD REPLACEMENT - CHARACTER for the builtin Unicode codecs. + Python will use the official U+FFFD REPLACEMENT + CHARACTER for the builtin Unicode codecs. + callable object - call the object with the arguments + encoding name, character, position + and encode the unicode object returned + instead of the original character. """ def encode(self, input, errors='strict'): @@ -66,7 +70,7 @@ object, length consumed). errors defines the error handling to apply. It defaults to - 'strict' handling. + strict handling. The method may not store state in the Codec instance. 
Use StreamCodec for codecs which have to keep state in order to @@ -122,9 +126,15 @@ schemes by providing the errors keyword argument. These parameters are defined: - 'strict' - raise a ValueError (or a subclass) - 'ignore' - ignore the character and continue with the next - 'replace'- replace with a suitable replacement character + None or 'strict' - raise a UnicodeError error (or a subclass) + 'ignore' - ignore the character and continue with the next + 'replace' - replace with a suitable replacement character; + Python will use the official U+FFFD REPLACEMENT + CHARACTER for the builtin Unicode codecs. + callable object - call the object with the arguments + encoding name, character, position + and encode the unicode object returned + instead of the original character. """ self.stream = stream Index: Lib/encodings/base64_codec.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/encodings/base64_codec.py,v retrieving revision 1.1 diff -u -r1.1 base64_codec.py --- Lib/encodings/base64_codec.py 2001/05/15 12:00:02 1.1 +++ Lib/encodings/base64_codec.py 2001/07/12 10:50:34 @@ -10,17 +10,15 @@ ### Codec APIs -def base64_encode(input,errors='strict'): +def base64_encode(input,errors=None): """ Encodes the object input and returns a tuple (output object, length consumed). - errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported - error handling for this codec. + errors defines the error handling to apply. As there are + no unencodable characters, errors will be ignored. """ - assert errors == 'strict' output = base64.encodestring(input) return (output, len(input)) @@ -34,7 +32,7 @@ mapped files are examples of objects providing this slot. errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported + strict handling which is the only currently supported error handling for this codec.
""" Index: Lib/encodings/hex_codec.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/encodings/hex_codec.py,v retrieving revision 1.1 diff -u -r1.1 hex_codec.py --- Lib/encodings/hex_codec.py 2001/05/15 12:00:02 1.1 +++ Lib/encodings/hex_codec.py 2001/07/12 10:50:34 @@ -10,17 +10,15 @@ ### Codec APIs -def hex_encode(input,errors='strict'): +def hex_encode(input,errors=None): """ Encodes the object input and returns a tuple (output object, length consumed). - errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported - error handling for this codec. + errors defines the error handling to apply. As there + are no unencodable characters, errors will be ignored. """ - assert errors == 'strict' output = binascii.b2a_hex(input) return (output, len(input)) @@ -34,7 +32,7 @@ mapped files are examples of objects providing this slot. errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported + strict handling which is the only currently supported error handling for this codec. """ Index: Lib/encodings/quopri_codec.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/encodings/quopri_codec.py,v retrieving revision 1.1 diff -u -r1.1 quopri_codec.py --- Lib/encodings/quopri_codec.py 2001/05/15 15:34:07 1.1 +++ Lib/encodings/quopri_codec.py 2001/07/12 10:50:34 @@ -9,15 +9,12 @@ except ImportError: from StringIO import StringIO -def quopri_encode(input, errors='strict'): +def quopri_encode(input, errors=None): """Encode the input, returning a tuple (output object, length consumed). - errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported - error handling for this codec. - + errors defines the error handling to apply. 
As there are + no unencodable characters for quopri, errors will be ignored. """ - assert errors == 'strict' f = StringIO(input) g = StringIO() quopri.encode(f, g, 1) Index: Lib/encodings/uu_codec.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/encodings/uu_codec.py,v retrieving revision 1.1 diff -u -r1.1 uu_codec.py --- Lib/encodings/uu_codec.py 2001/05/15 12:00:02 1.1 +++ Lib/encodings/uu_codec.py 2001/07/12 10:50:34 @@ -12,17 +12,17 @@ ### Codec APIs -def uu_encode(input,errors='strict',filename='',mode=0666): +def uu_encode(input,errors=None,filename='',mode=0666): """ Encodes the object input and returns a tuple (output object, length consumed). errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported + strict handling which is the only currently supported error handling for this codec. """ - assert errors == 'strict' + assert errors is None or errors == "strict" or errors == codecs.raise_unicodeencode_errors from cStringIO import StringIO from binascii import b2a_uu infile = StringIO(input) @@ -50,7 +50,7 @@ mapped files are examples of objects providing this slot. errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported + strict handling which is the only currently supported error handling for this codec.
Note: filename and file mode information in the input data is Index: Lib/encodings/zlib_codec.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/encodings/zlib_codec.py,v retrieving revision 1.1 diff -u -r1.1 zlib_codec.py --- Lib/encodings/zlib_codec.py 2001/05/15 12:00:02 1.1 +++ Lib/encodings/zlib_codec.py 2001/07/12 10:50:35 @@ -11,17 +11,15 @@ ### Codec APIs -def zlib_encode(input,errors='strict'): +def zlib_encode(input,errors=None): """ Encodes the object input and returns a tuple (output object, length consumed). - errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported - error handling for this codec. + errors defines the error handling to apply. As there + are no unencodable characters errors will be ignored. """ - assert errors == 'strict' output = zlib.compress(input) return (output, len(input)) @@ -35,7 +33,7 @@ mapped files are examples of objects providing this slot. errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported + strict handling which is the only currently supported error handling for this codec. 
""" Index: Modules/_codecsmodule.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Modules/_codecsmodule.c,v retrieving revision 2.8 diff -u -r2.8 _codecsmodule.c --- Modules/_codecsmodule.c 2001/06/26 15:11:00 2.8 +++ Modules/_codecsmodule.c 2001/07/12 10:50:39 @@ -327,9 +327,9 @@ { const char *data; int size; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "s#|z:readbuffer_encode", + if (!PyArg_ParseTuple(args, "s#|O:readbuffer_encode", &data, &size, &errors)) return NULL; @@ -343,9 +343,9 @@ { const char *data; int size; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "t#|z:charbuffer_encode", + if (!PyArg_ParseTuple(args, "t#|O:charbuffer_encode", &data, &size, &errors)) return NULL; @@ -358,11 +358,11 @@ PyObject *args) { PyObject *obj; - const char *errors = NULL; + PyObject *errors = NULL; const char *data; int size; - if (!PyArg_ParseTuple(args, "O|z:unicode_internal_encode", + if (!PyArg_ParseTuple(args, "O|O:unicode_internal_encode", &obj, &errors)) return NULL; @@ -382,22 +382,21 @@ static PyObject * utf_8_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|z:utf_8_encode", + if (!PyArg_ParseTuple(args, "O|O:utf_8_encode", &str, &errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeUTF8Ex(str, errors), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } @@ -411,172 +410,161 @@ static PyObject * utf_16_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; int byteorder = 0; - if (!PyArg_ParseTuple(args, 
"O|zi:utf_16_encode", + if (!PyArg_ParseTuple(args, "O|Oi:utf_16_encode", &str, &errors, &byteorder)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors, - byteorder), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeUTF16Ex(str, errors, byteorder), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } static PyObject * utf_16_le_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|z:utf_16_le_encode", + if (!PyArg_ParseTuple(args, "O|O:utf_16_le_encode", &str, &errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors, - -1), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeUTF16Ex(str, errors, -1), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } static PyObject * utf_16_be_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|z:utf_16_be_encode", + if (!PyArg_ParseTuple(args, "O|O:utf_16_be_encode", &str, &errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors, - +1), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeUTF16Ex(str, errors, +1), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } static PyObject * unicode_escape_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|z:unicode_escape_encode", + if (!PyArg_ParseTuple(args, "O|O:unicode_escape_encode", &str, &errors)) 
return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str)), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeUnicodeEscapeEx(str), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } static PyObject * raw_unicode_escape_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|z:raw_unicode_escape_encode", + if (!PyArg_ParseTuple(args, "O|O:raw_unicode_escape_encode", &str, &errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeRawUnicodeEscape( - PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str)), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeRawUnicodeEscapeEx(str), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } static PyObject * latin_1_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|z:latin_1_encode", + if (!PyArg_ParseTuple(args, "O|O:latin_1_encode", &str, &errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeLatin1( - PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeLatin1Ex(str, errors), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } static PyObject * ascii_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|z:ascii_encode", + if (!PyArg_ParseTuple(args, "O|O:ascii_encode", &str, &errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeASCII( - 
PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeASCIIEx(str, errors), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } static PyObject * charmap_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; PyObject *mapping = NULL; - if (!PyArg_ParseTuple(args, "O|zO:charmap_encode", + if (!PyArg_ParseTuple(args, "O|OO:charmap_encode", &str, &errors, &mapping)) return NULL; if (mapping == Py_None) @@ -585,12 +573,9 @@ str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeCharmap( - PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - mapping, - errors), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeCharmapEx(str, mapping, errors), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } @@ -602,20 +587,18 @@ PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|z:mbcs_encode", + if (!PyArg_ParseTuple(args, "O|O:mbcs_encode", &str, &errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeMBCS( - PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeMBCSEx(str, errors), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } @@ -654,7 +637,12 @@ {"mbcs_encode", mbcs_encode, 1}, {"mbcs_decode", mbcs_decode, 1}, #endif - {NULL, NULL} /* sentinel */ + {"raise_unicodeencode_errors", PyCodec_RaiseUnicodeEncodeErrors, 1}, + {"ignore_unicodeencode_errors", PyCodec_IgnoreUnicodeEncodeErrors, 1}, + {"replace_unicodeencode_errors", PyCodec_ReplaceUnicodeEncodeErrors, 1}, + {"xmlcharrefreplace_unicodeencode_errors", PyCodec_XMLCharRefReplaceUnicodeEncodeErrors, 1}, + {"escapereplace_unicodeencode_errors", PyCodec_EscapeReplaceUnicodeEncodeErrors, 
1}, + {NULL, NULL} /* sentinel */ }; DL_EXPORT(void) Index: Objects/unicodeobject.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Objects/unicodeobject.c,v retrieving revision 2.101 diff -u -r2.101 unicodeobject.c --- Objects/unicodeobject.c 2001/06/27 18:59:43 2.101 +++ Objects/unicodeobject.c 2001/07/12 10:50:43 @@ -509,47 +509,30 @@ return NULL; } -PyObject *PyUnicode_Encode(const Py_UNICODE *s, - int size, - const char *encoding, - const char *errors) +PyObject *PyUnicode_EncodeEx(PyObject *unicode, + const char *encoding, + PyObject *errors) { - PyObject *v, *unicode; - - unicode = PyUnicode_FromUnicode(s, size); - if (unicode == NULL) - return NULL; - v = PyUnicode_AsEncodedString(unicode, encoding, errors); - Py_DECREF(unicode); - return v; -} - -PyObject *PyUnicode_AsEncodedString(PyObject *unicode, - const char *encoding, - const char *errors) -{ PyObject *v; - + if (!PyUnicode_Check(unicode)) { PyErr_BadArgument(); goto onError; } - if (encoding == NULL) + if (encoding == NULL) encoding = PyUnicode_GetDefaultEncoding(); /* Shortcuts for common default encodings */ - if (errors == NULL) { - if (strcmp(encoding, "utf-8") == 0) - return PyUnicode_AsUTF8String(unicode); - else if (strcmp(encoding, "latin-1") == 0) - return PyUnicode_AsLatin1String(unicode); - else if (strcmp(encoding, "ascii") == 0) - return PyUnicode_AsASCIIString(unicode); - } + if (strcmp(encoding, "utf-8") == 0) + return PyUnicode_EncodeUTF8Ex(unicode, errors); + else if ((strcmp(encoding, "latin-1") == 0) || (strcmp(encoding, "iso-8859-1") == 0)) + return PyUnicode_EncodeLatin1Ex(unicode, errors); + else if (strcmp(encoding, "ascii") == 0) + return PyUnicode_EncodeASCIIEx(unicode, errors); /* Encode via the codec registry */ - v = PyCodec_Encode(unicode, encoding, errors); + v = PyCodec_EncodeEx(unicode, encoding, errors); if (v == NULL) goto onError; /* XXX Should we really enforce this ? 
*/ @@ -566,6 +549,28 @@ return NULL; } +PyObject *PyUnicode_AsEncodedString(PyObject *unicode, + const char *encoding, + const char *errors) +{ + PyObject *errorstr; + PyObject *res; + + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) + return NULL; + } + else { + Py_INCREF(Py_None); + errorstr = Py_None; + } + + res = PyUnicode_AsEncodedStringEx(unicode, encoding, errorstr); + Py_DECREF(errorstr); + return res; +} + /* Return a Python string holding the default encoded value of the Unicode object. @@ -865,23 +870,36 @@ } #endif -PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, - int size, - const char *errors) +PyObject *PyUnicode_EncodeUTF8Ex(PyObject *unicode, + PyObject *errors) { + Py_UNICODE *s; + int size; PyObject *v; char *p; char *q; Py_UCS4 ch2; - unsigned int cbAllocated = 3 * size; + unsigned int cbAllocated; unsigned int cbWritten = 0; int i = 0; + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + errors = PyCodec_UnicodeEncodeHandlerForObject(errors); + if (errors == NULL) + return NULL; + s = PyUnicode_AS_UNICODE(unicode); + size = PyUnicode_GET_SIZE(unicode); + cbAllocated = 3 * size; v = PyString_FromStringAndSize(NULL, cbAllocated); if (v == NULL) return NULL; - if (size == 0) + if (size == 0) { + Py_DECREF(errors); return v; + } p = q = PyString_AS_STRING(v); while (i < size) { @@ -941,18 +959,42 @@ onError: Py_DECREF(v); + Py_DECREF(errors); return NULL; } -PyObject *PyUnicode_AsUTF8String(PyObject *unicode) +PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, + int size, + const char *errors) { - if (!PyUnicode_Check(unicode)) { - PyErr_BadArgument(); - return NULL; + PyObject *unicode; + PyObject *errorstr; + PyObject *res; + + unicode = PyUnicode_FromUnicode(s, size); + if (!unicode) + return NULL; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(unicode); + return NULL; + } } - return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), - 
PyUnicode_GET_SIZE(unicode), - NULL); + else { + Py_INCREF(Py_None); + errorstr = Py_None; + } + + res = PyUnicode_EncodeUTF8Ex(unicode, errorstr); + Py_DECREF(unicode); + Py_DECREF(errorstr); + return res; +} + +PyObject *PyUnicode_AsUTF8String(PyObject *unicode) +{ + return PyUnicode_EncodeUTF8Ex(unicode, NULL); } /* --- UTF-16 Codec ------------------------------------------------------- */ @@ -1123,23 +1165,32 @@ #undef UTF16_ERROR -PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s, - int size, - const char *errors, - int byteorder) -{ - PyObject *v; +PyObject *PyUnicode_EncodeUTF16Ex(PyObject *unicode, + PyObject *errors, + int byteorder) +{ + Py_UNICODE *s; + int size; + PyObject *v = NULL; Py_UCS2 *p; char *q; int i, pairs, doswap = 1; + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + + s = PyUnicode_AS_UNICODE(unicode); + size = PyUnicode_GET_SIZE(unicode); + for (i = pairs = 0; i < size; i++) if (s[i] >= 0x10000) pairs++; v = PyString_FromStringAndSize(NULL, sizeof(Py_UCS2) * (size + pairs + (byteorder == 0))); if (v == NULL) - return NULL; + goto finish; q = PyString_AS_STRING(v); p = (Py_UCS2 *)q; @@ -1172,19 +1223,44 @@ *p++ = ch2; } } + finish: + /* errors is a borrowed reference (may be NULL): the caller owns it, so no DECREF here */ return v; } -PyObject *PyUnicode_AsUTF16String(PyObject *unicode) +PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s, + int size, + const char *errors, + int byteorder) { - if (!PyUnicode_Check(unicode)) { - PyErr_BadArgument(); - return NULL; + PyObject *unicode; + PyObject *errorstr; + PyObject *res; + + unicode = PyUnicode_FromUnicode(s, size); + if (!unicode) + return NULL; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(unicode); + return NULL; + } + } + else { + Py_INCREF(Py_None); + errorstr = Py_None; } - return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - NULL, - 0); + + res = PyUnicode_EncodeUTF16Ex(unicode, errorstr, byteorder); + Py_DECREF(unicode); +
Py_DECREF(errorstr); + return res; +} + +PyObject *PyUnicode_AsUTF16String(PyObject *unicode) +{ + return PyUnicode_EncodeUTF16Ex(unicode, NULL, 0); } /* --- Unicode Escape Codec ----------------------------------------------- */ @@ -1501,6 +1577,18 @@ return NULL; } +PyObject *PyUnicode_EncodeUnicodeEscapeEx(PyObject *unicode) +{ + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + + return unicodeescape_string( + PyUnicode_AS_UNICODE(unicode), + PyUnicode_GET_SIZE(unicode), 0); +} + PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, int size) { @@ -1509,12 +1597,7 @@ PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) { - if (!PyUnicode_Check(unicode)) { - PyErr_BadArgument(); - return NULL; - } - return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode)); + return PyUnicode_EncodeUnicodeEscapeEx(unicode); } /* --- Raw Unicode Escape Codec ------------------------------------------- */ @@ -1595,15 +1678,22 @@ return NULL; } -PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, - int size) +PyObject *PyUnicode_EncodeRawUnicodeEscapeEx(PyObject *unicode) { PyObject *repr; char *p; char *q; + Py_UNICODE *s; + int size; static const char *hexdigit = "0123456789abcdef"; + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + s = PyUnicode_AS_UNICODE(unicode); + size = PyUnicode_GET_SIZE(unicode); repr = PyString_FromStringAndSize(NULL, 6 * size); if (repr == NULL) return NULL; @@ -1637,14 +1727,19 @@ return NULL; } -PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) +PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, + int size) { - if (!PyUnicode_Check(unicode)) { - PyErr_BadArgument(); + PyObject *unicode; + PyObject *res; + + unicode = PyUnicode_FromUnicode(s, size); + if (!unicode) return NULL; - } - return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode)); + + res = 
PyUnicode_EncodeRawUnicodeEscapeEx(unicode); + Py_DECREF(unicode); + return res; } /* --- Latin-1 Codec ------------------------------------------------------ */ @@ -1677,83 +1772,212 @@ return NULL; } -static -int latin1_encoding_error(const Py_UNICODE **source, - char **dest, - const char *errors, - const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "Latin-1 encoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - **dest = '?'; - (*dest)++; - return 0; +/* error handling callback helper: + build arguments, call the callback and check the arguments, + put the result into newpos and return the replacement string, which + has to be freed by the caller +*/ + +static PyObject *unicode_encode_call_errorhandler(PyObject *errors, PyObject **errorHandler, + const char *encoding, PyObject *unicode, int unicodepos, + int *newpos) +{ + static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple"; + + PyObject *args; + PyObject *restuple; + PyObject *resunicode; + int size = PyUnicode_GET_SIZE(unicode); + + if (*errorHandler == NULL) { + *errorHandler = PyCodec_UnicodeEncodeHandlerForObject(errors); + if (*errorHandler == NULL) + return NULL; + } + + /* we don't need a state */ + args = Py_BuildValue("sOiO", encoding, unicode, unicodepos, Py_None); + if (args == NULL) + return NULL; + restuple = PyEval_CallObject(*errorHandler, args); + Py_DECREF(args); + if (restuple == NULL) + return NULL; + if (!PyTuple_Check(restuple)) { + PyErr_Format(PyExc_TypeError, &argparse[4]); + Py_DECREF(restuple); + return NULL; } - else { - PyErr_Format(PyExc_ValueError, - "Latin-1 encoding error; " - "unknown error handling code: %.400s", - errors); - return -1; + if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &resunicode, newpos)) { + Py_DECREF(restuple); + return NULL; } + if 
(*newpos<0) + *newpos = -1; + else if (*newpos>size) + *newpos = size-1; + else + --*newpos; + Py_INCREF(resunicode); + Py_DECREF(restuple); + return resunicode; } -PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, - int size, - const char *errors) -{ - PyObject *repr; - char *s, *start; +/* Encode a Unicode object as ASCII (limit==128) or + latin-1 (limit==256) +*/ - repr = PyString_FromStringAndSize(NULL, size); - if (repr == NULL) - return NULL; - if (size == 0) - return repr; +static PyObject *unicode_encode_ucs1ex(PyObject *unicode, + PyObject *errors, int limit) +{ + /* output object */ + PyObject *res; + /* size of input */ + int unicodesize; + /* remaining size of input */ + int remainingunicodesize; + Py_UNICODE *uni; + Py_UNICODE *origuni; + char *str; + /* current output position */ + int respos = 0; + int ressize; + char *encoding = (limit == 256) ? "latin-1" : "ascii"; + PyObject *errorHandler = NULL; - s = PyString_AS_STRING(repr); - start = s; - while (size-- > 0) { - Py_UNICODE ch = *p++; - if (ch >= 256) { - if (latin1_encoding_error(&p, &s, errors, - "ordinal not in range(256)")) + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + remainingunicodesize = unicodesize = PyUnicode_GET_SIZE(unicode); + origuni = uni = PyUnicode_AS_UNICODE(unicode); + /* allocate enough for a simple encoding without + replacements, if we need more, we'll resize */ + res = PyString_FromStringAndSize(NULL, unicodesize); + if (res == NULL) + goto onError; + if (remainingunicodesize == 0) + return res; + str = PyString_AS_STRING(res); + ressize = unicodesize; + + for (;remainingunicodesize; --remainingunicodesize, ++uni) { + Py_UNICODE c = *uni; + + /* can we encode this? 
*/ + if (c ressize) { + if (requiredsize<2*ressize) + requiredsize = 2*ressize; + if (_PyString_Resize(&res, requiredsize)) { + Py_DECREF(repunicode); + goto onError; + } + str = PyString_AS_STRING(res) + respos; + ressize = requiredsize; + } + /* check if there is anything unencodable in the replacement + and copy it to the output */ + while (remaining-->0) { + c = *uni2++; + if (c >= limit) { + PyCodec_RaiseUnicodeEncodeError(encoding, c, unicodepos); + Py_DECREF(repunicode); + goto onError; + } + *str++ = (char)c; + } + unicodepos = newpos; + Py_DECREF(repunicode); } - else - *s++ = (char)ch; } - /* Resize if error handling skipped some characters */ - if (s - start < PyString_GET_SIZE(repr)) - if (_PyString_Resize(&repr, s - start)) + /* Resize if we allocated to much */ + respos = str-PyString_AS_STRING(res); + if (respos 0) { - Py_UNICODE ch = *p++; - if (ch >= 128) { - if (ascii_encoding_error(&p, &s, errors, - "ordinal not in range(128)")) - goto onError; - } - else - *s++ = (char)ch; - } - /* Resize if error handling skipped some characters */ - if (s - start < PyString_GET_SIZE(repr)) - if (_PyString_Resize(&repr, s - start)) - goto onError; - return repr; - - onError: - Py_DECREF(repr); - return NULL; + return unicode_encode_ucs1(p, size, errors, 128); } PyObject *PyUnicode_AsASCIIString(PyObject *unicode) { - if (!PyUnicode_Check(unicode)) { - PyErr_BadArgument(); - return NULL; - } - return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - NULL); + return PyUnicode_EncodeASCIIEx(unicode, NULL); } #if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T) @@ -1932,20 +2097,26 @@ return (PyObject *)v; } -PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, - int size, - const char *errors) +PyObject *PyUnicode_EncodeMBCSEx(PyObject *unicode, + PyObject *errors) { PyObject *repr; - char *s; DWORD mbcssize; + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + /* If there are no characters, bail now! 
*/ - if (size==0) - return PyString_FromString(""); + if (PyUnicode_GET_SIZE(unicode) == 0) + return PyString_FromString(""); /* First get the size of the result */ - mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); + mbcssize = WideCharToMultiByte(CP_ACP, 0, + PyUnicode_AS_UNICODE(unicode), + PyUnicode_GET_SIZE(unicode), + NULL, 0, NULL, NULL); if (mbcssize==0) return PyErr_SetFromWindowsErrWithFilename(0, NULL); @@ -1956,14 +2127,46 @@ return repr; /* Do the conversion */ - s = PyString_AS_STRING(repr); - if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { + if (0 == WideCharToMultiByte(CP_ACP, 0, + PyUnicode_AS_UNICODE(unicode), + PyUnicode_GET_SIZE(unicode), + PyString_AS_STRING(repr), + mbcssize, NULL, NULL)) { Py_DECREF(repr); return PyErr_SetFromWindowsErrWithFilename(0, NULL); } return repr; } +PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, + int size, + const char *errors) +{ + PyObject *unicode; + PyObject *errorstr; + PyObject *res; + + unicode = PyUnicode_FromUnicode(p, size); + if (!unicode) + return NULL; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(unicode); + return NULL; + } + } + else { + Py_INCREF(Py_None); + errorstr = Py_None; + } + + res = PyUnicode_EncodeMBCSEx(unicode, errorstr); + Py_DECREF(unicode); + Py_DECREF(errorstr); + return res; +} + #endif /* MS_WIN32 */ /* --- Character Mapping Codec -------------------------------------------- */ @@ -2105,61 +2308,81 @@ return NULL; } -static -int charmap_encoding_error(const Py_UNICODE **source, - char **dest, - const char *errors, - const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "charmap encoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - **dest = '?'; - (*dest)++; - return 0; - } - else { - 
PyErr_Format(PyExc_ValueError, - "charmap encoding error; " - "unknown error handling code: %.400s", - errors); - return -1; - } -} +/* For this and the other encode functions the loop through + the string is done in the following way: A stack with two + strings is kept and the loop always encodes a character from + the string at the stacktop. If an error is encountered and + the stack has only one entry (during encoding of the original + string) the callback is called and the unicode object returned + is pushed onto the stack, so the encoding continues with the + replacement string. If the stack has two entries when an + error is encountered, the replacement string itself has + an unencodable character and an exception will be raised. + When the encoder has reached the end of it's current string + there are two possibilities: when the stack contains two + entries, this was the replacement string, so the replacement + string will be popped from the stack and encoding continues + with the next character from the original string. If the + stack had only one entry, encoding is finished. 
*/ +PyObject *PyUnicode_EncodeCharmapEx(PyObject *unicode, + PyObject *mapping, + PyObject *errors) +{ + /* current input position */ + int unicodepos; + /* output object */ + PyObject *res; + /* current output position */ + int respos = 0; + /* the next two variables are used as a "micro stack": + during processing of a replacement string unicode2 + and unicode2pos contain the values for the original + unicode object to be encoded */ + PyObject *unicode2 = NULL; + int unicode2pos = 0; + PyObject *errorHandler = NULL; -PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, - int size, - PyObject *mapping, - const char *errors) -{ - PyObject *v; - char *s; int extrachars = 0; /* Default to Latin-1 */ if (mapping == NULL) - return PyUnicode_EncodeLatin1(p, size, errors); + return PyUnicode_EncodeLatin1Ex(unicode, errors); - v = PyString_FromStringAndSize(NULL, size); - if (v == NULL) + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + res = PyString_FromStringAndSize(NULL, PyUnicode_GET_SIZE(unicode)); + if (res == NULL) return NULL; - if (size == 0) - return v; - s = PyString_AS_STRING(v); - while (size-- > 0) { - Py_UNICODE ch = *p++; + if (PyUnicode_GET_SIZE(unicode) == 0) + return res; + for (unicodepos = 0;;++unicodepos) { + Py_UNICODE c; PyObject *w, *x; + /* finished with current string? */ + if (unicodepos == PyUnicode_GET_SIZE(unicode)) { + /* currently processing replacement? */ + if (unicode2) { + /* forget replacement string */ + Py_DECREF(unicode); + /* switch back to original */ + unicode = unicode2; + unicodepos = unicode2pos; + unicode2 = NULL; + /* maybe original is finished too? 
*/ + continue; + } + else + /* currently processing original => finished */ + break; + } + + c = PyUnicode_AS_UNICODE(unicode)[unicodepos]; + /* Get mapping (Unicode ordinal -> string char, integer or None) */ - w = PyInt_FromLong((long)ch); + w = PyInt_FromLong((long)c); if (w == NULL) goto onError; x = PyObject_GetItem(mapping, w); @@ -2183,39 +2406,53 @@ Py_DECREF(x); goto onError; } - *s++ = (char)value; + PyString_AS_STRING(res)[respos++] = (char)value; } + /* undefined mapping */ else if (x == Py_None) { - /* undefined mapping */ - if (charmap_encoding_error(&p, &s, errors, - "character maps to ")) { + /* error while replacing */ + if (unicode2) { + /* report original position; FIXME should we give a better name? */ + PyCodec_RaiseUnicodeEncodeError("charmap", c, unicode2pos); Py_DECREF(x); goto onError; } + else { + /* "push" original to secondary variables */ + unicode2 = unicode; + /* switch to replacement */ + unicode = unicode_encode_call_errorhandler(errors, &errorHandler, + "charmap", unicode, unicodepos, &unicode2pos); + if (unicode == NULL) { + Py_DECREF(x); + goto onError; + } + unicodepos = -1; + /* retry with the replacement string */ + continue; + } } else if (PyString_Check(x)) { int targetsize = PyString_GET_SIZE(x); if (targetsize == 1) /* 1-1 mapping */ - *s++ = *PyString_AS_STRING(x); + PyString_AS_STRING(res)[respos++] = *PyString_AS_STRING(x); else if (targetsize > 1) { /* 1-n mapping */ if (targetsize > extrachars) { /* resize first */ - int oldpos = (int)(s - PyString_AS_STRING(v)); int needed = (targetsize - extrachars) + \ - (targetsize << 2); + (targetsize << 2); extrachars += needed; - if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) { + if (_PyString_Resize(&res, PyString_GET_SIZE(res) + needed)) { Py_DECREF(x); goto onError; } - s = PyString_AS_STRING(v) + oldpos; } - memcpy(s, PyString_AS_STRING(x), targetsize); - s += targetsize; + memcpy(&PyString_AS_STRING(res)[respos], PyString_AS_STRING(x), targetsize); + respos += 
targetsize; extrachars -= targetsize; } /* 1-0 mapping: skip the character */ @@ -2229,27 +2466,55 @@ } Py_DECREF(x); } - if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v)) - if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v)))) + /* Resize if we allocated to much */ + if (respos < PyString_GET_SIZE(res)) + if (_PyString_Resize(&res, respos)) goto onError; - return v; + return res; onError: - Py_DECREF(v); + Py_DECREF(res); + /* free replacement */ + if (unicode2) { + Py_XDECREF(unicode); + } + Py_XDECREF(errorHandler); return NULL; } -PyObject *PyUnicode_AsCharmapString(PyObject *unicode, - PyObject *mapping) +PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, + int size, + PyObject *mapping, + const char *errors) { - if (!PyUnicode_Check(unicode) || mapping == NULL) { - PyErr_BadArgument(); + PyObject *unicode; + PyObject *errorstr; + PyObject *res; + + unicode = PyUnicode_FromUnicode(p, size); + if (!unicode) return NULL; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(unicode); + return NULL; + } + } + else { + Py_INCREF(Py_None); + errorstr = Py_None; } - return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - mapping, - NULL); + + res = PyUnicode_EncodeCharmapEx(unicode, mapping, errorstr); + Py_DECREF(unicode); + Py_DECREF(errorstr); + return res; +} + +PyObject *PyUnicode_AsCharmapString(PyObject *unicode, PyObject *mapping) +{ + return PyUnicode_EncodeCharmapEx(unicode, mapping, NULL); } static @@ -2387,48 +2652,74 @@ /* --- Decimal Encoder ---------------------------------------------------- */ -int PyUnicode_EncodeDecimal(Py_UNICODE *s, - int length, - char *output, - const char *errors) -{ - Py_UNICODE *p, *end; +int PyUnicode_EncodeDecimalEx(PyObject *unicode, + char *output, + PyObject *errors) +{ + /* current input position */ + int unicodepos; + /* the next two variables are used as a "micro stack": + during processing of a replacement string unicode2 + and 
unicode2pos contain the values for the original + unicode object to be encoded */ + PyObject *unicode2 = NULL; + int unicode2pos = 0; + PyObject *errorHandler = NULL; if (output == NULL) { PyErr_BadArgument(); return -1; } - p = s; - end = s + length; - while (p < end) { - register Py_UNICODE ch = *p++; + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return -1; + } + for (unicodepos = 0;;++unicodepos) { + Py_UNICODE c; int decimal; - - if (Py_UNICODE_ISSPACE(ch)) { + /* finished with the string? */ + if (unicodepos == PyUnicode_GET_SIZE(unicode)) { + /* processing replacement? */ + if (unicode2) { + /* forget replacement */ + Py_DECREF(unicode); + /* switch back to original */ + unicode = unicode2; + unicodepos = unicode2pos; + unicode2 = NULL; + unicode2pos = 0; + /* maybe original is finished too? */ + continue; + } + else + /* processing original => finished */ + break; + } + c = PyUnicode_AS_UNICODE(unicode)[unicodepos]; + + if (Py_UNICODE_ISSPACE(c)) { *output++ = ' '; continue; } - decimal = Py_UNICODE_TODECIMAL(ch); + decimal = Py_UNICODE_TODECIMAL(c); if (decimal >= 0) { *output++ = '0' + decimal; continue; - } - if (0 < ch && ch < 256) { - *output++ = (char)ch; - continue; } + if (0 < c && c < 256) + *output++ = (char)c; /* All other characters are considered invalid */ - if (errors == NULL || strcmp(errors, "strict") == 0) { - PyErr_SetString(PyExc_ValueError, - "invalid decimal Unicode string"); - goto onError; - } - else if (strcmp(errors, "ignore") == 0) - continue; - else if (strcmp(errors, "replace") == 0) { - *output++ = '?'; - continue; + else { + /* "push" original to secondary variables */ + unicode2 = unicode; + /* switch to replacement */ + unicode = unicode_encode_call_errorhandler(errors, &errorHandler, + "charmap", unicode, unicodepos, &unicode2pos); + if (unicode == NULL) + goto onError; + unicodepos = -1; + /* retry with the replacement string */ } } /* 0-terminate the output string */ @@ -2436,9 +2727,45 @@ return 0; onError: 
+ Py_XDECREF(errorHandler); + /* free replacement */ + if (unicode2) { + Py_XDECREF(unicode); + } + return -1; } +int PyUnicode_EncodeDecimal(Py_UNICODE *s, + int length, + char *output, + const char *errors) +{ + PyObject *unicode; + PyObject *errorstr; + int res; + + unicode = PyUnicode_FromUnicode(s, length); + if (!unicode) + return -1; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(unicode); + return -1; + } + } + else { + Py_INCREF(Py_None); + errorstr = Py_None; + } + + res = PyUnicode_EncodeDecimalEx(unicode, output, errorstr); + Py_DECREF(unicode); + Py_DECREF(errorstr); + return res; +} + /* --- Helpers ------------------------------------------------------------ */ static @@ -3549,17 +3876,23 @@ \n\ Return an encoded string version of S. Default encoding is the current\n\ default string encoding. errors may be given to set a different error\n\ -handling scheme. Default is 'strict' meaning that encoding errors raise\n\ -a ValueError. Other possible values are 'ignore' and 'replace'."; +handling scheme. Default is None meaning that unencodable characters\n\ +raise UnicodeError. 'strict' does the same. 
Other possible values are\n\ +'ignore' and 'replace' or a callable that will be called with the encoding,\n\ +the original unicode string, the position of the unencodable character\n\ +and an object describing the current state of the encoder and must\n\ +return a tuple with a unicode string that will be encoded instead of the\n\ +unencodable character and the position in the original string where encoding\n\ +should continue."; static PyObject * unicode_encode(PyUnicodeObject *self, PyObject *args) { char *encoding = NULL; - char *errors = NULL; - if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) + PyObject *errors = NULL; + if (!PyArg_ParseTuple(args, "|sO:encode", &encoding, &errors)) return NULL; - return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); + return PyUnicode_AsEncodedStringEx((PyObject *)self, encoding, errors); } static char expandtabs__doc__[] = Index: Python/codecs.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Python/codecs.c,v retrieving revision 2.13 diff -u -r2.13 codecs.c --- Python/codecs.c 2000/09/26 05:46:01 2.13 +++ Python/codecs.c 2001/07/12 10:50:44 @@ -236,11 +236,47 @@ return args; } +static +PyObject *args_tupleex(PyObject *object, + PyObject *errors) +{ + PyObject *args; + + args = PyTuple_New(2); + if (args == NULL) + return NULL; + Py_INCREF(object); + PyTuple_SET_ITEM(args,0,object); + errors = PyCodec_UnicodeEncodeHandlerForObject(errors); + if (!errors) { + Py_DECREF(args); + return NULL; + } + PyTuple_SET_ITEM(args, 1, errors); + return args; +} + /* Build a codec by calling factory(stream[,errors]) or just factory(errors) depending on whether the given parameters are non-NULL. 
*/ static +PyObject *build_stream_codecex(PyObject *factory, + PyObject *stream, + PyObject *errors) +{ + PyObject *args, *codec; + + args = args_tupleex(stream, errors); + if (args == NULL) + return NULL; + + codec = PyEval_CallObject(factory, args); + Py_DECREF(args); + return codec; +} + +static PyObject *build_stream_codec(PyObject *factory, PyObject *stream, const char *errors) @@ -309,29 +345,51 @@ return NULL; } -PyObject *PyCodec_StreamWriter(const char *encoding, - PyObject *stream, - const char *errors) +PyObject *PyCodec_StreamWriterEx(const char *encoding, + PyObject *stream, + PyObject *errors) { PyObject *codecs; codecs = _PyCodec_Lookup(encoding); if (codecs == NULL) goto onError; - return build_stream_codec(PyTuple_GET_ITEM(codecs,3),stream,errors); + return build_stream_codecex(PyTuple_GET_ITEM(codecs,3),stream,errors); onError: return NULL; } +PyObject *PyCodec_StreamWriter(const char *encoding, + PyObject *stream, + const char *errors) +{ + PyObject *errorstr; + PyObject *res; + + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) + return NULL; + } + else { + Py_INCREF(Py_None); + errorstr = Py_None; + } + + res = PyCodec_StreamWriterEx(encoding,stream,errorstr); + Py_DECREF(errorstr); + return res; +} + /* Encode an object (e.g. an Unicode object) using the given encoding and return the resulting encoded object (usually a Python string). errors is passed to the encoder factory as argument if non-NULL. 
*/ -PyObject *PyCodec_Encode(PyObject *object, - const char *encoding, - const char *errors) +PyObject *PyCodec_EncodeEx(PyObject *object, + const char *encoding, + PyObject *errors) { PyObject *encoder = NULL; PyObject *args = NULL, *result; @@ -341,11 +399,11 @@ if (encoder == NULL) goto onError; - args = args_tuple(object, errors); + args = args_tupleex(object, errors); if (args == NULL) goto onError; - result = PyEval_CallObject(encoder,args); + result = PyEval_CallObject(encoder, args); if (result == NULL) goto onError; @@ -370,6 +428,30 @@ return NULL; } +PyObject *PyCodec_Encode(PyObject *object, + const char *encoding, + const char *errors) +{ + PyObject *errorstr; + PyObject *res; + + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + /* object is borrowed from the caller: it must not be released here */ + return NULL; + } + } + else { + Py_INCREF(Py_None); + errorstr = Py_None; + } + + res = PyCodec_EncodeEx(object, encoding, errorstr); + Py_DECREF(errorstr); + return res; +} + /* Decode an object (usually a Python string) using the given encoding and return an equivalent object (e.g. an Unicode object). @@ -414,6 +496,205 @@ Py_XDECREF(decoder); Py_XDECREF(result); return NULL; +} + +/* return a new reference to one of the builtin unicode encode + error handlers or None. 
+ error can be: + + * NULL, Py_None, "strict" or u"strict" for + codecs.raise_unicodeencode_errors + * "ignore" or u"ignore" for codecs.ignore_unicodeencode_errors + * "replace" or u"replace" for codecs.replace_unicodeencode_errors + * a callable which will be returned directly + + everything else will raise an exception */ +PyObject *PyCodec_UnicodeEncodeHandlerForObject(PyObject *error) +{ + static Py_UNICODE strict[] = { 's', 't', 'r', 'i', 'c', 't' }; + static Py_UNICODE ignore[] = { 'i', 'g', 'n', 'o', 'r', 'e' }; + static Py_UNICODE replace[] = { 'r', 'e', 'p', 'l', 'a', 'c', 'e' }; + static PyMethodDef strictMethod = { + "raise_unicodeencode_errors", + PyCodec_RaiseUnicodeEncodeErrors, + METH_VARARGS + }; + static PyMethodDef ignoreMethod = { + "ignore_unicodeencode_errors", + PyCodec_IgnoreUnicodeEncodeErrors, + METH_VARARGS + }; + static PyMethodDef replaceMethod = { + "replace_unicodeencode_errors", + PyCodec_ReplaceUnicodeEncodeErrors, + METH_VARARGS + }; + PyMethodDef *method = NULL; + PyObject *res = NULL; + + if (error==NULL || error==Py_None) + method = &strictMethod; + else if (PyCallable_Check(error)) { + res = error; + Py_INCREF(error); + } + else if (PyString_Check(error)) { + char *s = PyString_AS_STRING(error); + int size = PyString_GET_SIZE(error); + if (size==6 && !memcmp(s, "strict", size)) + method = &strictMethod; + else if (size==6 && !memcmp(s, "ignore", size)) + method = &ignoreMethod; + else if (size==7 && !memcmp(s, "replace", size)) + method = &replaceMethod; + else + PyErr_SetString(PyExc_ValueError, "unknown error handler name"); + } + else if (PyUnicode_Check(error)) { + Py_UNICODE *s = PyUnicode_AS_UNICODE(error); + int size = PyUnicode_GET_SIZE(error); + if (size==(int)(sizeof(strict)/sizeof(strict[0])) && !memcmp(s, strict, sizeof(strict))) + method = &strictMethod; + else if (size==(int)(sizeof(ignore)/sizeof(ignore[0])) && !memcmp(s, ignore, sizeof(ignore))) + method = &ignoreMethod; + else if 
(size==(int)(sizeof(replace)/sizeof(replace[0])) && !memcmp(s, replace, sizeof(replace))) + method = 
&replaceMethod; + else + PyErr_SetString(PyExc_ValueError, "unknown error handler name"); + } + else + PyErr_SetString(PyExc_TypeError, "wrong type for error handler"); + if (method) + res = PyCFunction_New(method, NULL); + return res; +} + + +void PyCodec_RaiseUnicodeEncodeError(const char *encoding, Py_UNICODE c, int pos) +{ + PyErr_Format(PyExc_UnicodeError, + "encoding '%.400s' can't encode character '\\u%x' in position %d", + encoding, (long)c, pos); +} + + +PyObject *PyCodec_RaiseUnicodeEncodeErrors(PyObject *self, PyObject *args) +{ + char *encoding; + Py_UNICODE *unicode; + int pos; + PyObject *state; + + if (PyArg_ParseTuple(args, "suiO:raise_unicodeencode_errors", + &encoding, &unicode, &pos, &state)) + PyCodec_RaiseUnicodeEncodeError(encoding, unicode[pos], pos); + return NULL; +} + + +PyObject *PyCodec_IgnoreUnicodeEncodeErrors(PyObject *self, PyObject *args) +{ + char *encoding; + Py_UNICODE *unicode; + int pos; + PyObject *state; + + if (!PyArg_ParseTuple(args, "suiO:ignore_unicodeencode_errors", + &encoding, &unicode, &pos, &state)) + return NULL; + /* skip the unencodable character */ + ++pos; + /* ouch: passing NULL, 0, pos gives None instead of u'' */ + return Py_BuildValue("(u#i)", &pos, 0, pos); +} + + +PyObject *PyCodec_ReplaceUnicodeEncodeErrors(PyObject *self, PyObject *args) +{ + char *encoding; + Py_UNICODE *unicode; + int pos; + PyObject *state; + Py_UNICODE res = '?'; + + if (!PyArg_ParseTuple(args, "suiO:replace_unicodeencode_errors", + &encoding, &unicode, &pos, &state)) + return NULL; + /* skip the unencodable character */ + ++pos; + return Py_BuildValue("(u#i)", &res, 1, pos); +} + +static Py_UNICODE hexdigits[] = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' +}; + +PyObject *PyCodec_XMLCharRefReplaceUnicodeEncodeErrors(PyObject *self, PyObject *args) +{ + char *encoding; + Py_UNICODE *unicode; + int pos; + PyObject *state; + Py_UNICODE buf[9]; + Py_UNICODE *p = buf; + Py_UNICODE c; + + if 
(!PyArg_ParseTuple(args, "sui|O:xmlcharrefreplace_unicodeencode_errors", + &encoding, &unicode, &pos, &state)) + return NULL; + + c = unicode[pos]; + *p++ = '&'; + *p++ = '#'; + *p++ = 'x'; + if (c>=0x1000) + *p++ = hexdigits[c>>12]; + if (c>=0x0100) + *p++ = hexdigits[(c>>8)&0xf]; + if (c>=0x0010) + *p++ = hexdigits[(c>>4)&0xf]; + *p++ = hexdigits[c&0xf]; + *p++ = ';'; + + /* skip the unencodable character */ + ++pos; + return Py_BuildValue("(u#i)", buf, p-buf, pos); +} + +PyObject *PyCodec_EscapeReplaceUnicodeEncodeErrors(PyObject *self, PyObject *args) +{ + char *encoding; + Py_UNICODE *unicode; + int pos; + PyObject *state; + Py_UNICODE buf[10]; + Py_UNICODE *p = buf; + Py_UNICODE c; + + if (!PyArg_ParseTuple(args, "sui|O:escapereplace_unicodeencode_errors", + &encoding, &unicode, &pos, &state)) + return NULL; + + c = unicode[pos]; + *p++ = '\\'; + if (c >= 0x00010000) { + *p++ = 'U'; + *p++ = hexdigits[(c>>28)&0xf]; + *p++ = hexdigits[(c>>24)&0xf]; + *p++ = hexdigits[(c>>20)&0xf]; + *p++ = hexdigits[(c>>16)&0xf]; + } + else + *p++ = 'u'; + *p++ = hexdigits[(c>>12)&0xf]; + *p++ = hexdigits[(c>>8)&0xf]; + *p++ = hexdigits[(c>>4)&0xf]; + *p++ = hexdigits[c&0xf]; + + /* skip the unencodable character */ + ++pos; + return Py_BuildValue("(u#i)", buf, p-buf, pos); } void _PyCodecRegistry_Init(void)