Index: Include/codecs.h =================================================================== RCS file: /cvsroot/python/python/dist/src/Include/codecs.h,v retrieving revision 2.3 diff -u -r2.3 codecs.h --- Include/codecs.h 2000/08/03 16:24:24 2.3 +++ Include/codecs.h 2001/06/13 13:01:05 @@ -53,12 +53,18 @@ object is passed through the encoder function found for the given encoding using the error handling method defined by errors. errors - may be NULL to use the default method defined for the codec. + may be NULL to use the strict encoding. Raises a LookupError in case no encoder can be found. */ +extern DL_IMPORT(PyObject *) PyCodec_EncodeEx( + PyObject *object, + const char *encoding, + PyObject *errors + ); + extern DL_IMPORT(PyObject *) PyCodec_Encode( PyObject *object, const char *encoding, @@ -111,11 +117,48 @@ /* Get a StreamWriter factory function for the given encoding. */ +extern DL_IMPORT(PyObject *) PyCodec_StreamWriterEx( + const char *encoding, + PyObject *stream, + PyObject *errors + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject *) PyCodec_StreamWriter( const char *encoding, PyObject *stream, const char *errors ); + +/* Return a new reference to one of the builtin unicode + encode error handlers. 
error can be: + + * NULL, Py_None, "strict" or u"strict" for + codecs.raise_unicodeencode_errors + * "ignore" or u"ignore" for codecs.ignore_unicodeencode_errors + * "replace" or u"replace" for codecs.replace_unicodeencode_errors + * a callable which will be returned directly + + everything else will raise an exception */ +extern DL_IMPORT(PyObject *) PyCodec_UnicodeEncodeHandlerForObject(PyObject *error); + +/* Raises a Unicode exception */ +extern DL_IMPORT(void) PyCodec_RaiseUnicodeEncodeError(const char *encoding, Py_UNICODE c, int pos); + +/* Encode error handler that raises an exception */ +extern DL_IMPORT(PyObject *) PyCodec_RaiseUnicodeEncodeErrors(PyObject *self, PyObject *args); + +/* Encode error handler that returns an empty string and so ignores the + unencodable character */ +extern DL_IMPORT(PyObject *) PyCodec_IgnoreUnicodeEncodeErrors(PyObject *self, PyObject *args); + +/* Encode error handler that returns a Unicode replacement character + that will be used by the codec to replace the unencodable character */ +extern DL_IMPORT(PyObject *) PyCodec_ReplaceUnicodeEncodeErrors(PyObject *self, PyObject *args); + +/* Encode error handler that returns an XML character reference for the + unencodable character */ +extern DL_IMPORT(PyObject *) PyCodec_XMLCharRefReplaceUnicodeEncodeErrors(PyObject *self, PyObject *args); #ifdef __cplusplus } Index: Include/unicodeobject.h =================================================================== RCS file: /cvsroot/python/python/dist/src/Include/unicodeobject.h,v retrieving revision 2.21 diff -u -r2.21 unicodeobject.h --- Include/unicodeobject.h 2001/05/21 20:30:15 2.21 +++ Include/unicodeobject.h 2001/06/13 13:01:14 @@ -407,9 +407,15 @@ const char *errors /* error handling */ ); -/* Encodes a Py_UNICODE buffer of the given size and returns a - Python string object. */ +/* Encodes a Unicode object and returns a Python string object.
*/ +extern DL_IMPORT(PyObject*) PyUnicode_EncodeEx( + PyObject *unicode, /* Unicode object */ + const char *encoding, /* encoding */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_Encode( const Py_UNICODE *s, /* Unicode char buffer */ int size, /* number of Py_UNICODE chars to encode */ @@ -417,9 +423,12 @@ const char *errors /* error handling */ ); +#define PyUnicode_AsEncodedStringEx PyUnicode_EncodeEx + /* Encodes a Unicode object and returns the result as Python string object. */ +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString( PyObject *unicode, /* Unicode object */ const char *encoding, /* encoding */ @@ -438,6 +447,12 @@ PyObject *unicode /* Unicode object */ ); +extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8Ex( + PyObject *unicode, /* Unicode object */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8( const Py_UNICODE *data, /* Unicode char buffer */ int length, /* number of Py_UNICODE chars to encode */ @@ -505,6 +520,13 @@ */ +extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16Ex( + PyObject *unicode, /* Unicode object */ + PyObject *errors, /* error handling */ + int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16( const Py_UNICODE *data, /* Unicode char buffer */ int length, /* number of Py_UNICODE chars to encode */ @@ -524,6 +546,10 @@ PyObject *unicode /* Unicode object */ ); +extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscapeEx( + PyObject *unicode /* Unicode object */ + ); + extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape( const Py_UNICODE *data, /* Unicode char buffer */ int length /* Number of Py_UNICODE chars to encode */ @@ -537,7 +563,7 @@ const char *errors /* error handling */ ); -extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString( +extern DL_IMPORT(PyObject*) 
PyUnicode_EncodeRawUnicodeEscapeEx( PyObject *unicode /* Unicode object */ ); @@ -546,6 +572,8 @@ int length /* Number of Py_UNICODE chars to encode */ ); +#define PyUnicode_AsRawUnicodeEscapeString PyUnicode_EncodeRawUnicodeEscapeEx + /* --- Latin-1 Codecs ----------------------------------------------------- Note: Latin-1 corresponds to the first 256 Unicode ordinals. @@ -562,6 +590,12 @@ PyObject *unicode /* Unicode object */ ); +extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1Ex( + PyObject *unicode, /* Unicode object */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1( const Py_UNICODE *data, /* Unicode char buffer */ int length, /* Number of Py_UNICODE chars to encode */ @@ -584,6 +618,12 @@ PyObject *unicode /* Unicode object */ ); +extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCIIEx( + PyObject *unicode, /* Unicode object */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII( const Py_UNICODE *data, /* Unicode char buffer */ int length, /* Number of Py_UNICODE chars to encode */ @@ -626,6 +666,14 @@ (unicode ordinal -> char ordinal) */ ); +extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmapEx( + PyObject *unicode, /* Unicode object */ + PyObject *mapping, /* character mapping + (unicode ordinal -> char ordinal) */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap( const Py_UNICODE *data, /* Unicode char buffer */ int length, /* Number of Py_UNICODE chars to encode */ @@ -668,6 +716,12 @@ PyObject *unicode /* Unicode object */ ); +extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCSEx( + const PyObject *unicode, /* Unicode object */ + const PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS( const Py_UNICODE *data, /* Unicode char buffer */ int length, /* Number of Py_UNICODE chars to encode */ @@ -682,7
+736,8 @@ an output buffer using standard ASCII digit codes. The output buffer has to provide at least length+1 bytes of storage - area. The output string is 0-terminated. + area (more if longer replacement string are generated). + The output string is 0-terminated. The encoder converts whitespace to ' ', decimal characters to their corresponding ASCII digit and all other Latin-1 characters except @@ -691,15 +746,23 @@ Error handling is defined by the errors argument: - NULL or "strict": raise a ValueError - "ignore": ignore the wrong characters (these are not copied to the - output buffer) - "replace": replaces illegal characters with '?' + NULL, None, "strict" or u"strict": raise a UnicodeError + "ignore" or u"ignore": ignore the wrong characters (these are + not copied to the output buffer) + "replace" or u"replace": replaces illegal characters with '?' + callable object: use what the object returns as replacement Returns 0 on success, -1 on failure. */ +extern DL_IMPORT(int) PyUnicode_EncodeDecimalEx( + PyObject *unicode, /* Unicode object */ + char *output, /* Output buffer; must have size >= length */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(int) PyUnicode_EncodeDecimal( Py_UNICODE *s, /* Unicode buffer */ int length, /* Number of Py_UNICODE chars to encode */ @@ -760,7 +823,7 @@ */ extern DL_IMPORT(PyObject *) PyUnicode_Translate( - PyObject *str, /* String */ + PyObject *str, /* String */ PyObject *table, /* Translate table */ const char *errors /* error handling */ ); Index: Lib/codecs.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/codecs.py,v retrieving revision 1.19 diff -u -r1.19 codecs.py --- Lib/codecs.py 2001/05/29 06:06:54 1.19 +++ Lib/codecs.py 2001/06/13 13:01:25 @@ -51,13 +51,17 @@ The .encode()/.decode() methods may implement different error handling schemes by providing the errors argument. 
These - string values are defined: + values are defined: - 'strict' - raise a ValueError error (or a subclass) + None or 'strict' - raise a UnicodeError (or a subclass) 'ignore' - ignore the character and continue with the next 'replace' - replace with a suitable replacement character; - Python will use the official U+FFFD REPLACEMENT - CHARACTER for the builtin Unicode codecs. + Python will use the official U+FFFD REPLACEMENT + CHARACTER for the builtin Unicode codecs. + callable object - call the object with the arguments + encoding name, character, position + and encode the unicode object returned + instead of the original character. """ def encode(self, input, errors='strict'): @@ -66,7 +70,7 @@ object, length consumed). errors defines the error handling to apply. It defaults to - 'strict' handling. + strict handling. The method may not store state in the Codec instance. Use StreamCodec for codecs which have to keep state in order to @@ -122,9 +126,15 @@ schemes by providing the errors keyword argument. These parameters are defined: - 'strict' - raise a ValueError (or a subclass) - 'ignore' - ignore the character and continue with the next - 'replace'- replace with a suitable replacement character + None or 'strict' - raise a UnicodeError (or a subclass) + 'ignore' - ignore the character and continue with the next + 'replace' - replace with a suitable replacement character; + Python will use the official U+FFFD REPLACEMENT + CHARACTER for the builtin Unicode codecs. + callable object - call the object with the arguments + encoding name, character, position + and encode the unicode object returned + instead of the original character.
""" self.stream = stream Index: Lib/encodings/base64_codec.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/encodings/base64_codec.py,v retrieving revision 1.1 diff -u -r1.1 base64_codec.py --- Lib/encodings/base64_codec.py 2001/05/15 12:00:02 1.1 +++ Lib/encodings/base64_codec.py 2001/06/13 13:02:40 @@ -10,17 +10,17 @@ ### Codec APIs -def base64_encode(input,errors='strict'): +def base64_encode(input,errors=None): """ Encodes the object input and returns a tuple (output object, length consumed). errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported + strict handling which is the only currently supported error handling for this codec. """ - assert errors == 'strict' + assert errors is None or errors == "strict" or errors == codecs.raise_unicodeencode_errors output = base64.encodestring(input) return (output, len(input)) @@ -34,7 +34,7 @@ mapped files are examples of objects providing this slot. errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported + strict handling which is the only currently supported error handling for this codec. """ Index: Lib/encodings/hex_codec.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/encodings/hex_codec.py,v retrieving revision 1.1 diff -u -r1.1 hex_codec.py --- Lib/encodings/hex_codec.py 2001/05/15 12:00:02 1.1 +++ Lib/encodings/hex_codec.py 2001/06/13 13:02:48 @@ -10,17 +10,17 @@ ### Codec APIs -def hex_encode(input,errors='strict'): +def hex_encode(input,errors=None): """ Encodes the object input and returns a tuple (output object, length consumed). errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported + strict handling which is the only currently supported error handling for this codec.
""" - assert errors == 'strict' + assert errors is None or errors == "strict" or errors == codecs.raise_unicodeencode_errors output = binascii.b2a_hex(input) return (output, len(input)) @@ -34,7 +34,7 @@ mapped files are examples of objects providing this slot. errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported + strict handling which is the only currently supported error handling for this codec. """ Index: Lib/encodings/quopri_codec.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/encodings/quopri_codec.py,v retrieving revision 1.1 diff -u -r1.1 quopri_codec.py --- Lib/encodings/quopri_codec.py 2001/05/15 15:34:07 1.1 +++ Lib/encodings/quopri_codec.py 2001/06/13 13:02:51 @@ -9,15 +9,15 @@ except ImportError: from StringIO import StringIO -def quopri_encode(input, errors='strict'): +def quopri_encode(input, errors=None): """Encode the input, returning a tuple (output object, length consumed). errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported + strict handling which is the only currently supported error handling for this codec. """ - assert errors == 'strict' + assert errors is None or errors == "strict" or errors == codecs.raise_unicodeencode_errors f = StringIO(input) g = StringIO() quopri.encode(f, g, 1) Index: Lib/encodings/uu_codec.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/encodings/uu_codec.py,v retrieving revision 1.1 diff -u -r1.1 uu_codec.py --- Lib/encodings/uu_codec.py 2001/05/15 12:00:02 1.1 +++ Lib/encodings/uu_codec.py 2001/06/13 13:02:51 @@ -12,17 +12,17 @@ ### Codec APIs -def uu_encode(input,errors='strict',filename='',mode=0666): +def uu_encode(input,errors=None,filename='',mode=0666): """ Encodes the object input and returns a tuple (output object, length consumed). 
errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported + strict handling which is the only currently supported error handling for this codec. """ - assert errors == 'strict' + assert errors is None or errors == "strict" or errors == codecs.raise_unicodeencode_errors from cStringIO import StringIO from binascii import b2a_uu infile = StringIO(input) @@ -50,7 +50,7 @@ mapped files are examples of objects providing this slot. errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported + strict handling which is the only currently supported error handling for this codec. Note: filename and file mode information in the input data is Index: Lib/encodings/zlib_codec.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/encodings/zlib_codec.py,v retrieving revision 1.1 diff -u -r1.1 zlib_codec.py --- Lib/encodings/zlib_codec.py 2001/05/15 12:00:02 1.1 +++ Lib/encodings/zlib_codec.py 2001/06/13 13:02:51 @@ -11,17 +11,17 @@ ### Codec APIs -def zlib_encode(input,errors='strict'): +def zlib_encode(input,errors=None): """ Encodes the object input and returns a tuple (output object, length consumed). errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported + strict handling which is the only currently supported error handling for this codec. """ - assert errors == 'strict' + assert errors is None or errors == "strict" or errors == codecs.raise_unicodeencode_errors output = zlib.compress(input) return (output, len(input)) @@ -35,7 +35,7 @@ mapped files are examples of objects providing this slot. errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported + strict handling which is the only currently supported error handling for this codec. 
""" Index: Modules/_codecsmodule.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Modules/_codecsmodule.c,v retrieving revision 2.6 diff -u -r2.6 _codecsmodule.c --- Modules/_codecsmodule.c 2000/09/21 21:09:45 2.6 +++ Modules/_codecsmodule.c 2001/06/13 13:11:54 @@ -327,9 +327,9 @@ { const char *data; int size; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "s#|z:readbuffer_encode", + if (!PyArg_ParseTuple(args, "s#|O:readbuffer_encode", &data, &size, &errors)) return NULL; @@ -343,9 +343,9 @@ { const char *data; int size; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "t#|z:charbuffer_encode", + if (!PyArg_ParseTuple(args, "t#|O:charbuffer_encode", &data, &size, &errors)) return NULL; @@ -358,11 +358,11 @@ PyObject *args) { PyObject *obj; - const char *errors = NULL; + PyObject *errors = NULL; const char *data; int size; - if (!PyArg_ParseTuple(args, "O|z:unicode_internal_encode", + if (!PyArg_ParseTuple(args, "O|O:unicode_internal_encode", &obj, &errors)) return NULL; @@ -382,22 +382,21 @@ static PyObject * utf_8_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|z:utf_8_encode", + if (!PyArg_ParseTuple(args, "O|O:utf_8_encode", &str, &errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeUTF8Ex(str, errors), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } @@ -411,172 +410,161 @@ static PyObject * utf_16_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; int byteorder = 0; - if (!PyArg_ParseTuple(args, 
"O|zi:utf_16_encode", + if (!PyArg_ParseTuple(args, "O|Oi:utf_16_encode", &str, &errors, &byteorder)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors, - byteorder), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeUTF16Ex(str, errors, byteorder), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } static PyObject * utf_16_le_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|zi:utf_16_le_encode", + if (!PyArg_ParseTuple(args, "O|O:utf_16_le_encode", &str, &errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors, - -1), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeUTF16Ex(str, errors, -1), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } static PyObject * utf_16_be_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|zi:utf_16_be_encode", + if (!PyArg_ParseTuple(args, "O|O:utf_16_be_encode", &str, &errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors, - +1), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeUTF16Ex(str, errors, +1), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } static PyObject * unicode_escape_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|z:unicode_escape_encode", + if (!PyArg_ParseTuple(args, "O|O:unicode_escape_encode", &str,
&errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str)), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeUnicodeEscapeEx(str), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } static PyObject * raw_unicode_escape_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|z:raw_unicode_escape_encode", + if (!PyArg_ParseTuple(args, "O|O:raw_unicode_escape_encode", &str, &errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeRawUnicodeEscape( - PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str)), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeRawUnicodeEscapeEx(str), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } static PyObject * latin_1_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|z:latin_1_encode", + if (!PyArg_ParseTuple(args, "O|O:latin_1_encode", &str, &errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeLatin1( - PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeLatin1Ex(str, errors), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } static PyObject * ascii_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|z:ascii_encode", + if (!PyArg_ParseTuple(args, "O|O:ascii_encode", &str, &errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeASCII( - 
PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeASCIIEx(str, errors), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } static PyObject * charmap_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; PyObject *mapping = NULL; - if (!PyArg_ParseTuple(args, "O|zO:charmap_encode", + if (!PyArg_ParseTuple(args, "O|OO:charmap_encode", &str, &errors, &mapping)) return NULL; if (mapping == Py_None) @@ -585,12 +573,9 @@ str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeCharmap( - PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - mapping, - errors), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeCharmapEx(str, mapping, errors), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } @@ -602,20 +587,18 @@ PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|z:mbcs_encode", + if (!PyArg_ParseTuple(args, "O|O:mbcs_encode", &str, &errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeMBCS( - PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeMBCSEx(str, errors), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } @@ -625,36 +608,40 @@ /* --- Module API --------------------------------------------------------- */ static PyMethodDef _codecs_functions[] = { - {"register", codecregister, 1}, - {"lookup", codeclookup, 1}, - {"utf_8_encode", utf_8_encode, 1}, - {"utf_8_decode", utf_8_decode, 1}, - {"utf_16_encode", utf_16_encode, 1}, - {"utf_16_le_encode", utf_16_le_encode, 1}, - {"utf_16_be_encode", utf_16_be_encode, 1}, - {"utf_16_decode", utf_16_decode, 1}, - {"utf_16_le_decode", utf_16_le_decode, 1}, - {"utf_16_be_decode", 
utf_16_be_decode, 1}, - {"utf_16_ex_decode", utf_16_ex_decode, 1}, - {"unicode_escape_encode", unicode_escape_encode, 1}, - {"unicode_escape_decode", unicode_escape_decode, 1}, - {"unicode_internal_encode", unicode_internal_encode, 1}, - {"unicode_internal_decode", unicode_internal_decode, 1}, - {"raw_unicode_escape_encode", raw_unicode_escape_encode, 1}, - {"raw_unicode_escape_decode", raw_unicode_escape_decode, 1}, - {"latin_1_encode", latin_1_encode, 1}, - {"latin_1_decode", latin_1_decode, 1}, - {"ascii_encode", ascii_encode, 1}, - {"ascii_decode", ascii_decode, 1}, - {"charmap_encode", charmap_encode, 1}, - {"charmap_decode", charmap_decode, 1}, - {"readbuffer_encode", readbuffer_encode, 1}, - {"charbuffer_encode", charbuffer_encode, 1}, + {"register", codecregister, 1}, + {"lookup", codeclookup, 1}, + {"utf_8_encode", utf_8_encode, 1}, + {"utf_8_decode", utf_8_decode, 1}, + {"utf_16_encode", utf_16_encode, 1}, + {"utf_16_le_encode", utf_16_le_encode, 1}, + {"utf_16_be_encode", utf_16_be_encode, 1}, + {"utf_16_decode", utf_16_decode, 1}, + {"utf_16_le_decode", utf_16_le_decode, 1}, + {"utf_16_be_decode", utf_16_be_decode, 1}, + {"utf_16_ex_decode", utf_16_ex_decode, 1}, + {"unicode_escape_encode", unicode_escape_encode, 1}, + {"unicode_escape_decode", unicode_escape_decode, 1}, + {"unicode_internal_encode", unicode_internal_encode, 1}, + {"unicode_internal_decode", unicode_internal_decode, 1}, + {"raw_unicode_escape_encode", raw_unicode_escape_encode, 1}, + {"raw_unicode_escape_decode", raw_unicode_escape_decode, 1}, + {"latin_1_encode", latin_1_encode, 1}, + {"latin_1_decode", latin_1_decode, 1}, + {"ascii_encode", ascii_encode, 1}, + {"ascii_decode", ascii_decode, 1}, + {"charmap_encode", charmap_encode, 1}, + {"charmap_decode", charmap_decode, 1}, + {"readbuffer_encode", readbuffer_encode, 1}, + {"charbuffer_encode", charbuffer_encode, 1}, #ifdef MS_WIN32 - {"mbcs_encode", mbcs_encode, 1}, - {"mbcs_decode", mbcs_decode, 1}, + {"mbcs_encode", mbcs_encode, 
1}, + {"mbcs_decode", mbcs_decode, 1}, #endif - {NULL, NULL} /* sentinel */ + {"raise_unicodeencode_errors", PyCodec_RaiseUnicodeEncodeErrors, 1}, + {"ignore_unicodeencode_errors", PyCodec_IgnoreUnicodeEncodeErrors, 1}, + {"replace_unicodeencode_errors", PyCodec_ReplaceUnicodeEncodeErrors, 1}, + {"xmlcharrefreplace_unicodeencode_errors", PyCodec_XMLCharRefReplaceUnicodeEncodeErrors, 1}, + {NULL, NULL} /* sentinel */ }; DL_EXPORT(void) Index: Objects/unicodeobject.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Objects/unicodeobject.c,v retrieving revision 2.93 diff -u -r2.93 unicodeobject.c --- Objects/unicodeobject.c 2001/06/07 12:26:56 2.93 +++ Objects/unicodeobject.c 2001/06/13 13:14:05 @@ -497,47 +497,30 @@ return NULL; } -PyObject *PyUnicode_Encode(const Py_UNICODE *s, - int size, - const char *encoding, - const char *errors) +PyObject *PyUnicode_EncodeEx(PyObject *unicode, + const char *encoding, + PyObject *errors) { - PyObject *v, *unicode; - - unicode = PyUnicode_FromUnicode(s, size); - if (unicode == NULL) - return NULL; - v = PyUnicode_AsEncodedString(unicode, encoding, errors); - Py_DECREF(unicode); - return v; -} - -PyObject *PyUnicode_AsEncodedString(PyObject *unicode, - const char *encoding, - const char *errors) -{ PyObject *v; - + if (!PyUnicode_Check(unicode)) { PyErr_BadArgument(); goto onError; } - if (encoding == NULL) + if (encoding == NULL) encoding = PyUnicode_GetDefaultEncoding(); /* Shortcuts for common default encodings */ - if (errors == NULL) { - if (strcmp(encoding, "utf-8") == 0) - return PyUnicode_AsUTF8String(unicode); - else if (strcmp(encoding, "latin-1") == 0) - return PyUnicode_AsLatin1String(unicode); - else if (strcmp(encoding, "ascii") == 0) - return PyUnicode_AsASCIIString(unicode); - } + if (strcmp(encoding, "utf-8") == 0) + return PyUnicode_EncodeUTF8Ex(unicode, errors); + else if ((strcmp(encoding, "latin-1") == 0) || (strcmp(encoding, "iso-8859-1") == 0)) + 
return PyUnicode_EncodeLatin1Ex(unicode, errors); + else if (strcmp(encoding, "ascii") == 0) + return PyUnicode_EncodeASCIIEx(unicode, errors); /* Encode via the codec registry */ - v = PyCodec_Encode(unicode, encoding, errors); + v = PyCodec_EncodeEx(unicode, encoding, errors); if (v == NULL) goto onError; /* XXX Should we really enforce this ? */ @@ -554,6 +537,28 @@ return NULL; } +PyObject *PyUnicode_AsEncodedString(PyObject *unicode, + const char *encoding, + const char *errors) +{ + PyObject *errorstr; + PyObject *res; + + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) + return NULL; + } + else { + Py_INCREF(Py_None); + errorstr = Py_None; + } + + res = PyUnicode_AsEncodedStringEx(unicode, encoding, errorstr); + Py_DECREF(errorstr); + return res; +} + /* Return a Python string holding the default encoded value of the Unicode object. @@ -848,23 +853,36 @@ } #endif -PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, - int size, - const char *errors) +PyObject *PyUnicode_EncodeUTF8Ex(PyObject *unicode, + PyObject *errors) { + Py_UNICODE *s; + int size; PyObject *v; char *p; char *q; Py_UCS4 ch2; - unsigned int cbAllocated = 3 * size; + unsigned int cbAllocated; unsigned int cbWritten = 0; int i = 0; + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + errors = PyCodec_UnicodeEncodeHandlerForObject(errors); + if (errors == NULL) + return NULL; + s = PyUnicode_AS_UNICODE(unicode); + size = PyUnicode_GET_SIZE(unicode); + cbAllocated = 3 * size; v = PyString_FromStringAndSize(NULL, cbAllocated); if (v == NULL) return NULL; - if (size == 0) + if (size == 0) { + Py_DECREF(errors); return v; + } p = q = PyString_AS_STRING(v); while (i < size) { @@ -918,20 +936,44 @@ onError: Py_DECREF(v); + Py_DECREF(errors); return NULL; } -PyObject *PyUnicode_AsUTF8String(PyObject *unicode) +PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, + int size, + const char *errors) { - if (!PyUnicode_Check(unicode)) { - 
PyErr_BadArgument(); - return NULL; + PyObject *unicode; + PyObject *errorstr; + PyObject *res; + + unicode = PyUnicode_FromUnicode(s, size); + if (!unicode) + return NULL; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(unicode); + return NULL; + } + } + else { + Py_INCREF(Py_None); + errorstr = Py_None; } - return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - NULL); + + res = PyUnicode_EncodeUTF8Ex(unicode, errorstr); + Py_DECREF(unicode); + Py_DECREF(errorstr); + return res; } +PyObject *PyUnicode_AsUTF8String(PyObject *unicode) +{ + return PyUnicode_EncodeUTF8Ex(unicode, NULL); +} + /* --- UTF-16 Codec ------------------------------------------------------- */ static @@ -1084,53 +1126,91 @@ #undef UTF16_ERROR -PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s, - int size, - const char *errors, - int byteorder) -{ - PyObject *v; +PyObject *PyUnicode_EncodeUTF16Ex(PyObject *unicode, + PyObject *errors, + int byteorder) +{ + Py_UNICODE *s; + int size; + PyObject *v = NULL; Py_UNICODE *p; char *q; + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + errors = PyCodec_UnicodeEncodeHandlerForObject(errors); + if (errors == NULL) + return NULL; + + s = PyUnicode_AS_UNICODE(unicode); + size = PyUnicode_GET_SIZE(unicode); + /* We don't create UTF-16 pairs... 
*/ - v = PyString_FromStringAndSize(NULL, + v = PyString_FromStringAndSize(NULL, sizeof(Py_UNICODE) * (size + (byteorder == 0))); if (v == NULL) - return NULL; + goto finish; q = PyString_AS_STRING(v); p = (Py_UNICODE *)q; if (byteorder == 0) *p++ = 0xFEFF; - if (size == 0) - return v; - if (byteorder == 0 || + if (size > 0) { + if (byteorder == 0 || #ifdef BYTEORDER_IS_LITTLE_ENDIAN - byteorder == -1 + byteorder == -1 #else - byteorder == 1 + byteorder == 1 #endif - ) - Py_UNICODE_COPY(p, s, size); - else - while (size-- > 0) { - Py_UNICODE ch = *s++; - *p++ = (ch >> 8) | (ch << 8); - } + ) + Py_UNICODE_COPY(p, s, size); + else + while (size-- > 0) { + Py_UNICODE ch = *s++; + *p++ = (ch >> 8) | (ch << 8); + } + } + finish: + Py_DECREF(errors); return v; } -PyObject *PyUnicode_AsUTF16String(PyObject *unicode) + +PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s, + int size, + const char *errors, + int byteorder) { - if (!PyUnicode_Check(unicode)) { - PyErr_BadArgument(); - return NULL; + PyObject *unicode; + PyObject *errorstr; + PyObject *res; + + unicode = PyUnicode_FromUnicode(s, size); + if (!unicode) + return NULL; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(unicode); + return NULL; + } + } + else { + Py_INCREF(Py_None); + errorstr = Py_None; } - return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - NULL, - 0); + + res = PyUnicode_EncodeUTF16Ex(unicode, errorstr, byteorder); + Py_DECREF(unicode); + Py_DECREF(errorstr); + return res; +} + +PyObject *PyUnicode_AsUTF16String(PyObject *unicode) +{ + return PyUnicode_EncodeUTF16Ex(unicode, NULL, 0); } /* --- Unicode Escape Codec ----------------------------------------------- */ @@ -1430,6 +1510,18 @@ return NULL; } +PyObject *PyUnicode_EncodeUnicodeEscapeEx(PyObject *unicode) +{ + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + + return unicodeescape_string( + PyUnicode_AS_UNICODE(unicode), + 
PyUnicode_GET_SIZE(unicode), 0); +} + PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, int size) { @@ -1438,12 +1530,7 @@ PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) { - if (!PyUnicode_Check(unicode)) { - PyErr_BadArgument(); - return NULL; - } - return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode)); + return PyUnicode_EncodeUnicodeEscapeEx(unicode); } /* --- Raw Unicode Escape Codec ------------------------------------------- */ @@ -1524,15 +1611,22 @@ return NULL; } -PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, - int size) +PyObject *PyUnicode_EncodeRawUnicodeEscapeEx(PyObject *unicode) { PyObject *repr; char *p; char *q; + Py_UNICODE *s; + int size; static const char *hexdigit = "0123456789abcdef"; + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + s = PyUnicode_AS_UNICODE(unicode); + size = PyUnicode_GET_SIZE(unicode); repr = PyString_FromStringAndSize(NULL, 6 * size); if (repr == NULL) return NULL; @@ -1566,14 +1660,19 @@ return NULL; } -PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) +PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, + int size) { - if (!PyUnicode_Check(unicode)) { - PyErr_BadArgument(); + PyObject *unicode; + PyObject *res; + + unicode = PyUnicode_FromUnicode(s, size); + if (!unicode) return NULL; - } - return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode)); + + res = PyUnicode_EncodeRawUnicodeEscapeEx(unicode); + Py_DECREF(unicode); + return res; } /* --- Latin-1 Codec ------------------------------------------------------ */ @@ -1606,81 +1705,184 @@ return NULL; } -static -int latin1_encoding_error(const Py_UNICODE **source, - char **dest, - const char *errors, - const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "Latin-1 encoding error: %.400s", - details); - return -1; +/* 
Encode a Unicode object as ASCII (limit==128) or + latin-1 (limit==256) + + For this and the other encode functions the loop through + the string is done in the following way: A stack with two + strings is kept and the loop always encodes a character from + the string at the stacktop. If an error is encountered and + the stack has only one entry (during encoding of the original + string) the callback is called and the unicode object returned + is pushed onto the stack, so the encoding continues with the + replacement string. If the stack has two entries when an + error is encountered, the replacement string itself has + an unencodable character and an exception will be raised. + When the encoder has reached the end of it's current string + there are two possibilities: when the stack contains two + entries, this was the replacement string, so the replacement + string will be popped from the stack and encoding continues + with the next character from the original string. If the + stack had only one entry, encoding is finished. */ +static PyObject *unicode_encode_ucs1ex(PyObject *unicode, + PyObject *errors, int limit) +{ + /* current input position */ + int unicodepos; + /* output object */ + PyObject *res; + /* current output position */ + int respos = 0; + /* the next two variables are used as a "micro stack": + during processing of a replacement string unicode2 + and unicode2pos contain the values for the original + unicode object to be encoded */ + PyObject *unicode2 = NULL; + int unicode2pos = 0; + char *encoding = (limit == 256) ? 
"latin-1" : "ascii"; + + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; } - else if (strcmp(errors,"ignore") == 0) { - return 0; + errors = PyCodec_UnicodeEncodeHandlerForObject(errors); + if (errors == NULL) + return NULL; + /* allocate enough for a simple encoding without + replacements, if we need more, we'll resize */ + res = PyString_FromStringAndSize(NULL, PyUnicode_GET_SIZE(unicode)); + if (res == NULL) + goto onError; + if (PyUnicode_GET_SIZE(unicode) == 0) { + Py_DECREF(errors); + return res; + } + + for (unicodepos = 0;;++unicodepos) { + Py_UNICODE c; + /* finished with the string? */ + if (unicodepos == PyUnicode_GET_SIZE(unicode)) { + /* processing replacement? */ + if (unicode2) { + /* forget replacement */ + Py_DECREF(unicode); + /* switch back to original */ + unicode = unicode2; + unicodepos = unicode2pos; + unicode2 = NULL; + unicode2pos = 0; + /* maybe original is finished too? */ + continue; + } + else + /* processing original => finished */ + break; + } + c = PyUnicode_AS_UNICODE(unicode)[unicodepos]; + + /* we can't encode this */ + if (c>=limit) { + if ((c == Py_UNICODE_REPLACEMENT_CHARACTER) && unicode2) + /* use our own replacement character, but only when processing replacements */ + c = '?'; + else if (unicode2) { + /* error while replacing => report position in original */ + PyCodec_RaiseUnicodeEncodeError(encoding, c, unicode2pos); + goto onError; + } else { + /* use the callback */ + PyObject *args = Py_BuildValue("sOi", encoding, unicode, unicodepos); + if (args == NULL) + goto onError; + /* "push" original to secondary variables */ + unicode2 = unicode; + unicode2pos = unicodepos; + /* switch to replacement */ + unicode = PyEval_CallObject(errors, args); + Py_DECREF(args); + if (unicode == NULL) + goto onError; + if (!PyUnicode_Check(unicode)) { + PyErr_Format(PyExc_ValueError, + "encoding error handler must return unicode"); + goto onError; + } + unicodepos = -1; + /* retry with the replacement string */ + 
continue; + } + } + /* need more space? */ + if (respos == PyString_GET_SIZE(res)) { + /* allocate twice the space */ + if (_PyString_Resize(&res, 2*PyString_GET_SIZE(res))) + goto onError; + } + PyString_AS_STRING(res)[respos++] = (char)c; } - else if (strcmp(errors,"replace") == 0) { - **dest = '?'; - (*dest)++; - return 0; + /* Resize if we allocated to much */ + if (respos 0) { - Py_UNICODE ch = *p++; - if (ch >= 256) { - if (latin1_encoding_error(&p, &s, errors, - "ordinal not in range(256)")) - goto onError; + unicode = PyUnicode_FromUnicode(p, size); + if (!unicode) + return NULL; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(unicode); + return NULL; } - else - *s++ = (char)ch; } - /* Resize if error handling skipped some characters */ - if (s - start < PyString_GET_SIZE(repr)) - if (_PyString_Resize(&repr, s - start)) - goto onError; - return repr; + else { + Py_INCREF(Py_None); + errorstr = Py_None; + } - onError: - Py_DECREF(repr); - return NULL; + res = unicode_encode_ucs1ex(unicode, errorstr, limit); + Py_DECREF(unicode); + Py_DECREF(errorstr); + return res; +} + +PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, + int size, + const char *errors) +{ + return unicode_encode_ucs1(p, size, errors, 256); } PyObject *PyUnicode_AsLatin1String(PyObject *unicode) { - if (!PyUnicode_Check(unicode)) { - PyErr_BadArgument(); - return NULL; - } - return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - NULL); + return PyUnicode_EncodeLatin1Ex(unicode, NULL); } /* --- 7-bit ASCII Codec -------------------------------------------------- */ @@ -1754,81 +1956,22 @@ return NULL; } -static -int ascii_encoding_error(const Py_UNICODE **source, - char **dest, - const char *errors, - const char *details) +PyObject *PyUnicode_EncodeASCIIEx(PyObject *unicode, + PyObject *errors) { - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "ASCII encoding 
error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - **dest = '?'; - (*dest)++; - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "ASCII encoding error; " - "unknown error handling code: %.400s", - errors); - return -1; - } + return unicode_encode_ucs1ex(unicode, errors, 128); } PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, - int size, - const char *errors) + int size, + const char *errors) { - PyObject *repr; - char *s, *start; - - repr = PyString_FromStringAndSize(NULL, size); - if (repr == NULL) - return NULL; - if (size == 0) - return repr; - - s = PyString_AS_STRING(repr); - start = s; - while (size-- > 0) { - Py_UNICODE ch = *p++; - if (ch >= 128) { - if (ascii_encoding_error(&p, &s, errors, - "ordinal not in range(128)")) - goto onError; - } - else - *s++ = (char)ch; - } - /* Resize if error handling skipped some characters */ - if (s - start < PyString_GET_SIZE(repr)) - if (_PyString_Resize(&repr, s - start)) - goto onError; - return repr; - - onError: - Py_DECREF(repr); - return NULL; + return unicode_encode_ucs1(p, size, errors, 128); } PyObject *PyUnicode_AsASCIIString(PyObject *unicode) { - if (!PyUnicode_Check(unicode)) { - PyErr_BadArgument(); - return NULL; - } - return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - NULL); + return PyUnicode_EncodeASCIIEx(unicode, NULL); } #ifdef MS_WIN32 @@ -1861,20 +2004,26 @@ return (PyObject *)v; } -PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, - int size, - const char *errors) +PyObject *PyUnicode_EncodeMBCSEx(PyObject *unicode, + PyObject *errors) { PyObject *repr; - char *s; DWORD mbcssize; + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + /* If there are no characters, bail now! 
*/ - if (size==0) - return PyString_FromString(""); + if (PyUnicode_GET_SIZE(unicode) == 0) + return PyString_FromString(""); /* First get the size of the result */ - mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); + mbcssize = WideCharToMultiByte(CP_ACP, 0, + PyUnicode_AS_UNICODE(unicode), + PyUnicode_GET_SIZE(unicode), + NULL, 0, NULL, NULL); if (mbcssize==0) return PyErr_SetFromWindowsErrWithFilename(0, NULL); @@ -1885,14 +2034,46 @@ return repr; /* Do the conversion */ - s = PyString_AS_STRING(repr); - if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { + if (0 == WideCharToMultiByte(CP_ACP, 0, + PyUnicode_AS_UNICODE(unicode), + PyUnicode_GET_SIZE(unicode), + PyString_AS_STRING(repr), + mbcssize, NULL, NULL)) { Py_DECREF(repr); return PyErr_SetFromWindowsErrWithFilename(0, NULL); } return repr; } +PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, + int size, + const char *errors) +{ + PyObject *unicode; + PyObject *errorstr; + PyObject *res; + + unicode = PyUnicode_FromUnicode(p, size); + if (!unicode) + return NULL; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(unicode); + return NULL; + } + } + else { + Py_INCREF(Py_None); + errorstr = Py_None; + } + + res = PyUnicode_EncodeMBCSEx(unicode, errorstr); + Py_DECREF(unicode); + Py_DECREF(errorstr); + return res; +} + #endif /* MS_WIN32 */ /* --- Character Mapping Codec -------------------------------------------- */ @@ -2034,61 +2215,73 @@ return NULL; } -static -int charmap_encoding_error(const Py_UNICODE **source, - char **dest, - const char *errors, - const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "charmap encoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - **dest = '?'; - (*dest)++; - return 0; - } - else { - 
PyErr_Format(PyExc_ValueError, - "charmap encoding error; " - "unknown error handling code: %.400s", - errors); - return -1; - } -} +PyObject *PyUnicode_EncodeCharmapEx(PyObject *unicode, + PyObject *mapping, + PyObject *errors) +{ + /* current input position */ + int unicodepos; + /* output object */ + PyObject *res; + /* current output position */ + int respos = 0; + /* the next two variables are used as a "micro stack": + during processing of a replacement string unicode2 + and unicode2pos contain the values for the original + unicode object to be encoded */ + PyObject *unicode2 = NULL; + int unicode2pos = 0; -PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, - int size, - PyObject *mapping, - const char *errors) -{ - PyObject *v; - char *s; int extrachars = 0; /* Default to Latin-1 */ if (mapping == NULL) - return PyUnicode_EncodeLatin1(p, size, errors); + return PyUnicode_EncodeLatin1Ex(unicode, errors); - v = PyString_FromStringAndSize(NULL, size); - if (v == NULL) + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + /* result first, handler second: every later exit (including empty input, which simply falls through the loop) releases both */ + res = PyString_FromStringAndSize(NULL, PyUnicode_GET_SIZE(unicode)); + if (res == NULL) + return NULL; + errors = PyCodec_UnicodeEncodeHandlerForObject(errors); + if (errors == NULL) { + Py_DECREF(res); return NULL; - if (size == 0) - return v; - s = PyString_AS_STRING(v); - while (size-- > 0) { - Py_UNICODE ch = *p++; + } + for (unicodepos = 0;;++unicodepos) { + Py_UNICODE c; PyObject *w, *x; + /* finished with current string? */ + if (unicodepos == PyUnicode_GET_SIZE(unicode)) { + /* currently processing replacement? */ + if (unicode2) { + /* forget replacement string */ + Py_DECREF(unicode); + /* switch back to original */ + unicode = unicode2; + unicodepos = unicode2pos; + unicode2 = NULL; + unicode2pos = 0; + /* maybe original is finished too? 
*/ + continue; + } + else + /* currently processing original => finished */ + break; + } + + c = PyUnicode_AS_UNICODE(unicode)[unicodepos]; + + /* use our own replacement character, but only when processing replacements */ + if ((c == Py_UNICODE_REPLACEMENT_CHARACTER) && unicode2) + c = '?'; + /* Get mapping (Unicode ordinal -> string char, integer or None) */ - w = PyInt_FromLong((long)ch); + w = PyInt_FromLong((long)c); if (w == NULL) goto onError; x = PyObject_GetItem(mapping, w); @@ -2112,39 +2305,67 @@ Py_DECREF(x); goto onError; } - *s++ = (char)value; + PyString_AS_STRING(res)[respos++] = (char)value; } + /* undefined mapping */ else if (x == Py_None) { - /* undefined mapping */ - if (charmap_encoding_error(&p, &s, errors, - "character maps to <undefined>")) { + /* error while replacing */ + if (unicode2) { + /* report original position; FIXME should we give a better name? */ + PyCodec_RaiseUnicodeEncodeError("charmap", c, unicode2pos); Py_DECREF(x); goto onError; } + else { + /* use the callback */ + PyObject *args = Py_BuildValue("sOi", "charmap", unicode, unicodepos); + if (args == NULL) { + Py_DECREF(x); + goto onError; + } + /* "push" original to secondary variables */ + unicode2 = unicode; + unicode2pos = unicodepos; + /* switch to replacement */ + unicode = PyEval_CallObject(errors, args); + Py_DECREF(args); + if (unicode == NULL) { + Py_DECREF(x); + goto onError; + } + if (!PyUnicode_Check(unicode)) { + PyErr_Format(PyExc_ValueError, + "encoding error handler must return unicode"); + Py_DECREF(x); + goto onError; /* onError releases the replacement object */ + } + Py_DECREF(x); /* the bottom-of-loop DECREF is skipped by continue */ + unicodepos = -1; + /* retry with the replacement string */ + continue; + } } else if (PyString_Check(x)) { int targetsize = PyString_GET_SIZE(x); if (targetsize == 1) /* 1-1 mapping */ - *s++ = *PyString_AS_STRING(x); + PyString_AS_STRING(res)[respos++] = *PyString_AS_STRING(x); else if (targetsize > 1) { /* 1-n mapping */ if (targetsize > extrachars) { /* resize first */ - int oldpos = (int)(s - PyString_AS_STRING(v)); int 
needed = (targetsize - extrachars) + \ - (targetsize << 2); + (targetsize << 2); extrachars += needed; - if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) { + if (_PyString_Resize(&res, PyString_GET_SIZE(res) + needed)) { Py_DECREF(x); goto onError; } - s = PyString_AS_STRING(v) + oldpos; } - memcpy(s, PyString_AS_STRING(x), targetsize); - s += targetsize; + memcpy(&PyString_AS_STRING(res)[respos], PyString_AS_STRING(x), targetsize); + respos += targetsize; extrachars -= targetsize; } /* 1-0 mapping: skip the character */ @@ -2158,29 +2379,57 @@ } Py_DECREF(x); } - if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v)) - if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v)))) + /* Resize if we allocated too much */ + if (respos < PyString_GET_SIZE(res)) + if (_PyString_Resize(&res, respos)) goto onError; - return v; + Py_DECREF(errors); + return res; onError: - Py_DECREF(v); + Py_XDECREF(res); + if (unicode2) { /* free replacement */ + Py_XDECREF(unicode); + } + Py_DECREF(errors); return NULL; } -PyObject *PyUnicode_AsCharmapString(PyObject *unicode, - PyObject *mapping) +PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, + int size, + PyObject *mapping, + const char *errors) { - if (!PyUnicode_Check(unicode) || mapping == NULL) { - PyErr_BadArgument(); + PyObject *unicode; + PyObject *errorstr; + PyObject *res; + + unicode = PyUnicode_FromUnicode(p, size); + if (!unicode) return NULL; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(unicode); + return NULL; + } + } + else { + Py_INCREF(Py_None); + errorstr = Py_None; } - return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - mapping, - NULL); + + res = PyUnicode_EncodeCharmapEx(unicode, mapping, errorstr); + Py_DECREF(unicode); + Py_DECREF(errorstr); + return res; } +PyObject *PyUnicode_AsCharmapString(PyObject *unicode, PyObject *mapping) +{ + return PyUnicode_EncodeCharmapEx(unicode, mapping, NULL); +} + static int translate_error(const 
Py_UNICODE **source, Py_UNICODE **dest, @@ -2316,58 +2565,143 @@ /* --- Decimal Encoder ---------------------------------------------------- */ -int PyUnicode_EncodeDecimal(Py_UNICODE *s, - int length, - char *output, - const char *errors) -{ - Py_UNICODE *p, *end; +int PyUnicode_EncodeDecimalEx(PyObject *unicode, + char *output, + PyObject *errors) +{ + /* current input position */ + int unicodepos; + /* the next two variables are used as a "micro stack": + during processing of a replacement string unicode2 + and unicode2pos contain the values for the original + unicode object to be encoded */ + PyObject *unicode2 = NULL; + int unicode2pos = 0; if (output == NULL) { PyErr_BadArgument(); return -1; } - p = s; - end = s + length; - while (p < end) { - register Py_UNICODE ch = *p++; + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return -1; + } + errors = PyCodec_UnicodeEncodeHandlerForObject(errors); + if (errors == NULL) + return -1; + + for (unicodepos = 0;;++unicodepos) { + Py_UNICODE c; int decimal; - - if (Py_UNICODE_ISSPACE(ch)) { + PyObject *args; + /* finished with the string? */ + if (unicodepos == PyUnicode_GET_SIZE(unicode)) { + /* processing replacement? */ + if (unicode2) { + /* forget replacement */ + Py_DECREF(unicode); + /* switch back to original */ + unicode = unicode2; + unicodepos = unicode2pos; + unicode2 = NULL; + unicode2pos = 0; + /* maybe original is finished too? 
*/ + continue; + } + else + /* processing original => finished */ + break; + } + c = PyUnicode_AS_UNICODE(unicode)[unicodepos]; + + if (Py_UNICODE_ISSPACE(c)) { *output++ = ' '; continue; } - decimal = Py_UNICODE_TODECIMAL(ch); + decimal = Py_UNICODE_TODECIMAL(c); if (decimal >= 0) { *output++ = '0' + decimal; continue; } - if (0 < ch && ch < 256) { - *output++ = (char)ch; + if (0 < c && c < 256) { + *output++ = (char)c; continue; - } - /* All other characters are considered invalid */ - if (errors == NULL || strcmp(errors, "strict") == 0) { - PyErr_SetString(PyExc_ValueError, - "invalid decimal Unicode string"); - goto onError; } - else if (strcmp(errors, "ignore") == 0) - continue; - else if (strcmp(errors, "replace") == 0) { + if ((c == Py_UNICODE_REPLACEMENT_CHARACTER) && unicode2) { *output++ = '?'; continue; } + /* All other characters are considered invalid */ + if (unicode2) { + /* error while replacing => report position in original */ + PyCodec_RaiseUnicodeEncodeError("decimal", c, unicode2pos); + goto onError; + } + args = Py_BuildValue("sOi", "decimal", unicode, unicodepos); + if (args == NULL) + goto onError; + /* "push" original to secondary variables */ + unicode2 = unicode; + unicode2pos = unicodepos; + /* switch to replacement */ + unicode = PyEval_CallObject(errors, args); + Py_DECREF(args); + if (unicode == NULL) + goto onError; + if (!PyUnicode_Check(unicode)) { + PyErr_Format(PyExc_ValueError, + "encoding error handler must return unicode"); + goto onError; + } + /* retry with the replacement string */ + unicodepos = -1; } /* 0-terminate the output string */ *output++ = '\0'; + Py_DECREF(errors); return 0; onError: + Py_DECREF(errors); + /* free replacement */ + if (unicode2) { + Py_XDECREF(unicode); + } + return -1; } +int PyUnicode_EncodeDecimal(Py_UNICODE *s, + int length, + char *output, + const char *errors) +{ + PyObject *unicode; + PyObject *errorstr; + int res; + + unicode = PyUnicode_FromUnicode(s, length); + if (!unicode) + return -1; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(unicode); + return -1; + } + } + else { + Py_INCREF(Py_None); + errorstr = Py_None; + } + + 
res = PyUnicode_EncodeDecimalEx(unicode, output, errorstr); + Py_DECREF(unicode); + Py_DECREF(errorstr); + return res; +} + /* --- Helpers ------------------------------------------------------------ */ static @@ -3475,17 +3803,21 @@ \n\ Return an encoded string version of S. Default encoding is the current\n\ default string encoding. errors may be given to set a different error\n\ -handling scheme. Default is 'strict' meaning that encoding errors raise\n\ -a ValueError. Other possible values are 'ignore' and 'replace'."; +handling scheme. Default is None meaning that encoding errors raise\n\ +a UnicodeError. 'strict' does the same. Other possible values are\n\ +ignore' and 'replace' or a callable that will be called with the encoding,\n\ +the original string and the position of the unencodable character and must\n\ +return a unicode string that will be encoded instead of the unencodable\n\ +character."; static PyObject * unicode_encode(PyUnicodeObject *self, PyObject *args) { char *encoding = NULL; - char *errors = NULL; - if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) + PyObject *errors = NULL; + if (!PyArg_ParseTuple(args, "|sO:encode", &encoding, &errors)) return NULL; - return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); + return PyUnicode_AsEncodedStringEx((PyObject *)self, encoding, errors); } static char expandtabs__doc__[] = Index: Python/codecs.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Python/codecs.c,v retrieving revision 2.13 diff -u -r2.13 codecs.c --- Python/codecs.c 2000/09/26 05:46:01 2.13 +++ Python/codecs.c 2001/06/13 13:14:39 @@ -236,11 +236,47 @@ return args; } +static +PyObject *args_tupleex(PyObject *object, + PyObject *errors) +{ + PyObject *args; + + args = PyTuple_New(2); + if (args == NULL) + return NULL; + Py_INCREF(object); + PyTuple_SET_ITEM(args,0,object); + errors = PyCodec_UnicodeEncodeHandlerForObject(errors); + if (!errors) { + 
Py_DECREF(args); + return NULL; + } + PyTuple_SET_ITEM(args, 1, errors); + return args; +} + /* Build a codec by calling factory(stream[,errors]) or just factory(errors) depending on whether the given parameters are non-NULL. */ static +PyObject *build_stream_codecex(PyObject *factory, + PyObject *stream, + PyObject *errors) +{ + PyObject *args, *codec; + + args = args_tupleex(stream, errors); + if (args == NULL) + return NULL; + + codec = PyEval_CallObject(factory, args); + Py_DECREF(args); + return codec; +} + +static PyObject *build_stream_codec(PyObject *factory, PyObject *stream, const char *errors) @@ -309,29 +345,51 @@ return NULL; } -PyObject *PyCodec_StreamWriter(const char *encoding, - PyObject *stream, - const char *errors) +PyObject *PyCodec_StreamWriterEx(const char *encoding, + PyObject *stream, + PyObject *errors) { PyObject *codecs; codecs = _PyCodec_Lookup(encoding); if (codecs == NULL) goto onError; - return build_stream_codec(PyTuple_GET_ITEM(codecs,3),stream,errors); + return build_stream_codecex(PyTuple_GET_ITEM(codecs,3),stream,errors); onError: return NULL; } +PyObject *PyCodec_StreamWriter(const char *encoding, + PyObject *stream, + const char *errors) +{ + PyObject *errorstr; + PyObject *res; + + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) + return NULL; + } + else { + Py_INCREF(Py_None); + errorstr = Py_None; + } + + res = PyCodec_StreamWriterEx(encoding,stream,errorstr); + Py_DECREF(errorstr); + return res; +} + /* Encode an object (e.g. an Unicode object) using the given encoding and return the resulting encoded object (usually a Python string). errors is passed to the encoder factory as argument if non-NULL. 
*/ -PyObject *PyCodec_Encode(PyObject *object, - const char *encoding, - const char *errors) +PyObject *PyCodec_EncodeEx(PyObject *object, + const char *encoding, + PyObject *errors) { PyObject *encoder = NULL; PyObject *args = NULL, *result; @@ -341,11 +399,11 @@ if (encoder == NULL) goto onError; - args = args_tuple(object, errors); + args = args_tupleex(object, errors); if (args == NULL) goto onError; - result = PyEval_CallObject(encoder,args); + result = PyEval_CallObject(encoder, args); if (result == NULL) goto onError; @@ -370,6 +428,30 @@ return NULL; } +PyObject *PyCodec_Encode(PyObject *object, + const char *encoding, + const char *errors) +{ + PyObject *errorstr; + PyObject *res; + + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + /* object belongs to the caller; this function never took a reference to it */ + return NULL; + } + } + else { + Py_INCREF(Py_None); + errorstr = Py_None; + } + + res = PyCodec_EncodeEx(object, encoding, errorstr); + Py_DECREF(errorstr); + return res; +} + /* Decode an object (usually a Python string) using the given encoding and return an equivalent object (e.g. an Unicode object). @@ -414,6 +496,156 @@ Py_XDECREF(decoder); Py_XDECREF(result); return NULL; +} + +/* return a new reference to one of the builtin unicode encode + error handlers or None. 
+ error can be: + + * NULL, Py_None, "strict" or u"strict" for + codecs.raise_unicodeencode_errors + * "ignore" or u"ignore" for codecs.ignore_unicodeencode_errors + * "replace" or u"replace" for codecs.replace_unicodeencode_errors + * a callable which will be returned directly + + everything else will raise an exception */ +PyObject *PyCodec_UnicodeEncodeHandlerForObject(PyObject *error) +{ + static Py_UNICODE strict[] = { 's', 't', 'r', 'i', 'c', 't' }; + static Py_UNICODE ignore[] = { 'i', 'g', 'n', 'o', 'r', 'e' }; + static Py_UNICODE replace[] = { 'r', 'e', 'p', 'l', 'a', 'c', 'e' }; + static PyMethodDef strictMethod = { + "raise_unicodeencode_errors", + PyCodec_RaiseUnicodeEncodeErrors, + METH_VARARGS + }; + static PyMethodDef ignoreMethod = { + "ignore_unicodeencode_errors", + PyCodec_IgnoreUnicodeEncodeErrors, + METH_VARARGS + }; + static PyMethodDef replaceMethod = { + "replace_unicodeencode_errors", + PyCodec_ReplaceUnicodeEncodeErrors, + METH_VARARGS + }; + PyMethodDef *method = NULL; + PyObject *res = NULL; + + if (error==NULL || error==Py_None) + method = &strictMethod; + else if (PyCallable_Check(error)) { + res = error; + Py_INCREF(error); + } + else if (PyString_Check(error)) { + char *s = PyString_AS_STRING(error); + int size = PyString_GET_SIZE(error); + if (size==6 && !memcmp(s, "strict", size)) + method = &strictMethod; + else if (size==6 && !memcmp(s, "ignore", size)) + method = &ignoreMethod; + else if (size==7 && !memcmp(s, "replace", size)) + method = &replaceMethod; + else + PyErr_SetString(PyExc_ValueError, "unknown error handler name"); + } + else if (PyUnicode_Check(error)) { + Py_UNICODE *s = PyUnicode_AS_UNICODE(error); + int size = PyUnicode_GET_SIZE(error); + if (size==(int)(sizeof(strict)/sizeof(strict[0])) && !memcmp(s, strict, sizeof(strict))) + method = &strictMethod; + else if (size==(int)(sizeof(ignore)/sizeof(ignore[0])) && !memcmp(s, ignore, sizeof(ignore))) + method = &ignoreMethod; + else if (size==(int)(sizeof(replace)/sizeof(replace[0])) && !memcmp(s, replace, sizeof(replace))) + method = 
&replaceMethod; + else + PyErr_SetString(PyExc_ValueError, "unknown error handler name"); + } + else + PyErr_SetString(PyExc_TypeError, "wrong type for error handler"); + if (method) + res = PyCFunction_New(method, NULL); + return res; +} + + +void PyCodec_RaiseUnicodeEncodeError(const char *encoding, Py_UNICODE c, int pos) +{ + PyErr_Format(PyExc_UnicodeError, + "encoding '%.400s' can't encode character '\\u%x' in position %d", + encoding, (int)c, pos); +} + + +PyObject *PyCodec_RaiseUnicodeEncodeErrors(PyObject *self, PyObject *args) +{ + char *encoding; + Py_UNICODE *unicode; + int pos; + + if (PyArg_ParseTuple(args, "sui:raise_unicodeencode_errors", &encoding, &unicode, &pos)) + PyCodec_RaiseUnicodeEncodeError(encoding, unicode[pos], pos); + return NULL; +} + + +PyObject *PyCodec_IgnoreUnicodeEncodeErrors(PyObject *self, PyObject *args) +{ + char *encoding; + Py_UNICODE *unicode; + int pos; + + if (!PyArg_ParseTuple(args, "sui:ignore_unicodeencode_errors", &encoding, &unicode, &pos)) + return NULL; + + return PyUnicode_FromUnicode(NULL, 0); +} + + +PyObject *PyCodec_ReplaceUnicodeEncodeErrors(PyObject *self, PyObject *args) +{ + char *encoding; + Py_UNICODE *unicode; + int pos; + Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER; + + if (!PyArg_ParseTuple(args, "sui:replace_unicodeencode_errors", &encoding, &unicode, &pos)) + return NULL; + + return PyUnicode_FromUnicode(&res, 1); +} + +PyObject *PyCodec_XMLCharRefReplaceUnicodeEncodeErrors(PyObject *self, PyObject *args) +{ + static Py_UNICODE hexdigits[] = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' + }; + char *encoding; + Py_UNICODE *unicode; + int pos; + Py_UNICODE buf[9]; + Py_UNICODE *p = buf; + Py_UNICODE c; + + if (!PyArg_ParseTuple(args, "sui:xmlcharrefreplace_unicodeencode_errors", &encoding, &unicode, &pos)) + return NULL; + + c = unicode[pos]; + *p++ = '&'; + *p++ = '#'; + *p++ = 'x'; + if (c>=0x1000) + *p++ = hexdigits[(c>>12)&0xf]; /* masked like the other digits so the index stays within hexdigits */ + if (c>=0x0100) + *p++ = 
hexdigits[(c>>8)&0xf]; + if (c>=0x0010) + *p++ = hexdigits[(c>>4)&0xf]; + *p++ = hexdigits[c&0xf]; + *p++ = ';'; + + return PyUnicode_FromUnicode(buf, p-buf); } void _PyCodecRegistry_Init(void)