Index: Include/codecs.h =================================================================== RCS file: /cvsroot/python/python/dist/src/Include/codecs.h,v retrieving revision 2.3 diff -u -r2.3 codecs.h --- Include/codecs.h 2000/08/03 16:24:24 2.3 +++ Include/codecs.h 2001/07/27 03:17:23 @@ -53,12 +53,19 @@ object is passed through the encoder function found for the given encoding using the error handling method defined by errors. errors - may be NULL to use the default method defined for the codec. + may be NULL to use the strict encoding. Raises a LookupError in case no encoder can be found. */ +extern DL_IMPORT(PyObject *) PyCodec_EncodeEx( + PyObject *object, + const char *encoding, + PyObject *errors + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject *) PyCodec_Encode( PyObject *object, const char *encoding, @@ -69,12 +76,19 @@ object is passed through the decoder function found for the given encoding using the error handling method defined by errors. errors - may be NULL to use the default method defined for the codec. + may be NULL to use strict error handling. Raises a LookupError in case no encoder can be found. */ +extern DL_IMPORT(PyObject *) PyCodec_DecodeEx( + PyObject *object, + const char *encoding, + PyObject *errors + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject *) PyCodec_Decode( PyObject *object, const char *encoding, @@ -103,6 +117,13 @@ /* Get a StreamReader factory function for the given encoding. */ +extern DL_IMPORT(PyObject *) PyCodec_StreamReaderEx( + const char *encoding, + PyObject *stream, + PyObject *errors + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject *) PyCodec_StreamReader( const char *encoding, PyObject *stream, @@ -111,11 +132,85 @@ /* Get a StreamWriter factory function for the given encoding. 
*/ +extern DL_IMPORT(PyObject *) PyCodec_StreamWriterEx( + const char *encoding, + PyObject *stream, + PyObject *errors + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject *) PyCodec_StreamWriter( const char *encoding, PyObject *stream, const char *errors ); + +/* Unicode encoding error handling callback registry API */ + +/* Register the encoding error handling callback function error under the name name; + this function will be called by the encoder when it encounters + an unencodable character; name is specified as the error parameter + in the call to the encode function. + Return 0 on success, -1 on error */ +extern DL_IMPORT(int) PyCodec_RegisterUnicodeEncodeErrorHandler(char *name, PyObject *error); + +/* Lookup the error handling callback function registered under the name error + if error is a string or unicode object. As special cases NULL or Py_None can be + passed, in which case the error handling callback for strict encoding will be returned. + If error is callable, a new reference to it will be returned directly. */ +extern DL_IMPORT(PyObject *) PyCodec_LookupUnicodeEncodeErrorHandler(PyObject *error); + +/* Raises a Unicode exception */ +extern DL_IMPORT(void) PyCodec_RaiseUnicodeEncodeError(const char *encoding, Py_UNICODE c, int pos, const char *reason); + +/* Encode error handler that raises an exception */ +extern DL_IMPORT(PyObject *) PyCodec_RaiseUnicodeEncodeErrors(PyObject *self, PyObject *args); + +/* Encode error handler that returns an empty string and so ignores the + unencodable character */ +extern DL_IMPORT(PyObject *) PyCodec_IgnoreUnicodeEncodeErrors(PyObject *self, PyObject *args); + +/* Encode error handler that returns u"?"
and thus replaces + the unencodable character */ +extern DL_IMPORT(PyObject *) PyCodec_ReplaceUnicodeEncodeErrors(PyObject *self, PyObject *args); + +/* Encode error handler that returns an XML character reference for the + unencodable character */ +extern DL_IMPORT(PyObject *) PyCodec_XMLCharRefReplaceUnicodeEncodeErrors(PyObject *self, PyObject *args); + +/* Encode error handler that returns an \u (or \U) escape sequence + for the unencodable character */ +extern DL_IMPORT(PyObject *) PyCodec_EscapeReplaceUnicodeEncodeErrors(PyObject *self, PyObject *args); + + +/* Unicode decoding error handling callback registry API */ + +/* Register the decoding error handling callback function error under the name name; + this function will be called by the decoder when it encounters + undecodable bytes; name is specified as the error parameter + in the call to the decode function. + Return 0 on success, -1 on error */ +extern DL_IMPORT(int) PyCodec_RegisterUnicodeDecodeErrorHandler(char *name, PyObject *error); + +/* Lookup the decoding error handling callback function registered under the name error + if error is a string or unicode object. As special cases NULL or Py_None can be + passed, in which case the error handling callback for strict decoding will be returned. + If error is callable, a new reference to it will be returned directly.
*/ +extern DL_IMPORT(PyObject *) PyCodec_LookupUnicodeDecodeErrorHandler(PyObject *error); + +/* Raises a Unicode exception */ +extern DL_IMPORT(void) PyCodec_RaiseUnicodeDecodeError(const char *encoding, char c, int pos, const char *reason); + +/* Decode error handler that raises an exception */ +extern DL_IMPORT(PyObject *) PyCodec_RaiseUnicodeDecodeErrors(PyObject *self, PyObject *args); + +/* Decode error handler that returns an empty string and so ignores the + undecodable byte (probably resulting in more errors from the next bytes) */ +extern DL_IMPORT(PyObject *) PyCodec_IgnoreUnicodeDecodeErrors(PyObject *self, PyObject *args); + +/* Decode error handler that returns "?" as a replacement for + the undecodable byte. */ +extern DL_IMPORT(PyObject *) PyCodec_ReplaceUnicodeDecodeErrors(PyObject *self, PyObject *args); #ifdef __cplusplus } Index: Include/stringobject.h =================================================================== RCS file: /cvsroot/python/python/dist/src/Include/stringobject.h,v retrieving revision 2.28 diff -u -r2.28 stringobject.h --- Include/stringobject.h 2001/06/16 05:11:17 2.28 +++ Include/stringobject.h 2001/07/27 03:17:23 @@ -83,9 +83,15 @@ /* --- Generic Codecs ----------------------------------------------------- */ -/* Create an object by decoding the encoded string s of the - given size. */ +/* Create an object by decoding the encoded string object */ +extern DL_IMPORT(PyObject*) PyString_DecodeEx( + PyObject *str, /* encoded string object */ + const char *encoding, /* encoding */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyString_Decode( const char *s, /* encoded string */ int size, /* size of buffer */ @@ -93,9 +99,15 @@ const char *errors /* error handling */ ); -/* Encodes a char buffer of the given size and returns a - Python object. */ +/* Encodes a string object and returns a Python object.
*/ +extern DL_IMPORT(PyObject*) PyString_EncodeEx( + PyObject *str, /* string object to encode */ + const char *encoding, /* encoding */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyString_Encode( const char *s, /* string char buffer */ int size, /* number of chars to encode */ @@ -106,6 +118,12 @@ /* Encodes a string object and returns the result as Python object. */ +extern DL_IMPORT(PyObject*) PyString_AsEncodedObjectEx( + PyObject *str, /* string object */ + const char *encoding, /* encoding */ + PyObject *errors /* error handling */ + ); + extern DL_IMPORT(PyObject*) PyString_AsEncodedObject( PyObject *str, /* string object */ const char *encoding, /* encoding */ @@ -129,6 +147,13 @@ /* Decodes a string object and returns the result as Python object. */ +extern DL_IMPORT(PyObject*) PyString_AsDecodedObjectEx( + PyObject *str, /* string object */ + const char *encoding, /* encoding */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyString_AsDecodedObject( PyObject *str, /* string object */ const char *encoding, /* encoding */ Index: Include/unicodeobject.h =================================================================== RCS file: /cvsroot/python/python/dist/src/Include/unicodeobject.h,v retrieving revision 2.27 diff -u -r2.27 unicodeobject.h --- Include/unicodeobject.h 2001/06/27 22:08:26 2.27 +++ Include/unicodeobject.h 2001/07/27 03:17:24 @@ -316,6 +316,13 @@ */ +extern DL_IMPORT(PyObject*) PyUnicode_FromEncodedObjectEx( + register PyObject *obj, /* Object */ + const char *encoding, /* encoding */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_FromEncodedObject( register PyObject *obj, /* Object */ const char *encoding, /* encoding */ @@ -409,8 +416,13 @@ /* --- Generic Codecs ----------------------------------------------------- */ -/* Create a Unicode object by decoding the encoded string s of the - given
size. */ +/* Create a Unicode object by decoding the encoded string object str */ + +extern DL_IMPORT(PyObject*) PyUnicode_DecodeEx( + PyObject *str, /* encoded string object */ + const char *encoding, /* encoding */ + PyObject *errors /* error handling */ + ); extern DL_IMPORT(PyObject*) PyUnicode_Decode( const char *s, /* encoded string */ @@ -419,9 +431,15 @@ const char *errors /* error handling */ ); -/* Encodes a Py_UNICODE buffer of the given size and returns a - Python string object. */ +/* Encodes a Unicode object and returns a Python string object. */ +extern DL_IMPORT(PyObject*) PyUnicode_EncodeEx( + PyObject *unicode, /* Unicode object */ + const char *encoding, /* encoding */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_Encode( const Py_UNICODE *s, /* Unicode char buffer */ int size, /* number of Py_UNICODE chars to encode */ @@ -429,9 +447,12 @@ const char *errors /* error handling */ ); +#define PyUnicode_AsEncodedStringEx PyUnicode_EncodeEx + /* Encodes a Unicode object and returns the result as Python string object. 
*/ +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString( PyObject *unicode, /* Unicode object */ const char *encoding, /* encoding */ @@ -440,6 +461,12 @@ /* --- UTF-8 Codecs ------------------------------------------------------- */ +extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8Ex( + PyObject *string, /* UTF-8 encoded string object */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8( const char *string, /* UTF-8 encoded string */ int length, /* size of string */ @@ -450,6 +477,12 @@ PyObject *unicode /* Unicode object */ ); +extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8Ex( + PyObject *unicode, /* Unicode object */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8( const Py_UNICODE *data, /* Unicode char buffer */ int length, /* number of Py_UNICODE chars to encode */ @@ -481,6 +514,15 @@ */ +extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16Ex( + PyObject *string, /* UTF-16 encoded string object */ + PyObject *errors, /* error handling */ + int *byteorder /* pointer to byteorder to use + 0=native;-1=LE,1=BE; updated on + exit */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16( const char *string, /* UTF-16 encoded string */ int length, /* size of string */ @@ -517,6 +559,13 @@ */ +extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16Ex( + PyObject *unicode, /* Unicode object */ + PyObject *errors, /* error handling */ + int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16( const Py_UNICODE *data, /* Unicode char buffer */ int length, /* number of Py_UNICODE chars to encode */ @@ -526,6 +575,12 @@ /* --- Unicode-Escape Codecs ---------------------------------------------- */ +extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscapeEx( + PyObject *string, /* Unicode-Escape encoded string object */ + PyObject 
*errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape( const char *string, /* Unicode-Escape encoded string */ int length, /* size of string */ @@ -536,6 +591,10 @@ PyObject *unicode /* Unicode object */ ); +extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscapeEx( + PyObject *unicode /* Unicode object */ + ); + extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape( const Py_UNICODE *data, /* Unicode char buffer */ int length /* Number of Py_UNICODE chars to encode */ @@ -543,13 +602,19 @@ /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ +extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscapeEx( + PyObject *string, /* Raw-Unicode-Escape encoded string object*/ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape( const char *string, /* Raw-Unicode-Escape encoded string */ int length, /* size of string */ const char *errors /* error handling */ ); -extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString( +extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscapeEx( PyObject *unicode /* Unicode object */ ); @@ -558,12 +623,19 @@ int length /* Number of Py_UNICODE chars to encode */ ); +#define PyUnicode_AsRawUnicodeEscapeString PyUnicode_EncodeRawUnicodeEscapeEx + /* --- Latin-1 Codecs ----------------------------------------------------- Note: Latin-1 corresponds to the first 256 Unicode ordinals. 
*/ +extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1Ex( + PyObject *string, /* Latin-1 encoded string object */ + PyObject *errors /* error handling */ + ); + extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1( const char *string, /* Latin-1 encoded string */ int length, /* size of string */ @@ -574,6 +646,12 @@ PyObject *unicode /* Unicode object */ ); +extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1Ex( + PyObject *unicode, /* Unicode object */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1( const Py_UNICODE *data, /* Unicode char buffer */ int length, /* Number of Py_UNICODE chars to encode */ @@ -586,6 +664,12 @@ */ +extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCIIEx( + PyObject *string, /* ASCII encoded string object */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII( const char *string, /* ASCII encoded string */ int length, /* size of string */ @@ -596,6 +680,12 @@ PyObject *unicode /* Unicode object */ ); +extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCIIEx( + PyObject *unicode, /* Unicode object */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII( const Py_UNICODE *data, /* Unicode char buffer */ int length, /* Number of Py_UNICODE chars to encode */ @@ -624,6 +714,14 @@ */ +extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmapEx( + PyObject *string, /* Encoded string object */ + PyObject *mapping, /* character mapping + (char ordinal -> unicode ordinal) */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap( const char *string, /* Encoded string */ int length, /* size of string */ @@ -638,6 +736,14 @@ (unicode ordinal -> char ordinal) */ ); +extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmapEx( + PyObject *unicode, /* Unicode object */ + PyObject *mapping, /* character mapping + (unicode 
ordinal -> char ordinal) */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap( const Py_UNICODE *data, /* Unicode char buffer */ int length, /* Number of Py_UNICODE chars to encode */ @@ -670,6 +776,12 @@ /* --- MBCS codecs for Windows -------------------------------------------- */ +extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCSEx( + PyObject *string, /* MBCS encoded string object */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS( const char *string, /* MBCS encoded string */ int length, /* size of string */ @@ -680,6 +792,12 @@ PyObject *unicode /* Unicode object */ ); +extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCSEx( + const PyObject *unicode, /* Unicode object */ + const PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS( const Py_UNICODE *data, /* Unicode char buffer */ int length, /* Number of Py_UNICODE chars to encode */ @@ -694,7 +812,8 @@ an output buffer using standard ASCII digit codes. The output buffer has to provide at least length+1 bytes of storage - area. The output string is 0-terminated. + area (more if longer replacement strings are generated). + The output string is 0-terminated. The encoder converts whitespace to ' ', decimal characters to their corresponding ASCII digit and all other Latin-1 characters except @@ -703,15 +822,23 @@ Error handling is defined by the errors argument: - NULL or "strict": raise a ValueError - "ignore": ignore the wrong characters (these are not copied to the - output buffer) - "replace": replaces illegal characters with '?' + NULL, None, "strict" or u"strict": raise a UnicodeError + "ignore" or u"ignore": ignore the wrong characters (these are + not copied to the output buffer) + "replace" or u"replace": replaces illegal characters with '?'
+ callable object: use what the object returns as replacement Returns 0 on success, -1 on failure. */ +extern DL_IMPORT(int) PyUnicode_EncodeDecimalEx( + PyObject *unicode, /* Unicode object */ + char *output, /* Output buffer; must have size >= length */ + PyObject *errors /* error handling */ + ); + +/* DEPRECATED */ extern DL_IMPORT(int) PyUnicode_EncodeDecimal( Py_UNICODE *s, /* Unicode buffer */ int length, /* Number of Py_UNICODE chars to encode */ @@ -772,7 +899,7 @@ */ extern DL_IMPORT(PyObject *) PyUnicode_Translate( - PyObject *str, /* String */ + PyObject *str, /* String */ PyObject *table, /* Translate table */ const char *errors /* error handling */ ); Index: Lib/codecs.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/codecs.py,v retrieving revision 1.19 diff -u -r1.19 codecs.py --- Lib/codecs.py 2001/05/29 06:06:54 1.19 +++ Lib/codecs.py 2001/07/27 03:17:24 @@ -51,13 +51,17 @@ The .encode()/.decode() methods may implement different error handling schemes by providing the errors argument. These - string values are defined: + values are defined: - 'strict' - raise a ValueError error (or a subclass) + None or 'strict' - raise a UnicodeError error (or a subclass) 'ignore' - ignore the character and continue with the next 'replace' - replace with a suitable replacement character; - Python will use the official U+FFFD REPLACEMENT - CHARACTER for the builtin Unicode codecs. + Python will use the official U+FFFD REPLACEMENT + CHARACTER for the builtin Unicode codecs. + callable object - call the object with the arguments + encoding name, character, position + and encode the unicode object returned + instead of the original character. """ def encode(self, input, errors='strict'): @@ -66,7 +70,7 @@ object, length consumed). errors defines the error handling to apply. It defaults to - 'strict' handling. + strict handling. The method may not store state in the Codec instance. 
Use StreamCodec for codecs which have to keep state in order to @@ -122,9 +126,15 @@ schemes by providing the errors keyword argument. These parameters are defined: - 'strict' - raise a ValueError (or a subclass) - 'ignore' - ignore the character and continue with the next - 'replace'- replace with a suitable replacement character + None or 'strict' - raise a UnicodeError error (or a subclass) + 'ignore' - ignore the character and continue with the next + 'replace' - replace with a suitable replacement character; + Python will use the official U+FFFD REPLACEMENT + CHARACTER for the builtin Unicode codecs. + callable object - call the object with the arguments + encoding name, character, position + and encode the unicode object returned + instead of the original character. """ self.stream = stream Index: Lib/encodings/base64_codec.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/encodings/base64_codec.py,v retrieving revision 1.1 diff -u -r1.1 base64_codec.py --- Lib/encodings/base64_codec.py 2001/05/15 12:00:02 1.1 +++ Lib/encodings/base64_codec.py 2001/07/27 03:17:24 @@ -10,21 +10,19 @@ ### Codec APIs -def base64_encode(input,errors='strict'): +def base64_encode(input,errors=None): """ Encodes the object input and returns a tuple (output object, length consumed). - errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported - error handling for this codec. + errors defines the error handling to apply. As there are + no unencodable characters, errors will be ignored. """ - assert errors == 'strict' output = base64.encodestring(input) return (output, len(input)) -def base64_decode(input,errors='strict'): +def base64_decode(input,errors=None): """ Decodes the object input and returns a tuple (output object, length consumed). @@ -33,12 +31,10 @@ buffer slot.
Python strings, buffer objects and memory mapped files are examples of objects providing this slot. - errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported - error handling for this codec. + errors defines the error handling to apply. It will + be ignored here. """ - assert errors == 'strict' output = base64.decodestring(input) return (output, len(input)) Index: Lib/encodings/hex_codec.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/encodings/hex_codec.py,v retrieving revision 1.1 diff -u -r1.1 hex_codec.py --- Lib/encodings/hex_codec.py 2001/05/15 12:00:02 1.1 +++ Lib/encodings/hex_codec.py 2001/07/27 03:17:24 @@ -10,21 +10,19 @@ ### Codec APIs -def hex_encode(input,errors='strict'): +def hex_encode(input,errors=None): """ Encodes the object input and returns a tuple (output object, length consumed). - errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported - error handling for this codec. + errors defines the error handling to apply. As there + are no unencodable characters, errors will be ignored. """ - assert errors == 'strict' output = binascii.b2a_hex(input) return (output, len(input)) -def hex_decode(input,errors='strict'): +def hex_decode(input,errors=None): """ Decodes the object input and returns a tuple (output object, length consumed). @@ -33,12 +31,10 @@ buffer slot. Python strings, buffer objects and memory mapped files are examples of objects providing this slot. - errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported - error handling for this codec. + errors defines the error handling to apply. It will be + ignored here. 
""" - assert errors == 'strict' output = binascii.a2b_hex(input) return (output, len(input)) Index: Lib/encodings/quopri_codec.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/encodings/quopri_codec.py,v retrieving revision 1.1 diff -u -r1.1 quopri_codec.py --- Lib/encodings/quopri_codec.py 2001/05/15 15:34:07 1.1 +++ Lib/encodings/quopri_codec.py 2001/07/27 03:17:24 @@ -9,15 +9,12 @@ except ImportError: from StringIO import StringIO -def quopri_encode(input, errors='strict'): +def quopri_encode(input, errors=None): """Encode the input, returning a tuple (output object, length consumed). - errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported - error handling for this codec. - + errors defines the error handling to apply. As there are + not unencodable characters for quopri, errors will be ignored. """ - assert errors == 'strict' f = StringIO(input) g = StringIO() quopri.encode(f, g, 1) Index: Lib/encodings/uu_codec.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/encodings/uu_codec.py,v retrieving revision 1.1 diff -u -r1.1 uu_codec.py --- Lib/encodings/uu_codec.py 2001/05/15 12:00:02 1.1 +++ Lib/encodings/uu_codec.py 2001/07/27 03:17:24 @@ -12,17 +12,17 @@ ### Codec APIs -def uu_encode(input,errors='strict',filename='',mode=0666): +def uu_encode(input,errors=None,filename='',mode=0666): """ Encodes the object input and returns a tuple (output object, length consumed). errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported + strict handling which is the only currently supported error handling for this codec. 
""" - assert errors == 'strict' + assert errors is None or errors == "strict" or errors == codecs.raise_unicodeencode_errors from cStringIO import StringIO from binascii import b2a_uu infile = StringIO(input) @@ -40,7 +40,7 @@ return (outfile.getvalue(), len(input)) -def uu_decode(input,errors='strict'): +def uu_decode(input,errors=None): """ Decodes the object input and returns a tuple (output object, length consumed). @@ -49,15 +49,13 @@ buffer slot. Python strings, buffer objects and memory mapped files are examples of objects providing this slot. - errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported - error handling for this codec. + errors defines the error handling to apply. It will be + ignored here. Note: filename and file mode information in the input data is ignored. """ - assert errors == 'strict' from cStringIO import StringIO from binascii import a2b_uu infile = StringIO(input) Index: Lib/encodings/zlib_codec.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/encodings/zlib_codec.py,v retrieving revision 1.1 diff -u -r1.1 zlib_codec.py --- Lib/encodings/zlib_codec.py 2001/05/15 12:00:02 1.1 +++ Lib/encodings/zlib_codec.py 2001/07/27 03:17:24 @@ -11,21 +11,19 @@ ### Codec APIs -def zlib_encode(input,errors='strict'): +def zlib_encode(input,errors=None): """ Encodes the object input and returns a tuple (output object, length consumed). - errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported - error handling for this codec. + errors defines the error handling to apply. As there + are no unencodable characters errors will be ignored. 
""" - assert errors == 'strict' output = zlib.compress(input) return (output, len(input)) -def zlib_decode(input,errors='strict'): +def zlib_decode(input,errors=None): """ Decodes the object input and returns a tuple (output object, length consumed). @@ -34,12 +32,10 @@ buffer slot. Python strings, buffer objects and memory mapped files are examples of objects providing this slot. - errors defines the error handling to apply. It defaults to - 'strict' handling which is the only currently supported - error handling for this codec. + errors defines the error handling to apply. It will be ignored + here """ - assert errors == 'strict' output = zlib.decompress(input) return (output, len(input)) Index: Modules/_codecsmodule.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Modules/_codecsmodule.c,v retrieving revision 2.8 diff -u -r2.8 _codecsmodule.c --- Modules/_codecsmodule.c 2001/06/26 15:11:00 2.8 +++ Modules/_codecsmodule.c 2001/07/27 03:17:25 @@ -103,11 +103,11 @@ PyObject *args) { PyObject *obj; - const char *errors = NULL; + PyObject *errors = NULL; const char *data; int size; - if (!PyArg_ParseTuple(args, "O|z:unicode_internal_decode", + if (!PyArg_ParseTuple(args, "O|O:unicode_internal_decode", &obj, &errors)) return NULL; @@ -126,64 +126,60 @@ utf_8_decode(PyObject *self, PyObject *args) { - const char *data; - int size; - const char *errors = NULL; + PyObject *data; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "t#|z:utf_8_decode", - &data, &size, &errors)) + if (!PyArg_ParseTuple(args, "O!|O:utf_8_decode", + &PyString_Type, &data, &errors)) return NULL; - return codec_tuple(PyUnicode_DecodeUTF8(data, size, errors), - size); + return codec_tuple(PyUnicode_DecodeUTF8Ex(data, errors), + PyString_GET_SIZE(data)); } static PyObject * utf_16_decode(PyObject *self, PyObject *args) { - const char *data; - int size; - const char *errors = NULL; + PyObject *data; + PyObject *errors = NULL; int 
byteorder = 0; - if (!PyArg_ParseTuple(args, "t#|z:utf_16_decode", - &data, &size, &errors)) + if (!PyArg_ParseTuple(args, "O!|O:utf_16_decode", + &PyString_Type, &data, &errors)) return NULL; - return codec_tuple(PyUnicode_DecodeUTF16(data, size, errors, &byteorder), - size); + return codec_tuple(PyUnicode_DecodeUTF16Ex(data, errors, &byteorder), + PyString_GET_SIZE(data)); } static PyObject * utf_16_le_decode(PyObject *self, PyObject *args) { - const char *data; - int size; - const char *errors = NULL; + PyObject *data; + PyObject *errors = NULL; int byteorder = -1; - if (!PyArg_ParseTuple(args, "t#|z:utf_16_le_decode", - &data, &size, &errors)) + if (!PyArg_ParseTuple(args, "O!|O:utf_16_le_decode", + &PyString_Type, &data, &errors)) return NULL; - return codec_tuple(PyUnicode_DecodeUTF16(data, size, errors, &byteorder), - size); + return codec_tuple(PyUnicode_DecodeUTF16Ex(data, errors, &byteorder), + PyString_GET_SIZE(data)); } static PyObject * utf_16_be_decode(PyObject *self, PyObject *args) { - const char *data; - int size; - const char *errors = NULL; + PyObject *data; + PyObject *errors = NULL; int byteorder = 1; - if (!PyArg_ParseTuple(args, "t#|z:utf_16_be_decode", - &data, &size, &errors)) + if (!PyArg_ParseTuple(args, "O!|O:utf_16_be_decode", + &PyString_Type, &data, &errors)) return NULL; - return codec_tuple(PyUnicode_DecodeUTF16(data, size, errors, &byteorder), - size); + return codec_tuple(PyUnicode_DecodeUTF16Ex(data, errors, &byteorder), + PyString_GET_SIZE(data)); } /* This non-standard version also provides access to the byteorder @@ -198,20 +194,19 @@ utf_16_ex_decode(PyObject *self, PyObject *args) { - const char *data; - int size; - const char *errors = NULL; + PyObject *data; + PyObject *errors = NULL; int byteorder = 0; PyObject *unicode, *tuple; - if (!PyArg_ParseTuple(args, "t#|zi:utf_16_ex_decode", - &data, &size, &errors, &byteorder)) + if (!PyArg_ParseTuple(args, "O!|Oi:utf_16_ex_decode", + &PyString_Type, &data, &errors, &byteorder)) 
return NULL; - unicode = PyUnicode_DecodeUTF16(data, size, errors, &byteorder); + unicode = PyUnicode_DecodeUTF16Ex(data, errors, &byteorder); if (unicode == NULL) return NULL; - tuple = Py_BuildValue("Oii", unicode, size, byteorder); + tuple = Py_BuildValue("Oii", unicode, PyString_GET_SIZE(data), byteorder); Py_DECREF(unicode); return tuple; } @@ -220,83 +215,78 @@ unicode_escape_decode(PyObject *self, PyObject *args) { - const char *data; - int size; - const char *errors = NULL; + PyObject *data; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "t#|z:unicode_escape_decode", - &data, &size, &errors)) + if (!PyArg_ParseTuple(args, "O!|O:unicode_escape_decode", + &PyString_Type, &data, &errors)) return NULL; - return codec_tuple(PyUnicode_DecodeUnicodeEscape(data, size, errors), - size); + return codec_tuple(PyUnicode_DecodeUnicodeEscapeEx(data, errors), + PyString_GET_SIZE(data)); } static PyObject * raw_unicode_escape_decode(PyObject *self, PyObject *args) { - const char *data; - int size; - const char *errors = NULL; + PyObject *data; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "t#|z:raw_unicode_escape_decode", - &data, &size, &errors)) + if (!PyArg_ParseTuple(args, "O!|O:raw_unicode_escape_decode", + &PyString_Type, &data, &errors)) return NULL; - return codec_tuple(PyUnicode_DecodeRawUnicodeEscape(data, size, errors), - size); + return codec_tuple(PyUnicode_DecodeRawUnicodeEscapeEx(data, errors), + PyString_GET_SIZE(data)); } static PyObject * latin_1_decode(PyObject *self, PyObject *args) { - const char *data; - int size; - const char *errors = NULL; + PyObject *data; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "t#|z:latin_1_decode", - &data, &size, &errors)) + if (!PyArg_ParseTuple(args, "O!|O:latin_1_decode", + &PyString_Type, &data, &errors)) return NULL; - return codec_tuple(PyUnicode_DecodeLatin1(data, size, errors), - size); + return codec_tuple(PyUnicode_DecodeLatin1Ex(data, errors), + PyString_GET_SIZE(data)); } 
static PyObject * ascii_decode(PyObject *self, PyObject *args) { - const char *data; - int size; - const char *errors = NULL; + PyObject *data; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "t#|z:ascii_decode", - &data, &size, &errors)) + if (!PyArg_ParseTuple(args, "O!|O:ascii_decode", + &PyString_Type, &data, &errors)) return NULL; - return codec_tuple(PyUnicode_DecodeASCII(data, size, errors), - size); + return codec_tuple(PyUnicode_DecodeASCIIEx(data, errors), + PyString_GET_SIZE(data)); } static PyObject * charmap_decode(PyObject *self, PyObject *args) { - const char *data; - int size; - const char *errors = NULL; + PyObject *data; + PyObject *errors = NULL; PyObject *mapping = NULL; - if (!PyArg_ParseTuple(args, "t#|zO:charmap_decode", - &data, &size, &errors, &mapping)) + if (!PyArg_ParseTuple(args, "O!|OO:charmap_decode", + &PyString_Type, &data, &errors, &mapping)) return NULL; if (mapping == Py_None) mapping = NULL; - return codec_tuple(PyUnicode_DecodeCharmap(data, size, mapping, errors), - size); + return codec_tuple(PyUnicode_DecodeCharmapEx(data, mapping, errors), + PyString_GET_SIZE(data)); } #if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T) @@ -305,16 +295,15 @@ mbcs_decode(PyObject *self, PyObject *args) { - const char *data; - int size; - const char *errors = NULL; + PyObject *data; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "t#|z:mbcs_decode", - &data, &size, &errors)) + if (!PyArg_ParseTuple(args, "O!|O:mbcs_decode", + &PyString_Type, &data, &size, &errors)) return NULL; - return codec_tuple(PyUnicode_DecodeMBCS(data, size, errors), - size); + return codec_tuple(PyUnicode_DecodeMBCSEx(data, errors), + PyString_GET_SIZE(data)); } #endif /* MS_WIN32 */ @@ -327,9 +316,9 @@ { const char *data; int size; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "s#|z:readbuffer_encode", + if (!PyArg_ParseTuple(args, "s#|O:readbuffer_encode", &data, &size, &errors)) return NULL; @@ -343,9 +332,9 
@@ { const char *data; int size; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "t#|z:charbuffer_encode", + if (!PyArg_ParseTuple(args, "t#|O:charbuffer_encode", &data, &size, &errors)) return NULL; @@ -358,11 +347,11 @@ PyObject *args) { PyObject *obj; - const char *errors = NULL; + PyObject *errors = NULL; const char *data; int size; - if (!PyArg_ParseTuple(args, "O|z:unicode_internal_encode", + if (!PyArg_ParseTuple(args, "O|O:unicode_internal_encode", &obj, &errors)) return NULL; @@ -382,22 +371,21 @@ static PyObject * utf_8_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|z:utf_8_encode", + if (!PyArg_ParseTuple(args, "O|O:utf_8_encode", &str, &errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeUTF8Ex(str, errors), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } @@ -411,172 +399,161 @@ static PyObject * utf_16_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; int byteorder = 0; - if (!PyArg_ParseTuple(args, "O|zi:utf_16_encode", + if (!PyArg_ParseTuple(args, "O|Oi:utf_16_encode", &str, &errors, &byteorder)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors, - byteorder), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeUTF16Ex(str, errors, byteorder), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } static PyObject * utf_16_le_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - 
if (!PyArg_ParseTuple(args, "O|z:utf_16_le_encode", + if (!PyArg_ParseTuple(args, "O|O:utf_16_le_encode", &str, &errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors, - -1), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeUTF16Ex(str, errors, -1), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } static PyObject * utf_16_be_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|z:utf_16_be_encode", + if (!PyArg_ParseTuple(args, "O|O:utf_16_be_encode", &str, &errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors, - +1), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeUTF16Ex(str, errors, +1), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } static PyObject * unicode_escape_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|z:unicode_escape_encode", + if (!PyArg_ParseTuple(args, "O|O:unicode_escape_encode", &str, &errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str)), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeUnicodeEscapeEx(str), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } static PyObject * raw_unicode_escape_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|z:raw_unicode_escape_encode", + if (!PyArg_ParseTuple(args, 
"O|O:raw_unicode_escape_encode", &str, &errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeRawUnicodeEscape( - PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str)), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeRawUnicodeEscapeEx(str), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } static PyObject * latin_1_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|z:latin_1_encode", + if (!PyArg_ParseTuple(args, "O|O:latin_1_encode", &str, &errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeLatin1( - PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeLatin1Ex(str, errors), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } static PyObject * ascii_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|z:ascii_encode", + if (!PyArg_ParseTuple(args, "O|O:ascii_encode", &str, &errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeASCII( - PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeASCIIEx(str, errors), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } static PyObject * charmap_encode(PyObject *self, - PyObject *args) + PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; PyObject *mapping = NULL; - if (!PyArg_ParseTuple(args, "O|zO:charmap_encode", + if (!PyArg_ParseTuple(args, "O|OO:charmap_encode", &str, &errors, &mapping)) return NULL; if (mapping == Py_None) @@ -585,12 +562,9 @@ str = 
PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeCharmap( - PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - mapping, - errors), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeCharmapEx(str, mapping, errors), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } @@ -602,27 +576,73 @@ PyObject *args) { PyObject *str, *v; - const char *errors = NULL; + PyObject *errors = NULL; - if (!PyArg_ParseTuple(args, "O|z:mbcs_encode", + if (!PyArg_ParseTuple(args, "O|O:mbcs_encode", &str, &errors)) return NULL; str = PyUnicode_FromObject(str); if (str == NULL) return NULL; - v = codec_tuple(PyUnicode_EncodeMBCS( - PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors), - PyUnicode_GET_SIZE(str)); + v = codec_tuple( + PyUnicode_EncodeMBCSEx(str, errors), + PyUnicode_GET_SIZE(str)); Py_DECREF(str); return v; } #endif /* MS_WIN32 */ + +/* --- Error handler registry --------------------------------------------- */ + +static PyObject *register_unicodeencodeerrorhandler(PyObject *self, PyObject *args) +{ + char *name; + PyObject *handler; -/* --- Module API --------------------------------------------------------- */ + if (!PyArg_ParseTuple(args, "sO:register_unicodeencodeerrorhandler", + &name, &handler)) + return NULL; + if (PyCodec_RegisterUnicodeEncodeErrorHandler(name, handler)) + return NULL; + Py_INCREF(Py_None); + return Py_None; +} + +static PyObject *lookup_unicodeencodeerrorhandler(PyObject *self, PyObject *args) +{ + PyObject *name; + + if (!PyArg_ParseTuple(args, "O:lookup_unicodeencodeerrorhandler", + &name)) + return NULL; + return PyCodec_LookupUnicodeEncodeErrorHandler(name); +} + +static PyObject *register_unicodedecodeerrorhandler(PyObject *self, PyObject *args) +{ + char *name; + PyObject *handler; + + if (!PyArg_ParseTuple(args, "sO:register_unicodedecodeerrorhandler", + &name, &handler)) + return NULL; + if (PyCodec_RegisterUnicodeDecodeErrorHandler(name, handler)) + return NULL; + 
Py_INCREF(Py_None); + return Py_None; +} + +static PyObject *lookup_unicodedecodeerrorhandler(PyObject *self, PyObject *args) +{ + PyObject *name; + + if (!PyArg_ParseTuple(args, "O:lookup_unicodedecodeerrorhandler", + &name)) + return NULL; + return PyCodec_LookupUnicodeDecodeErrorHandler(name); +} static PyMethodDef _codecs_functions[] = { {"register", codecregister, 1}, @@ -654,7 +674,19 @@ {"mbcs_encode", mbcs_encode, 1}, {"mbcs_decode", mbcs_decode, 1}, #endif - {NULL, NULL} /* sentinel */ + {"raise_unicodeencode_errors", PyCodec_RaiseUnicodeEncodeErrors, 1}, + {"ignore_unicodeencode_errors", PyCodec_IgnoreUnicodeEncodeErrors, 1}, + {"replace_unicodeencode_errors", PyCodec_ReplaceUnicodeEncodeErrors, 1}, + {"xmlcharrefreplace_unicodeencode_errors", PyCodec_XMLCharRefReplaceUnicodeEncodeErrors, 1}, + {"escapereplace_unicodeencode_errors", PyCodec_EscapeReplaceUnicodeEncodeErrors, 1}, + {"register_unicodeencodeerrorhandler", register_unicodeencodeerrorhandler, 1}, + {"lookup_unicodeencodeerrorhandler", lookup_unicodeencodeerrorhandler, 1}, + {"raise_unicodedecode_errors", PyCodec_RaiseUnicodeDecodeErrors, 1}, + {"ignore_unicodedecode_errors", PyCodec_IgnoreUnicodeDecodeErrors, 1}, + {"replace_unicodedecode_errors", PyCodec_ReplaceUnicodeDecodeErrors, 1}, + {"register_unicodedecodeerrorhandler", register_unicodedecodeerrorhandler, 1}, + {"lookup_unicodedecodeerrorhandler", lookup_unicodedecodeerrorhandler, 1}, + {NULL, NULL} /* sentinel */ }; DL_EXPORT(void) Index: Objects/stringobject.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Objects/stringobject.c,v retrieving revision 2.120 diff -u -r2.120 stringobject.c --- Objects/stringobject.c 2001/06/16 05:42:57 2.120 +++ Objects/stringobject.c 2001/07/27 03:17:25 @@ -147,24 +147,63 @@ return (PyObject *) op; } +PyObject *PyString_DecodeEx(PyObject *str, + const char *encoding, + PyObject *errors) +{ + PyObject *v = PyString_AsDecodedObjectEx(str, 
encoding, errors); + + /* Convert Unicode to a string using the default encoding */ + if (PyUnicode_Check(v)) { + PyObject *temp = v; + v = PyUnicode_AsEncodedString(v, NULL, NULL); + Py_DECREF(temp); + if (v == NULL) + goto onError; + } + if (!PyString_Check(v)) { + PyErr_Format(PyExc_TypeError, + "decoder did not return a string object (type=%.400s)", + v->ob_type->tp_name); + Py_DECREF(v); + goto onError; + } + + return v; + + onError: + return NULL; +} + PyObject *PyString_Decode(const char *s, int size, const char *encoding, const char *errors) { - PyObject *v, *str; + PyObject *str; + PyObject *errorstr = NULL; + PyObject *res; str = PyString_FromStringAndSize(s, size); - if (str == NULL) + if (!str) return NULL; - v = PyString_AsDecodedString(str, encoding, errors); + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(str); + return NULL; + } + } + + res = PyString_DecodeEx(str, encoding, errorstr); Py_DECREF(str); - return v; + Py_XDECREF(errorstr); + return res; } -PyObject *PyString_AsDecodedObject(PyObject *str, +PyObject *PyString_AsDecodedObjectEx(PyObject *str, const char *encoding, - const char *errors) + PyObject *errors) { PyObject *v; @@ -177,7 +216,7 @@ encoding = PyUnicode_GetDefaultEncoding(); /* Decode via the codec registry */ - v = PyCodec_Decode(str, encoding, errors); + v = PyCodec_DecodeEx(str, encoding, errors); if (v == NULL) goto onError; @@ -187,6 +226,26 @@ return NULL; } +PyObject *PyString_AsDecodedObject(PyObject *str, + const char *encoding, + const char *errors) +{ + PyObject *errorstr = NULL; + PyObject *res; + + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(str); + return NULL; + } + } + + res = PyString_AsDecodedObjectEx(str, encoding, errorstr); + Py_XDECREF(errorstr); + return res; +} + PyObject *PyString_AsDecodedString(PyObject *str, const char *encoding, const char *errors) @@ -219,24 +278,64 @@ return NULL; } +PyObject 
*PyString_EncodeEx(PyObject *str, + const char *encoding, + PyObject *errors) +{ + PyObject *v = PyString_AsEncodedObjectEx(str, encoding, errors); + + if (v == NULL) + goto onError; + + /* Convert Unicode to a string using the default encoding */ + if (PyUnicode_Check(v)) { + PyObject *temp = v; + v = PyUnicode_AsEncodedString(v, NULL, NULL); + Py_DECREF(temp); + if (v == NULL) + goto onError; + } + if (!PyString_Check(v)) { + PyErr_Format(PyExc_TypeError, + "encoder did not return a string object (type=%.400s)", + v->ob_type->tp_name); + Py_DECREF(v); + goto onError; + } + + onError: + return NULL; +} + PyObject *PyString_Encode(const char *s, int size, const char *encoding, const char *errors) { - PyObject *v, *str; + PyObject *str; + PyObject *errorstr = NULL; + PyObject *res; str = PyString_FromStringAndSize(s, size); - if (str == NULL) + if (!str) return NULL; - v = PyString_AsEncodedString(str, encoding, errors); + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(str); + return NULL; + } + } + + res = PyString_EncodeEx(str, encoding, errorstr); Py_DECREF(str); - return v; + Py_XDECREF(errorstr); + return res; } -PyObject *PyString_AsEncodedObject(PyObject *str, +PyObject *PyString_AsEncodedObjectEx(PyObject *str, const char *encoding, - const char *errors) + PyObject *errors) { PyObject *v; @@ -249,7 +348,7 @@ encoding = PyUnicode_GetDefaultEncoding(); /* Encode via the codec registry */ - v = PyCodec_Encode(str, encoding, errors); + v = PyCodec_EncodeEx(str, encoding, errors); if (v == NULL) goto onError; @@ -259,6 +358,26 @@ return NULL; } +PyObject *PyString_AsEncodedObject(PyObject *str, + const char *encoding, + const char *errors) +{ + PyObject *errorstr = NULL; + PyObject *res; + + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(str); + return NULL; + } + } + + res = PyString_AsEncodedObjectEx(str, encoding, errorstr); + Py_XDECREF(errorstr); + return res; +} + PyObject 
*PyString_AsEncodedString(PyObject *str, const char *encoding, const char *errors) @@ -1930,16 +2049,17 @@ Decodes S using the codec registered for encoding. encoding defaults\n\ to the default encoding. errors may be given to set a different error\n\ handling scheme. Default is 'strict' meaning that encoding errors raise\n\ -a ValueError. Other possible values are 'ignore' and 'replace'."; +a ValueError. Other possible values are 'ignore' and 'replace' or other\n\ +registered error callback names."; static PyObject * string_decode(PyStringObject *self, PyObject *args) { char *encoding = NULL; - char *errors = NULL; - if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors)) + PyObject *errors = NULL; + if (!PyArg_ParseTuple(args, "|sO:decode", &encoding, &errors)) return NULL; - return PyString_AsDecodedObject((PyObject *)self, encoding, errors); + return PyString_AsDecodedObjectEx((PyObject *)self, encoding, errors); } Index: Objects/unicodeobject.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Objects/unicodeobject.c,v retrieving revision 2.104 diff -u -r2.104 unicodeobject.c --- Objects/unicodeobject.c 2001/07/25 16:05:59 2.104 +++ Objects/unicodeobject.c 2001/07/27 03:17:27 @@ -391,12 +391,12 @@ PyObject *PyUnicode_FromObject(register PyObject *obj) { - return PyUnicode_FromEncodedObject(obj, NULL, "strict"); + return PyUnicode_FromEncodedObjectEx(obj, NULL, NULL); } -PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, +PyObject *PyUnicode_FromEncodedObjectEx(register PyObject *obj, const char *encoding, - const char *errors) + PyObject *errors) { const char *s; int len; @@ -433,10 +433,8 @@ } goto done; } - else if (PyString_Check(obj)) { - s = PyString_AS_STRING(obj); - len = PyString_GET_SIZE(obj); - } + else if (PyString_Check(obj)) + len = PyString_GET_SIZE(obj); else if (PyObject_AsCharBuffer(obj, &s, &len)) { /* Overwrite the error message with something more useful in case 
of a TypeError. */ @@ -447,6 +445,13 @@ obj->ob_type->tp_name); goto onError; } + else { + /* ...DecodeEx needs a string */ + obj = PyString_FromStringAndSize(s, len); + if (!obj) + goto onError; + owned = 1; + } /* Convert to Unicode */ if (len == 0) { @@ -454,7 +459,7 @@ v = (PyObject *)unicode_empty; } else - v = PyUnicode_Decode(s, len, encoding, errors); + v = PyUnicode_DecodeEx(obj, encoding, errors); done: if (owned) { @@ -469,29 +474,43 @@ return NULL; } -PyObject *PyUnicode_Decode(const char *s, - int size, +PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, + const char *encoding, + const char *errors) +{ + PyObject *errorstr = NULL; + PyObject *res; + + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) + return NULL; + } + + res = PyUnicode_FromEncodedObjectEx(obj, encoding, errorstr); + Py_XDECREF(errorstr); + return res; +} + +PyObject *PyUnicode_DecodeEx(PyObject *str, const char *encoding, - const char *errors) + PyObject *errors) { - PyObject *buffer = NULL, *unicode; + PyObject *unicode; if (encoding == NULL) encoding = PyUnicode_GetDefaultEncoding(); /* Shortcuts for common default encodings */ if (strcmp(encoding, "utf-8") == 0) - return PyUnicode_DecodeUTF8(s, size, errors); - else if (strcmp(encoding, "latin-1") == 0) - return PyUnicode_DecodeLatin1(s, size, errors); + return PyUnicode_DecodeUTF8Ex(str, errors); + else if ((strcmp(encoding, "latin-1") == 0) || (strcmp(encoding, "iso-8859-1") == 0)) + return PyUnicode_DecodeLatin1Ex(str, errors); else if (strcmp(encoding, "ascii") == 0) - return PyUnicode_DecodeASCII(s, size, errors); + return PyUnicode_DecodeASCIIEx(str, errors); /* Decode via the codec registry */ - buffer = PyBuffer_FromMemory((void *)s, size); - if (buffer == NULL) - goto onError; - unicode = PyCodec_Decode(buffer, encoding, errors); + unicode = PyCodec_DecodeEx(str, encoding, errors); if (unicode == NULL) goto onError; if (!PyUnicode_Check(unicode)) { @@ -501,55 +520,62 @@ 
Py_DECREF(unicode); goto onError; } - Py_DECREF(buffer); return unicode; onError: - Py_XDECREF(buffer); return NULL; } -PyObject *PyUnicode_Encode(const Py_UNICODE *s, +PyObject *PyUnicode_Decode(const char *s, int size, const char *encoding, const char *errors) { - PyObject *v, *unicode; - - unicode = PyUnicode_FromUnicode(s, size); - if (unicode == NULL) + PyObject *str; + PyObject *errorstr = NULL; + PyObject *res; + + str = PyString_FromStringAndSize(s, size); + if (!str) return NULL; - v = PyUnicode_AsEncodedString(unicode, encoding, errors); - Py_DECREF(unicode); - return v; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(str); + return NULL; + } + } + + res = PyUnicode_DecodeEx(str, encoding, errorstr); + Py_DECREF(str); + Py_XDECREF(errorstr); + return res; } -PyObject *PyUnicode_AsEncodedString(PyObject *unicode, - const char *encoding, - const char *errors) +PyObject *PyUnicode_EncodeEx(PyObject *unicode, + const char *encoding, + PyObject *errors) { PyObject *v; - + if (!PyUnicode_Check(unicode)) { PyErr_BadArgument(); goto onError; } - if (encoding == NULL) + if (encoding == NULL) encoding = PyUnicode_GetDefaultEncoding(); /* Shortcuts for common default encodings */ - if (errors == NULL) { - if (strcmp(encoding, "utf-8") == 0) - return PyUnicode_AsUTF8String(unicode); - else if (strcmp(encoding, "latin-1") == 0) - return PyUnicode_AsLatin1String(unicode); - else if (strcmp(encoding, "ascii") == 0) - return PyUnicode_AsASCIIString(unicode); - } + if (strcmp(encoding, "utf-8") == 0) + return PyUnicode_EncodeUTF8Ex(unicode, errors); + else if ((strcmp(encoding, "latin-1") == 0) || (strcmp(encoding, "iso-8859-1") == 0)) + return PyUnicode_EncodeLatin1Ex(unicode, errors); + else if (strcmp(encoding, "ascii") == 0) + return PyUnicode_EncodeASCIIEx(unicode, errors); /* Encode via the codec registry */ - v = PyCodec_Encode(unicode, encoding, errors); + v = PyCodec_EncodeEx(unicode, encoding, errors); if (v == NULL) goto 
onError; /* XXX Should we really enforce this ? */ @@ -566,6 +592,50 @@ return NULL; } +PyObject *PyUnicode_Encode(const Py_UNICODE *s, + int size, + const char *encoding, + const char *errors) +{ + PyObject *unicode; + PyObject *errorstr = NULL; + PyObject *res; + + unicode = PyUnicode_FromUnicode(s, size); + if (!unicode) + return NULL; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(unicode); + return NULL; + } + } + + res = PyUnicode_EncodeEx(unicode, encoding, errorstr); + Py_DECREF(unicode); + Py_XDECREF(errorstr); + return res; +} + +PyObject *PyUnicode_AsEncodedString(PyObject *unicode, + const char *encoding, + const char *errors) +{ + PyObject *errorstr = NULL; + PyObject *res; + + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) + return NULL; + } + + res = PyUnicode_AsEncodedStringEx(unicode, encoding, errorstr); + Py_XDECREF(errorstr); + return res; +} + /* Return a Python string holding the default encoded value of the Unicode object. @@ -665,49 +735,103 @@ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 }; -static -int utf8_decoding_error(const char **source, - Py_UNICODE **dest, - const char *errors, - const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "UTF-8 decoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - (*source)++; - return 0; - } - else if (strcmp(errors,"replace") == 0) { - (*source)++; - **dest = Py_UNICODE_REPLACEMENT_CHARACTER; - (*dest)++; - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "UTF-8 decoding error; unknown error handling code: %.400s", - errors); - return -1; +/* error handling callback helper: + build arguments, call the callback and check the arguments, + if no exception occured, copy the replacement to the output + and adjust various state variables. 
+ return 0 on success, -1 on error +*/ + +static int unicode_decode_call_errorhandler(PyObject *errors, PyObject **errorHandler, + const char *encoding, const char *reason, + PyObject *input, int *inpos, const char **inptr, + PyObject **output, int *outpos, Py_UNICODE **outptr) +{ + static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple"; + + PyObject *args; + PyObject *restuple = NULL; + PyObject *repunicode = NULL; + int insize = PyString_GET_SIZE(input); + int outsize = PyUnicode_GET_SIZE(*output); + int requiredsize; + int newpos; + Py_UNICODE *repptr; + int repsize; + int res = -1; + + if (*errorHandler == NULL) { + *errorHandler = PyCodec_LookupUnicodeDecodeErrorHandler(errors); + if (*errorHandler == NULL) + goto onError; + } + + /* we don't need a state */ + args = Py_BuildValue("sOisO", encoding, input, *inpos, reason, Py_None); + if (args == NULL) + goto onError; + restuple = PyEval_CallObject(*errorHandler, args); + Py_DECREF(args); + if (restuple == NULL) + goto onError; + if (!PyTuple_Check(restuple)) { + PyErr_Format(PyExc_TypeError, &argparse[4]); + goto onError; + } + if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) + goto onError; + if (newpos<0) + newpos = 0; + else if (newpos>insize) + newpos = insize; + + /* need more space? 
(at least enough for what we + have+the replacement+the rest of the string (starting + a the new input position, so we won't have to check space + when there are no errors) */ + repptr = PyUnicode_AS_UNICODE(repunicode); + repsize = PyUnicode_GET_SIZE(repunicode); + requiredsize = *outpos + repsize + insize-newpos; + if (requiredsize > outsize) { + if (requiredsize<2*outsize) + requiredsize = 2*outsize; + if (PyUnicode_Resize(output, requiredsize)) + goto onError; + *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; } + *inpos = newpos; + *inptr = PyString_AS_STRING(input) + newpos; + Py_UNICODE_COPY(*outptr, repptr, repsize); + *outptr += repsize; + *outpos += repsize; + /* we made it! */ + res = 0; + + onError: + Py_XDECREF(restuple); + return res; } -PyObject *PyUnicode_DecodeUTF8(const char *s, - int size, - const char *errors) +PyObject *PyUnicode_DecodeUTF8Ex(PyObject *str, + PyObject *errors) { + const char *s; + int inpos; + int outpos; + int size; int n; const char *e; PyUnicodeObject *unicode; Py_UNICODE *p; const char *errmsg = ""; + PyObject *errorHandler = NULL; + if (!PyString_Check(str)) + return NULL; + s = PyString_AS_STRING(str); + size = PyString_GET_SIZE(str); /* Note: size will always be longer than the resulting Unicode - character count */ + character count (if there are no replacements) */ unicode = _PyUnicode_New(size); if (!unicode) return NULL; @@ -721,6 +845,7 @@ while (s < e) { Py_UCS4 ch = (unsigned char)*s; + if (ch < 0x80) { *p++ = (Py_UNICODE)ch; s++; @@ -816,21 +941,54 @@ continue; utf8Error: - if (utf8_decoding_error(&s, &p, errors, errmsg)) - goto onError; + inpos = s-PyString_AS_STRING(str); + outpos = p-PyUnicode_AS_UNICODE(unicode); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "utf8", errmsg, + str, &inpos, &s, + (PyObject **)&unicode, &outpos, &p)) + goto onError; } /* Adjust length */ if (_PyUnicode_Resize(&unicode, p - unicode->str)) goto onError; + Py_XDECREF(errorHandler); return (PyObject *)unicode; 
onError: Py_DECREF(unicode); + Py_XDECREF(errorHandler); return NULL; } +PyObject *PyUnicode_DecodeUTF8(const char *s, + int size, + const char *errors) +{ + PyObject *str; + PyObject *errorstr = NULL; + PyObject *res; + + str = PyString_FromStringAndSize(s, size); + if (!str) + return NULL; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(str); + return NULL; + } + } + + res = PyUnicode_DecodeUTF8Ex(str, errorstr); + Py_DECREF(str); + Py_XDECREF(errorstr); + return res; +} + /* Not used anymore, now that the encoder supports UTF-16 surrogates. */ #if 0 @@ -865,18 +1023,26 @@ } #endif -PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, - int size, - const char *errors) +PyObject *PyUnicode_EncodeUTF8Ex(PyObject *unicode, + PyObject *errors) { + Py_UNICODE *s; + int size; PyObject *v; char *p; char *q; Py_UCS4 ch2; - unsigned int cbAllocated = 3 * size; + unsigned int cbAllocated; unsigned int cbWritten = 0; int i = 0; + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + s = PyUnicode_AS_UNICODE(unicode); + size = PyUnicode_GET_SIZE(unicode); + cbAllocated = 3 * size; v = PyString_FromStringAndSize(NULL, cbAllocated); if (v == NULL) return NULL; @@ -944,72 +1110,61 @@ return NULL; } -PyObject *PyUnicode_AsUTF8String(PyObject *unicode) +PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, + int size, + const char *errors) { - if (!PyUnicode_Check(unicode)) { - PyErr_BadArgument(); - return NULL; + PyObject *unicode; + PyObject *errorstr = NULL; + PyObject *res; + + unicode = PyUnicode_FromUnicode(s, size); + if (!unicode) + return NULL; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(unicode); + return NULL; + } } - return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - NULL); -} -/* --- UTF-16 Codec ------------------------------------------------------- */ + res = PyUnicode_EncodeUTF8Ex(unicode, errorstr); + 
Py_DECREF(unicode); + Py_XDECREF(errorstr); + return res; +} -static -int utf16_decoding_error(const Py_UCS2 **source, - Py_UNICODE **dest, - const char *errors, - const char *details) +PyObject *PyUnicode_AsUTF8String(PyObject *unicode) { - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "UTF-16 decoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - if (dest) { - **dest = Py_UNICODE_REPLACEMENT_CHARACTER; - (*dest)++; - } - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "UTF-16 decoding error; " - "unknown error handling code: %.400s", - errors); - return -1; - } + return PyUnicode_EncodeUTF8Ex(unicode, NULL); } -PyObject *PyUnicode_DecodeUTF16(const char *s, - int size, - const char *errors, +/* --- UTF-16 Codec ------------------------------------------------------- */ + +PyObject *PyUnicode_DecodeUTF16Ex(PyObject *str, + PyObject *errors, int *byteorder) { + const char *s; + int inpos; + int outpos; + int size; PyUnicodeObject *unicode; + PyObject *errorHandler = NULL; Py_UNICODE *p; - const Py_UCS2 *q, *e; + const char *e; int bo = 0; const char *errmsg = ""; - /* size should be an even number */ - if (size % sizeof(Py_UCS2) != 0) { - if (utf16_decoding_error(NULL, NULL, errors, "truncated data")) - return NULL; - /* The remaining input chars are ignored if we fall through - here... 
*/ - } + if (!PyString_Check(str)) + return NULL; + s = PyString_AS_STRING(str); + size = PyString_GET_SIZE(str); /* Note: size will always be longer than the resulting Unicode - character count */ + character count (as long as we have long replacements on + errors) */ unicode = _PyUnicode_New(size); if (!unicode) return NULL; @@ -1018,8 +1173,7 @@ /* Unpack UTF-16 encoded data */ p = unicode->str; - q = (Py_UCS2 *)s; - e = q + (size / sizeof(Py_UCS2)); + e = s + size; if (byteorder) bo = *byteorder; @@ -1030,27 +1184,38 @@ stream as-is (giving a ZWNBSP character). */ if (bo == 0) { #ifdef BYTEORDER_IS_LITTLE_ENDIAN - if (*q == 0xFEFF) { - q++; + if (*((Py_UCS2 *)s) == 0xFEFF) { + s += sizeof(Py_UCS2); bo = -1; - } else if (*q == 0xFFFE) { - q++; + } else if (*((Py_UCS2 *)s) == 0xFFFE) { + s += sizeof(Py_UCS2); bo = 1; } #else - if (*q == 0xFEFF) { - q++; + if (*((Py_UCS2 *)s) == 0xFEFF) { + s += sizeof(Py_UCS2); bo = 1; - } else if (*q == 0xFFFE) { - q++; + } else if (*((Py_UCS2 *)s) == 0xFFFE) { + s += sizeof(Py_UCS2); bo = -1; } #endif } - while (q < e) { - register Py_UCS2 ch = *q++; + while (s < e) { + register Py_UCS2 ch; + + /* remaing bytes at the end? 
(size should be an even number) */ + if (e-s= e) { + if (s >= e) { errmsg = "unexpected end of data"; goto utf16Error; } if (0xD800 <= ch && ch <= 0xDBFF) { - Py_UCS2 ch2 = *q++; + Py_UCS2 ch2 = *((Py_UCS2 *)s); + s += sizeof(Py_UCS2); #ifdef BYTEORDER_IS_LITTLE_ENDIAN if (bo == 1) ch2 = (ch2 >> 8) | (ch2 << 8); @@ -1098,7 +1264,13 @@ /* Fall through to report the error */ utf16Error: - if (utf16_decoding_error(&q, &p, errors, errmsg)) + inpos = s-PyString_AS_STRING(str); + outpos = p-PyUnicode_AS_UNICODE(unicode); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "utf16", errmsg, + str, &inpos, &s, + (PyObject **)&unicode, &outpos, &p)) goto onError; } @@ -1109,32 +1281,69 @@ if (_PyUnicode_Resize(&unicode, p - unicode->str)) goto onError; + Py_XDECREF(errorHandler); return (PyObject *)unicode; onError: Py_DECREF(unicode); + Py_XDECREF(errorHandler); return NULL; } - -#undef UTF16_ERROR -PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s, +PyObject *PyUnicode_DecodeUTF16(const char *s, int size, const char *errors, - int byteorder) + int *byteorder) { - PyObject *v; + PyObject *str; + PyObject *errorstr = NULL; + PyObject *res; + + str = PyString_FromStringAndSize(s, size); + if (!str) + return NULL; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(str); + return NULL; + } + } + + res = PyUnicode_DecodeUTF16Ex(str, errorstr, byteorder); + Py_DECREF(str); + Py_XDECREF(errorstr); + return res; +} + +#undef UTF16_ERROR + +PyObject *PyUnicode_EncodeUTF16Ex(PyObject *unicode, + PyObject *errors, + int byteorder) +{ + Py_UNICODE *s; + int size; + PyObject *v = NULL; Py_UCS2 *p; char *q; int i, pairs, doswap = 1; + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + + s = PyUnicode_AS_UNICODE(unicode); + size = PyUnicode_GET_SIZE(unicode); + for (i = pairs = 0; i < size; i++) if (s[i] >= 0x10000) pairs++; v = PyString_FromStringAndSize(NULL, sizeof(Py_UCS2) * (size + pairs + (byteorder == 
0))); if (v == NULL) - return NULL; + goto finish; q = PyString_AS_STRING(v); p = (Py_UCS2 *)q; @@ -1167,67 +1376,70 @@ *p++ = ch2; } } + finish: + Py_DECREF(errors); return v; } -PyObject *PyUnicode_AsUTF16String(PyObject *unicode) +PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s, + int size, + const char *errors, + int byteorder) { - if (!PyUnicode_Check(unicode)) { - PyErr_BadArgument(); - return NULL; + PyObject *unicode; + PyObject *errorstr = NULL; + PyObject *res; + + unicode = PyUnicode_FromUnicode(s, size); + if (!unicode) + return NULL; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(unicode); + return NULL; + } } - return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - NULL, - 0); -} -/* --- Unicode Escape Codec ----------------------------------------------- */ + res = PyUnicode_EncodeUTF16Ex(unicode, errorstr, byteorder); + Py_DECREF(unicode); + Py_XDECREF(errorstr); + return res; +} -static -int unicodeescape_decoding_error(const char **source, - Py_UNICODE *x, - const char *errors, - const char *details) +PyObject *PyUnicode_AsUTF16String(PyObject *unicode) { - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "Unicode-Escape decoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - *x = Py_UNICODE_REPLACEMENT_CHARACTER; - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "Unicode-Escape decoding error; " - "unknown error handling code: %.400s", - errors); - return -1; - } + return PyUnicode_EncodeUTF16Ex(unicode, NULL, 0); } +/* --- Unicode Escape Codec ----------------------------------------------- */ + static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; -PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, - int size, - const char *errors) +PyObject *PyUnicode_DecodeUnicodeEscapeEx(PyObject *str, + PyObject 
*errors) { + const char *s; + int inpos; + int outpos; + int size; PyUnicodeObject *v; + PyObject *errorHandler = NULL; Py_UNICODE *p, *buf; const char *end; - char* message; + const char *message; Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ + if (!PyString_Check(str)) + return NULL; + s = PyString_AS_STRING(str); + size = PyString_GET_SIZE(str); + /* Escaped strings will always be longer than the resulting Unicode string, so we start with size here and then reduce the - length after conversion to the true value. */ + length after conversion to the true value. + (but if the error callback returns a long replacement string + we'll have to allocate more space) */ v = _PyUnicode_New(size); if (v == NULL) goto onError; @@ -1240,7 +1452,7 @@ while (s < end) { unsigned char c; Py_UNICODE x; - int i, digits; + int digits; /* Non-escape characters are interpreted as Unicode ordinals */ if (*s != '\\') { @@ -1296,14 +1508,19 @@ message = "truncated \\UXXXXXXXX escape"; hexescape: chr = 0; - for (i = 0; i < digits; i++) { - c = (unsigned char) s[i]; + outpos = p-buf; + for (; digits>0; ++s, --digits) { + c = (unsigned char) *s; if (!isxdigit(c)) { - if (unicodeescape_decoding_error(&s, &x, errors, message)) + s--; + inpos = s-PyString_AS_STRING(str); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "unicodeescape", message, + str, &inpos, &s, + (PyObject **)&v, &outpos, &p)) goto onError; - chr = x; - i++; - break; + goto nextByte; } chr = (chr<<4) & ~0xF; if (c >= '0' && c <= '9') @@ -1313,7 +1530,6 @@ else chr += 10 + c - 'A'; } - s += i; store: /* when we get here, chr is a 32-bit unicode character */ if (chr <= 0xffff) @@ -1330,12 +1546,16 @@ *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); #endif } else { - if (unicodeescape_decoding_error( - &s, &x, errors, - "illegal Unicode character") - ) + /* One character back */ + s--; + inpos = s-PyString_AS_STRING(str); + outpos = p-buf; + if (unicode_decode_call_errorhandler( + errors, 
&errorHandler, + "unicodeescape", "illegal Unicode character", + str, &inpos, &s, + (PyObject **)&v, &outpos, &p)) goto onError; - *p++ = x; /* store replacement character */ } break; @@ -1370,9 +1590,15 @@ goto store; } } - if (unicodeescape_decoding_error(&s, &x, errors, message)) + s--; + inpos = s-PyString_AS_STRING(str); + outpos = p-buf; + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "unicodeescape", message, + str, &inpos, &s, + (PyObject **)&v, &outpos, &p)) goto onError; - *p++ = x; break; default: @@ -1380,9 +1606,12 @@ *p++ = (unsigned char)s[-1]; break; } + nextByte: + ; } if (_PyUnicode_Resize(&v, (int)(p - buf))) goto onError; + Py_XDECREF(errorHandler); return (PyObject *)v; ucnhashError: @@ -1390,13 +1619,40 @@ PyExc_UnicodeError, "\\N escapes not supported (can't load unicodedata module)" ); + Py_XDECREF(errorHandler); return NULL; onError: Py_XDECREF(v); + Py_XDECREF(errorHandler); return NULL; } +PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, + int size, + const char *errors) +{ + PyObject *str; + PyObject *errorstr = NULL; + PyObject *res; + + str = PyString_FromStringAndSize(s, size); + if (!str) + return NULL; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(str); + return NULL; + } + } + + res = PyUnicode_DecodeUnicodeEscapeEx(str, errorstr); + Py_DECREF(str); + Py_XDECREF(errorstr); + return res; +} + /* Return a Unicode-Escape string version of the Unicode object. 
If quotes is true, the string is enclosed in u"" or u'' quotes as @@ -1540,6 +1796,18 @@ return NULL; } +PyObject *PyUnicode_EncodeUnicodeEscapeEx(PyObject *unicode) +{ + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + + return unicodeescape_string( + PyUnicode_AS_UNICODE(unicode), + PyUnicode_GET_SIZE(unicode), 0); +} + PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, int size) { @@ -1548,25 +1816,29 @@ PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) { - if (!PyUnicode_Check(unicode)) { - PyErr_BadArgument(); - return NULL; - } - return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode)); + return PyUnicode_EncodeUnicodeEscapeEx(unicode); } /* --- Raw Unicode Escape Codec ------------------------------------------- */ -PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, - int size, - const char *errors) +PyObject *PyUnicode_DecodeRawUnicodeEscapeEx(PyObject *str, + PyObject *errors) { + const char *s; + int inpos; + int outpos; + int size; PyUnicodeObject *v; + PyObject *errorHandler = NULL; Py_UNICODE *p, *buf; const char *end; const char *bs; + if (!PyString_Check(str)) + return NULL; + s = PyString_AS_STRING(str); + size = PyString_GET_SIZE(str); + /* Escaped strings will always be longer than the resulting Unicode string, so we start with size here and then reduce the length after conversion to the true value. 
*/ @@ -1605,14 +1877,18 @@ s++; /* \uXXXX with 4 hex digits */ - for (x = 0, i = 0; i < 4; i++) { - c = (unsigned char)s[i]; + outpos = p-buf; + for (x = 0, i = 0; i < 4; ++i, ++s) { + c = (unsigned char)*s; if (!isxdigit(c)) { - if (unicodeescape_decoding_error(&s, &x, errors, - "truncated \\uXXXX")) + inpos = s-PyString_AS_STRING(str); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "rawunicodeescape", "truncated \\uXXXX", + str, &inpos, &s, + (PyObject **)&v, &outpos, &p)) goto onError; - i++; - break; + goto nextByte; } x = (x<<4) & ~0xF; if (c >= '0' && c <= '9') @@ -1622,27 +1898,62 @@ else x += 10 + c - 'A'; } - s += i; *p++ = x; + nextByte: + ; } if (_PyUnicode_Resize(&v, (int)(p - buf))) goto onError; + Py_XDECREF(errorHandler); return (PyObject *)v; onError: Py_XDECREF(v); + Py_XDECREF(errorHandler); return NULL; } -PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, - int size) +PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, + int size, + const char *errors) { - PyObject *repr; - char *p; + PyObject *str; + PyObject *errorstr = NULL; + PyObject *res; + + str = PyString_FromStringAndSize(s, size); + if (!str) + return NULL; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(str); + return NULL; + } + } + + res = PyUnicode_DecodeRawUnicodeEscapeEx(str, errorstr); + Py_DECREF(str); + Py_XDECREF(errorstr); + return res; +} + +PyObject *PyUnicode_EncodeRawUnicodeEscapeEx(PyObject *unicode) +{ + PyObject *repr; + char *p; char *q; + Py_UNICODE *s; + int size; static const char *hexdigit = "0123456789abcdef"; + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + s = PyUnicode_AS_UNICODE(unicode); + size = PyUnicode_GET_SIZE(unicode); repr = PyString_FromStringAndSize(NULL, 6 * size); if (repr == NULL) return NULL; @@ -1676,25 +1987,37 @@ return NULL; } -PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) +PyObject 
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, + int size) { - if (!PyUnicode_Check(unicode)) { - PyErr_BadArgument(); + PyObject *unicode; + PyObject *res; + + unicode = PyUnicode_FromUnicode(s, size); + if (!unicode) return NULL; - } - return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode)); + + res = PyUnicode_EncodeRawUnicodeEscapeEx(unicode); + Py_DECREF(unicode); + return res; } /* --- Latin-1 Codec ------------------------------------------------------ */ -PyObject *PyUnicode_DecodeLatin1(const char *s, - int size, - const char *errors) +PyObject *PyUnicode_DecodeLatin1Ex(PyObject *str, + PyObject *errors) { + const char *s; + int size; PyUnicodeObject *v; Py_UNICODE *p; - + + if (!PyString_Check(str)) + return NULL; + + s = PyString_AS_STRING(str); + size = PyString_GET_SIZE(str); + /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ if (size == 1 && *(unsigned char*)s < 256) { Py_UNICODE r = *(unsigned char*)s; @@ -1716,122 +2039,257 @@ return NULL; } -static -int latin1_encoding_error(const Py_UNICODE **source, - char **dest, - const char *errors, - const char *details) +PyObject *PyUnicode_DecodeLatin1(const char *s, + int size, + const char *errors) { - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "Latin-1 encoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; + PyObject *str; + PyObject *errorstr = NULL; + PyObject *res; + + str = PyString_FromStringAndSize(s, size); + if (!str) + return NULL; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(str); + return NULL; + } } - else if (strcmp(errors,"replace") == 0) { - **dest = '?'; - (*dest)++; - return 0; + + res = PyUnicode_DecodeLatin1Ex(str, errorstr); + Py_DECREF(str); + Py_XDECREF(errorstr); + return res; +} + +/* error handling callback helper: + build arguments, call the callback and check 
the arguments, + put the result into newpos and return the replacement string, which + has to be freed by the caller +*/ + +static PyObject *unicode_encode_call_errorhandler(PyObject *errors, PyObject **errorHandler, + const char *encoding, const char *reason, PyObject *unicode, int unicodepos, + int *newpos) +{ + static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple"; + + PyObject *args; + PyObject *restuple; + PyObject *resunicode; + int size = PyUnicode_GET_SIZE(unicode); + + if (*errorHandler == NULL) { + *errorHandler = PyCodec_LookupUnicodeEncodeErrorHandler(errors); + if (*errorHandler == NULL) + return NULL; + } + + /* we don't need a state */ + args = Py_BuildValue("sOisO", encoding, unicode, unicodepos, reason, Py_None); + if (args == NULL) + return NULL; + restuple = PyEval_CallObject(*errorHandler, args); + Py_DECREF(args); + if (restuple == NULL) + return NULL; + if (!PyTuple_Check(restuple)) { + PyErr_Format(PyExc_TypeError, &argparse[4]); + Py_DECREF(restuple); + return NULL; } - else { - PyErr_Format(PyExc_ValueError, - "Latin-1 encoding error; " - "unknown error handling code: %.400s", - errors); - return -1; + if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &resunicode, newpos)) { + Py_DECREF(restuple); + return NULL; } + if (*newpos<0) + *newpos = -1; + else if (*newpos>size) + *newpos = size-1; + else + --*newpos; + Py_INCREF(resunicode); + Py_DECREF(restuple); + return resunicode; } -PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, - int size, - const char *errors) -{ - PyObject *repr; - char *s, *start; +/* Encode a Unicode object as ASCII (limit==128) or + latin-1 (limit==256) +*/ - repr = PyString_FromStringAndSize(NULL, size); - if (repr == NULL) - return NULL; - if (size == 0) - return repr; +static PyObject *unicode_encode_ucs1ex(PyObject *unicode, + PyObject *errors, int limit) +{ + /* output object */ + PyObject *res; + /* size of input */ + int unicodesize; + /* remaining size of input */ 
+ int remainingunicodesize; + Py_UNICODE *uni; + Py_UNICODE *origuni; + char *str; + /* current output position */ + int respos = 0; + int ressize; + char *encoding = (limit == 256) ? "latin-1" : "ascii"; + char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; + PyObject *errorHandler = NULL; - s = PyString_AS_STRING(repr); - start = s; - while (size-- > 0) { - Py_UNICODE ch = *p++; - if (ch >= 256) { - if (latin1_encoding_error(&p, &s, errors, - "ordinal not in range(256)")) + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + remainingunicodesize = unicodesize = PyUnicode_GET_SIZE(unicode); + origuni = uni = PyUnicode_AS_UNICODE(unicode); + /* allocate enough for a simple encoding without + replacements, if we need more, we'll resize */ + res = PyString_FromStringAndSize(NULL, unicodesize); + if (res == NULL) + goto onError; + if (remainingunicodesize == 0) + return res; + str = PyString_AS_STRING(res); + ressize = unicodesize; + + for (;remainingunicodesize; --remainingunicodesize, ++uni) { + Py_UNICODE c = *uni; + + /* can we encode this? 
*/ + if (c ressize) { + if (requiredsize<2*ressize) + requiredsize = 2*ressize; + if (_PyString_Resize(&res, requiredsize)) { + Py_DECREF(repunicode); + goto onError; + } + str = PyString_AS_STRING(res) + respos; + ressize = requiredsize; + } + /* check if there is anything unencodable in the replacement + and copy it to the output */ + while (remaining-->0) { + c = *uni2++; + if (c >= limit) { + PyCodec_RaiseUnicodeEncodeError(encoding, c, unicodepos, reason); + Py_DECREF(repunicode); + goto onError; + } + *str++ = (char)c; + } + unicodepos = newpos; + Py_DECREF(repunicode); } - else - *s++ = (char)ch; } - /* Resize if error handling skipped some characters */ - if (s - start < PyString_GET_SIZE(repr)) - if (_PyString_Resize(&repr, s - start)) + /* Resize if we allocated to much */ + respos = str-PyString_AS_STRING(res); + if (respos 0) { - register unsigned char c; - - c = (unsigned char)*s++; - if (c < 128) + while (s < e) { + register unsigned char c = (unsigned char)*s; + if (c < 128) { *p++ = c; - else if (ascii_decoding_error(&s, &p, errors, - "ordinal not in range(128)")) + s++; + } + else { + int inpos = s-PyString_AS_STRING(str); + int outpos = p-PyUnicode_AS_UNICODE(v); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "ascii", "ordinal not in range(128)", + str, &inpos, &s, + (PyObject **)&v, &outpos, &p)) goto onError; + } } if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) goto onError; + Py_XDECREF(errorHandler); return (PyObject *)v; onError: Py_XDECREF(v); + Py_XDECREF(errorHandler); return NULL; } -static -int ascii_encoding_error(const Py_UNICODE **source, - char **dest, - const char *errors, - const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "ASCII encoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if 
(strcmp(errors,"replace") == 0) { - **dest = '?'; - (*dest)++; - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "ASCII encoding error; " - "unknown error handling code: %.400s", - errors); - return -1; - } -} - -PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, +PyObject *PyUnicode_DecodeASCII(const char *s, int size, const char *errors) { - PyObject *repr; - char *s, *start; - - repr = PyString_FromStringAndSize(NULL, size); - if (repr == NULL) - return NULL; - if (size == 0) - return repr; + PyObject *str; + PyObject *errorstr = NULL; + PyObject *res; - s = PyString_AS_STRING(repr); - start = s; - while (size-- > 0) { - Py_UNICODE ch = *p++; - if (ch >= 128) { - if (ascii_encoding_error(&p, &s, errors, - "ordinal not in range(128)")) - goto onError; + str = PyString_FromStringAndSize(s, size); + if (!str) + return NULL; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(str); + return NULL; } - else - *s++ = (char)ch; } - /* Resize if error handling skipped some characters */ - if (s - start < PyString_GET_SIZE(repr)) - if (_PyString_Resize(&repr, s - start)) - goto onError; - return repr; - onError: - Py_DECREF(repr); - return NULL; + res = PyUnicode_DecodeASCIIEx(str, errorstr); + Py_DECREF(str); + Py_XDECREF(errorstr); + return res; } +PyObject *PyUnicode_EncodeASCIIEx(PyObject *unicode, + PyObject *errors) +{ + return unicode_encode_ucs1ex(unicode, errors, 128); +} + +PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, + int size, + const char *errors) +{ + return unicode_encode_ucs1(p, size, errors, 128); +} + PyObject *PyUnicode_AsASCIIString(PyObject *unicode) { - if (!PyUnicode_Check(unicode)) { - PyErr_BadArgument(); - return NULL; - } - return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - NULL); + return PyUnicode_EncodeASCIIEx(unicode, NULL); } #if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T) /* --- MBCS codecs for Windows 
-------------------------------------------- */ -PyObject *PyUnicode_DecodeMBCS(const char *s, - int size, - const char *errors) +PyObject *PyUnicode_DecodeMBCSEx(PyObject *str, + PyObject *errors) { + const char *s; + int size; PyUnicodeObject *v; Py_UNICODE *p; + if (!PyString_Check(str)) + return NULL; + + s = PyString_AS_STRING(str); + size = PyString_GET_SIZE(str); + /* First get the size of the result */ DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); if (size > 0 && usize==0) @@ -1971,20 +2411,51 @@ return (PyObject *)v; } -PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, +PyObject *PyUnicode_DecodeMBCS(const char *s, int size, const char *errors) { + PyObject *str; + PyObject *errorstr = NULL; + PyObject *res; + + str = PyString_FromStringAndSize(s, size); + if (!str) + return NULL; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(str); + return NULL; + } + } + + res = PyUnicode_DecodeMBCSEx(str, errorstr); + Py_DECREF(str); + Py_XDECREF(errorstr); + return res; +} + +PyObject *PyUnicode_EncodeMBCSEx(PyObject *unicode, + PyObject *errors) +{ PyObject *repr; - char *s; DWORD mbcssize; + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + /* If there are no characters, bail now! 
*/ - if (size==0) - return PyString_FromString(""); + if (PyUNICODE_GET_SIZE(unicode) == 0) + return PyString_FromString(""); /* First get the size of the result */ - mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); + mbcssize = WideCharToMultiByte(CP_ACP, 0, + PyUnicode_AS_UNICODE(unicode), + PyUnicode_GET_SIZE(unicode), + NULL, 0, NULL, NULL); if (mbcssize==0) return PyErr_SetFromWindowsErrWithFilename(0, NULL); @@ -1995,69 +2466,77 @@ return repr; /* Do the conversion */ - s = PyString_AS_STRING(repr); - if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { + if (0 == WideCharToMultiByte(CP_ACP, 0, + PyUnicode_AS_UNICODE(unicode), + PyUnicode_GET_SIZE(unicode), + PyString_AS_STRING(repr), + mbcssize, NULL, NULL)) { Py_DECREF(repr); return PyErr_SetFromWindowsErrWithFilename(0, NULL); } return repr; } - -#endif /* MS_WIN32 */ -/* --- Character Mapping Codec -------------------------------------------- */ - -static -int charmap_decoding_error(const char **source, - Py_UNICODE **dest, - const char *errors, - const char *details) +PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, + int size, + const char *errors) { - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "charmap decoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - **dest = Py_UNICODE_REPLACEMENT_CHARACTER; - (*dest)++; - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "charmap decoding error; " - "unknown error handling code: %.400s", - errors); - return -1; + PyObject *unicode; + PyObject *errorstr = NULL; + PyObject *res; + + unicode = PyUnicode_FromUnicode(p, size); + if (!unicode) + return NULL; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(unicode); + return NULL; + } } + + res = PyUnicode_EncodeMBCSEx(unicode, errorstr); + Py_DECREF(unicode); + 
Py_XDECREF(errorstr); + return res; } -PyObject *PyUnicode_DecodeCharmap(const char *s, - int size, +#endif /* MS_WIN32 */ + +/* --- Character Mapping Codec -------------------------------------------- */ + +PyObject *PyUnicode_DecodeCharmapEx(PyObject *str, PyObject *mapping, - const char *errors) + PyObject *errors) { + const char *s; + const char *e; + int inpos; + int outpos; + int size; PyUnicodeObject *v; + PyObject *errorHandler = NULL; Py_UNICODE *p; int extrachars = 0; /* Default to Latin-1 */ if (mapping == NULL) - return PyUnicode_DecodeLatin1(s, size, errors); + return PyUnicode_DecodeLatin1Ex(str, errors); + + s = PyString_AS_STRING(str); + size = PyString_GET_SIZE(str); + e = s + size; v = _PyUnicode_New(size); + if (v == NULL) goto onError; if (size == 0) return (PyObject *)v; p = PyUnicode_AS_UNICODE(v); - while (size-- > 0) { - unsigned char ch = *s++; + while (s < e) { + unsigned char ch = *s; PyObject *w, *x; /* Get mapping (char ordinal -> integer, Unicode char or None) */ @@ -2086,11 +2565,17 @@ goto onError; } *p++ = (Py_UNICODE)value; + s++; } else if (x == Py_None) { /* undefined mapping */ - if (charmap_decoding_error(&s, &p, errors, - "character maps to ")) { + inpos = s-PyString_AS_STRING(str); + outpos = p-PyUnicode_AS_UNICODE(v); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "charmap", "character maps to ", + str, &inpos, &s, + (PyObject **)&v, &outpos, &p)) { Py_DECREF(x); goto onError; } @@ -2098,10 +2583,11 @@ else if (PyUnicode_Check(x)) { int targetsize = PyUnicode_GET_SIZE(x); - if (targetsize == 1) + if (targetsize == 1) { /* 1-1 mapping */ *p++ = *PyUnicode_AS_UNICODE(x); - + s++; + } else if (targetsize > 1) { /* 1-n mapping */ if (targetsize > extrachars) { @@ -2124,6 +2610,7 @@ extrachars -= targetsize; } /* 1-0 mapping: skip the character */ + s++; } else { /* wrong return value */ @@ -2137,68 +2624,116 @@ if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) if (_PyUnicode_Resize(&v, (int)(p - 
PyUnicode_AS_UNICODE(v)))) goto onError; + Py_XDECREF(errorHandler); return (PyObject *)v; onError: Py_XDECREF(v); + Py_XDECREF(errorHandler); return NULL; } - -static -int charmap_encoding_error(const Py_UNICODE **source, - char **dest, - const char *errors, - const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "charmap encoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - **dest = '?'; - (*dest)++; - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "charmap encoding error; " - "unknown error handling code: %.400s", - errors); - return -1; - } -} -PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, +PyObject *PyUnicode_DecodeCharmap(const char *s, int size, PyObject *mapping, const char *errors) { - PyObject *v; - char *s; + PyObject *str; + PyObject *errorstr = NULL; + PyObject *res; + + str = PyString_FromStringAndSize(s, size); + if (!str) + return NULL; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(str); + return NULL; + } + } + + res = PyUnicode_DecodeCharmapEx(str, mapping, errorstr); + Py_DECREF(str); + Py_XDECREF(errorstr); + return res; +} + +/* For this and the other encode functions the loop through + the string is done in the following way: A stack with two + strings is kept and the loop always encodes a character from + the string at the stacktop. If an error is encountered and + the stack has only one entry (during encoding of the original + string) the callback is called and the unicode object returned + is pushed onto the stack, so the encoding continues with the + replacement string. If the stack has two entries when an + error is encountered, the replacement string itself has + an unencodable character and an exception will be raised. 
+ When the encoder has reached the end of it's current string + there are two possibilities: when the stack contains two + entries, this was the replacement string, so the replacement + string will be popped from the stack and encoding continues + with the next character from the original string. If the + stack had only one entry, encoding is finished. */ +PyObject *PyUnicode_EncodeCharmapEx(PyObject *unicode, + PyObject *mapping, + PyObject *errors) +{ + /* current input position */ + int unicodepos; + /* output object */ + PyObject *res; + /* current output position */ + int respos = 0; + /* the next two variables are used as a "micro stack": + during processing of a replacement string unicode2 + and unicode2pos contain the values for the original + unicode object to be encoded */ + PyObject *unicode2 = NULL; + int unicode2pos = 0; + PyObject *errorHandler = NULL; + int extrachars = 0; /* Default to Latin-1 */ if (mapping == NULL) - return PyUnicode_EncodeLatin1(p, size, errors); + return PyUnicode_EncodeLatin1Ex(unicode, errors); - v = PyString_FromStringAndSize(NULL, size); - if (v == NULL) + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + res = PyString_FromStringAndSize(NULL, PyUnicode_GET_SIZE(unicode)); + if (res == NULL) return NULL; - if (size == 0) - return v; - s = PyString_AS_STRING(v); - while (size-- > 0) { - Py_UNICODE ch = *p++; + if (PyUnicode_GET_SIZE(unicode) == 0) + return res; + for (unicodepos = 0;;++unicodepos) { + Py_UNICODE c; PyObject *w, *x; + /* finished with current string? */ + if (unicodepos == PyUnicode_GET_SIZE(unicode)) { + /* currently processing replacement? */ + if (unicode2) { + /* forget replacement string */ + Py_DECREF(unicode); + /* switch back to original */ + unicode = unicode2; + unicodepos = unicode2pos; + unicode2 = NULL; + /* maybe original is finished too? 
*/ + continue; + } + else + /* currently processing original => finished */ + break; + } + + c = PyUnicode_AS_UNICODE(unicode)[unicodepos]; + /* Get mapping (Unicode ordinal -> string char, integer or None) */ - w = PyInt_FromLong((long)ch); + w = PyInt_FromLong((long)c); if (w == NULL) goto onError; x = PyObject_GetItem(mapping, w); @@ -2222,39 +2757,53 @@ Py_DECREF(x); goto onError; } - *s++ = (char)value; + PyString_AS_STRING(res)[respos++] = (char)value; } + /* undefined mapping */ else if (x == Py_None) { - /* undefined mapping */ - if (charmap_encoding_error(&p, &s, errors, - "character maps to ")) { + /* error while replacing */ + if (unicode2) { + /* report original position; FIXME should we give a better name? */ + PyCodec_RaiseUnicodeEncodeError("charmap", c, unicode2pos, "ordinal not in mapping"); Py_DECREF(x); goto onError; } + else { + /* "push" original to secondary variables */ + unicode2 = unicode; + /* switch to replacement */ + unicode = unicode_encode_call_errorhandler(errors, &errorHandler, + "charmap", "ordinal not in mapping", unicode, unicodepos, &unicode2pos); + if (unicode == NULL) { + Py_DECREF(x); + goto onError; + } + unicodepos = -1; + /* retry with the replacement string */ + continue; + } } else if (PyString_Check(x)) { int targetsize = PyString_GET_SIZE(x); if (targetsize == 1) /* 1-1 mapping */ - *s++ = *PyString_AS_STRING(x); + PyString_AS_STRING(res)[respos++] = *PyString_AS_STRING(x); else if (targetsize > 1) { /* 1-n mapping */ if (targetsize > extrachars) { /* resize first */ - int oldpos = (int)(s - PyString_AS_STRING(v)); int needed = (targetsize - extrachars) + \ - (targetsize << 2); + (targetsize << 2); extrachars += needed; - if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) { + if (_PyString_Resize(&res, PyString_GET_SIZE(res) + needed)) { Py_DECREF(x); goto onError; } - s = PyString_AS_STRING(v) + oldpos; } - memcpy(s, PyString_AS_STRING(x), targetsize); - s += targetsize; + 
memcpy(&PyString_AS_STRING(res)[respos], PyString_AS_STRING(x), targetsize); + respos += targetsize; extrachars -= targetsize; } /* 1-0 mapping: skip the character */ @@ -2268,29 +2817,57 @@ } Py_DECREF(x); } - if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v)) - if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v)))) + /* Resize if we allocated to much */ + if (respos < PyString_GET_SIZE(res)) + if (_PyString_Resize(&res, respos)) goto onError; - return v; + return res; onError: - Py_DECREF(v); + Py_DECREF(res); + /* free replacement */ + if (unicode2) { + Py_XDECREF(unicode); + } + Py_XDECREF(errorHandler); return NULL; } -PyObject *PyUnicode_AsCharmapString(PyObject *unicode, - PyObject *mapping) +PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, + int size, + PyObject *mapping, + const char *errors) { - if (!PyUnicode_Check(unicode) || mapping == NULL) { - PyErr_BadArgument(); + PyObject *unicode; + PyObject *errorstr; + PyObject *res; + + unicode = PyUnicode_FromUnicode(p, size); + if (!unicode) return NULL; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(unicode); + return NULL; + } + } + else { + Py_INCREF(Py_None); + errorstr = Py_None; } - return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - mapping, - NULL); + + res = PyUnicode_EncodeCharmapEx(unicode, mapping, errorstr); + Py_DECREF(unicode); + Py_DECREF(errorstr); + return res; } +PyObject *PyUnicode_AsCharmapString(PyObject *unicode, PyObject *mapping) +{ + return PyUnicode_EncodeCharmapEx(unicode, mapping, NULL); +} + static int translate_error(const Py_UNICODE **source, Py_UNICODE **dest, @@ -2426,48 +3003,74 @@ /* --- Decimal Encoder ---------------------------------------------------- */ -int PyUnicode_EncodeDecimal(Py_UNICODE *s, - int length, - char *output, - const char *errors) -{ - Py_UNICODE *p, *end; +int PyUnicode_EncodeDecimalEx(PyObject *unicode, + char *output, + PyObject *errors) +{ + /* 
current input position */ + int unicodepos; + /* the next two variables are used as a "micro stack": + during processing of a replacement string unicode2 + and unicode2pos contain the values for the original + unicode object to be encoded */ + PyObject *unicode2 = NULL; + int unicode2pos = 0; + PyObject *errorHandler = NULL; if (output == NULL) { PyErr_BadArgument(); return -1; } - p = s; - end = s + length; - while (p < end) { - register Py_UNICODE ch = *p++; + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return -1; + } + for (unicodepos = 0;;++unicodepos) { + Py_UNICODE c; int decimal; - - if (Py_UNICODE_ISSPACE(ch)) { + /* finished with the string? */ + if (unicodepos == PyUnicode_GET_SIZE(unicode)) { + /* processing replacement? */ + if (unicode2) { + /* forget replacement */ + Py_DECREF(unicode); + /* switch back to original */ + unicode = unicode2; + unicodepos = unicode2pos; + unicode2 = NULL; + unicode2pos = 0; + /* maybe original is finished too? */ + continue; + } + else + /* processing original => finished */ + break; + } + c = PyUnicode_AS_UNICODE(unicode)[unicodepos]; + + if (Py_UNICODE_ISSPACE(c)) { *output++ = ' '; continue; } - decimal = Py_UNICODE_TODECIMAL(ch); + decimal = Py_UNICODE_TODECIMAL(c); if (decimal >= 0) { *output++ = '0' + decimal; continue; } - if (0 < ch && ch < 256) { - *output++ = (char)ch; - continue; - } + if (0 < c && c < 256) + *output++ = (char)c; /* All other characters are considered invalid */ - if (errors == NULL || strcmp(errors, "strict") == 0) { - PyErr_SetString(PyExc_ValueError, - "invalid decimal Unicode string"); - goto onError; - } - else if (strcmp(errors, "ignore") == 0) - continue; - else if (strcmp(errors, "replace") == 0) { - *output++ = '?'; - continue; + else { + /* "push" original to secondary variables */ + unicode2 = unicode; + /* switch to replacement */ + unicode = unicode_encode_call_errorhandler(errors, &errorHandler, + "charmap", "ordinal not in range(256) or decimal digit", unicode, 
unicodepos, &unicode2pos); + if (unicode == NULL) + goto onError; + unicodepos = -1; /* the loop's ++unicodepos turns this into index 0 of the replacement */ + /* retry with the replacement string */ } } /* 0-terminate the output string */ @@ -2475,9 +3078,46 @@ + Py_XDECREF(errorHandler); /* release the handler on the success path too; it was only released under onError */ return 0; onError: + Py_XDECREF(errorHandler); + /* free replacement: unicode is an owned replacement string only while unicode2 holds the original */ + if (unicode2) { + Py_XDECREF(unicode); + } + return -1; } +int PyUnicode_EncodeDecimal(Py_UNICODE *s, + int length, + char *output, + const char *errors) +{ + PyObject *unicode; + PyObject *errorstr; + int res; + + unicode = PyUnicode_FromUnicode(s, length); /* wrap the raw buffer so the object-based Ex variant can do the work */ + if (!unicode) + return -1; + if (errors) { + errorstr = PyString_FromString(errors); + if (!errorstr) { + Py_DECREF(unicode); + return -1; + } + } + else { + Py_INCREF(Py_None); + errorstr = Py_None; /* no error handling name given: pass None instead */ + } + + res = PyUnicode_EncodeDecimalEx(unicode, output, errorstr); + Py_DECREF(unicode); /* both temporaries are only needed for the duration of the call */ + Py_DECREF(errorstr); + return res; +} + /* --- Helpers ------------------------------------------------------------ */ static @@ -3588,17 +4227,23 @@ \n\ Return an encoded string version of S. Default encoding is the current\n\ default string encoding. errors may be given to set a different error\n\ -handling scheme. Default is 'strict' meaning that encoding errors raise\n\ -a ValueError. Other possible values are 'ignore' and 'replace'."; +handling scheme. Default is None meaning that unencodable characters\n\ +raise UnicodeError. 'strict' does the same. 
Other possible values are\n\ +'ignore' and 'replace' or a callable that will be called with the encoding,\n\ +the original unicode string, the position of the unencodable character\n\ +and an object describing the current state of the encoder and must\n\ +return a tuple with a unicode string that will be encoded instead of the\n\ +unencodable character and the position in the original string where encoding\n\ +should continue."; static PyObject * unicode_encode(PyUnicodeObject *self, PyObject *args) { char *encoding = NULL; - char *errors = NULL; - if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) + PyObject *errors = NULL; + if (!PyArg_ParseTuple(args, "|sO:encode", &encoding, &errors)) /* "O": errors may be any object (a handler name or a callable) */ return NULL; - return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); + return PyUnicode_AsEncodedStringEx((PyObject *)self, encoding, errors); } static char expandtabs__doc__[] = Index: Python/bltinmodule.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Python/bltinmodule.c,v retrieving revision 2.218 diff -u -r2.218 bltinmodule.c --- Python/bltinmodule.c 2001/07/26 16:29:25 2.218 +++ Python/bltinmodule.c 2001/07/27 03:17:27 @@ -136,11 +136,11 @@ { PyObject *v; char *encoding = NULL; - char *errors = NULL; + PyObject *errors = NULL; - if ( !PyArg_ParseTuple(args, "O|ss:unicode", &v, &encoding, &errors) ) + if ( !PyArg_ParseTuple(args, "O|sO:unicode", &v, &encoding, &errors) ) /* errors is taken as an object here as well and handed to the Ex variant */ return NULL; - return PyUnicode_FromEncodedObject(v, encoding, errors); + return PyUnicode_FromEncodedObjectEx(v, encoding, errors); } static char unicode_doc[] = @@ -462,8 +462,7 @@ "complex() literal too large to convert"); return NULL; } - if (PyUnicode_EncodeDecimal(PyUnicode_AS_UNICODE(v), - PyUnicode_GET_SIZE(v), + if (PyUnicode_EncodeDecimalEx(v, s_buffer, NULL)) return NULL; Index: Python/codecs.c =================================================================== RCS file: 
/cvsroot/python/python/dist/src/Python/codecs.c,v retrieving revision 2.13 diff -u -r2.13 codecs.c --- Python/codecs.c 2000/09/26 05:46:01 2.13 +++ Python/codecs.c 2001/07/27 03:17:27 @@ -236,11 +236,45 @@ return args; } +static +PyObject *args_tupleex(PyObject *object, + PyObject *errors) +{ + PyObject *args; + + args = PyTuple_New(2); /* builds the (object, errors) argument tuple for a codec call */ + if (args == NULL) + return NULL; + if (errors==NULL) + errors = Py_None; /* a missing errors argument is passed on as None */ + Py_INCREF(object); + Py_INCREF(errors); /* PyTuple_SET_ITEM below takes over both new references */ + PyTuple_SET_ITEM(args, 0, object); + PyTuple_SET_ITEM(args, 1, errors); + return args; +} + /* Build a codec by calling factory(stream[,errors]) or just factory(errors) depending on whether the given parameters are non-NULL. */ static +PyObject *build_stream_codecex(PyObject *factory, + PyObject *stream, + PyObject *errors) +{ + PyObject *args, *codec; + + args = args_tupleex(stream, errors); /* NOTE(review): args_tupleex INCREFs its first argument unconditionally, so a NULL stream is not supported here -- confirm callers never pass NULL */ + if (args == NULL) + return NULL; + + codec = PyEval_CallObject(factory, args); + Py_DECREF(args); /* the argument tuple is no longer needed once the call has returned */ + return codec; +} + +static PyObject *build_stream_codec(PyObject *factory, PyObject *stream, const char *errors) @@ -294,6 +328,21 @@ return NULL; } +PyObject *PyCodec_StreamReaderEx(const char *encoding, + PyObject *stream, + PyObject *errors) +{ + PyObject *codecs; + + codecs = _PyCodec_Lookup(encoding); /* NOTE(review): codecs is not released anywhere in this function -- confirm whether _PyCodec_Lookup returns a new reference */ + if (codecs == NULL) + goto onError; + return build_stream_codecex(PyTuple_GET_ITEM(codecs,2),stream,errors); /* item 2 of the lookup result is used as the reader factory */ + + onError: + return NULL; +} + PyObject *PyCodec_StreamReader(const char *encoding, PyObject *stream, const char *errors) @@ -309,29 +358,51 @@ return NULL; } -PyObject *PyCodec_StreamWriter(const char *encoding, - PyObject *stream, - const char *errors) +PyObject *PyCodec_StreamWriterEx(const char *encoding, + PyObject *stream, + PyObject *errors) { PyObject *codecs; codecs = _PyCodec_Lookup(encoding); if (codecs == NULL) goto onError; - return build_stream_codec(PyTuple_GET_ITEM(codecs,3),stream,errors); + return build_stream_codecex(PyTuple_GET_ITEM(codecs,3),stream,errors); /* item 3 of the lookup result is used as the writer factory */ onError: return NULL; } +PyObject 
*PyCodec_StreamWriter(const char *encoding, + PyObject *stream, + const char *errors) +{ + PyObject *errorstr; + PyObject *res; + + if (errors) { + errorstr = PyString_FromString(errors); /* convert the C string into the object form the Ex variant expects */ + if (!errorstr) + return NULL; + } + else { + Py_INCREF(Py_None); + errorstr = Py_None; /* no error handling name given: pass None instead */ + } + + res = PyCodec_StreamWriterEx(encoding,stream,errorstr); + Py_DECREF(errorstr); + return res; +} + /* Encode an object (e.g. an Unicode object) using the given encoding and return the resulting encoded object (usually a Python string). errors is passed to the encoder factory as argument if non-NULL. */ -PyObject *PyCodec_Encode(PyObject *object, - const char *encoding, - const char *errors) +PyObject *PyCodec_EncodeEx(PyObject *object, + const char *encoding, + PyObject *errors) { PyObject *encoder = NULL; PyObject *args = NULL, *result; @@ -341,11 +412,11 @@ if (encoder == NULL) goto onError; - args = args_tuple(object, errors); + args = args_tupleex(object, errors); if (args == NULL) goto onError; - result = PyEval_CallObject(encoder,args); + result = PyEval_CallObject(encoder, args); if (result == NULL) goto onError; @@ -370,14 +441,38 @@ return NULL; } +PyObject *PyCodec_Encode(PyObject *object, + const char *encoding, + const char *errors) +{ + PyObject *errorstr; + PyObject *res; + + if (errors) { + errorstr = PyString_FromString(errors); /* convert the C string into the object form the Ex variant expects */ + if (!errorstr) { + /* object belongs to the caller and is not released on success either: nothing to DECREF here */ + return NULL; + } + } + else { + Py_INCREF(Py_None); + errorstr = Py_None; /* no error handling name given: pass None instead */ + } + + res = PyCodec_EncodeEx(object, encoding, errorstr); + Py_DECREF(errorstr); + return res; +} + /* Decode an object (usually a Python string) using the given encoding and return an equivalent object (e.g. an Unicode object). errors is passed to the decoder factory as argument if non-NULL. 
*/ -PyObject *PyCodec_Decode(PyObject *object, +PyObject *PyCodec_DecodeEx(PyObject *object, const char *encoding, - const char *errors) + PyObject *errors) { PyObject *decoder = NULL; PyObject *args = NULL, *result = NULL; @@ -387,7 +482,7 @@ if (decoder == NULL) goto onError; - args = args_tuple(object, errors); + args = args_tupleex(object, errors); if (args == NULL) goto onError; @@ -416,14 +511,409 @@ return NULL; } +PyObject *PyCodec_Decode(PyObject *object, + const char *encoding, + const char *errors) +{ + PyObject *errorstr; + PyObject *res; + + if (errors) { + errorstr = PyString_FromString(errors); /* convert the C string into the object form the Ex variant expects */ + if (!errorstr) { + /* object belongs to the caller and is not released on success either: nothing to DECREF here */ + return NULL; + } + } + else { + Py_INCREF(Py_None); + errorstr = Py_None; /* no error handling name given: pass None instead */ + } + + res = PyCodec_DecodeEx(object, encoding, errorstr); + Py_DECREF(errorstr); + return res; +} + +static PyObject *_PyCodec_UnicodeEncodeErrorHandlerRegistry; + +/* Register the error handling callback function error under the name name. + The callback will be called by the encoder when it encounters + an unencodable character and name was specified as the error parameter + in the call to the encode function. + Return 0 on success, -1 on error */ +int PyCodec_RegisterUnicodeEncodeErrorHandler(char *name, PyObject *error) +{ + if (!PyCallable_Check(error)) { + PyErr_SetString(PyExc_TypeError, + "handler must be callable"); + return -1; + } + return PyDict_SetItemString(_PyCodec_UnicodeEncodeErrorHandlerRegistry, name, error); +} + +/* Lookup the error handling callback function registered under the name error + if error is a string or unicode object. As special cases NULL or Py_None can be + passed, in which case the error handling callback for strict encoding will be returned. + If error is callable, a new reference to it will be returned directly. 
*/ +PyObject *PyCodec_LookupUnicodeEncodeErrorHandler(PyObject *error) +{ + PyObject *handler = NULL; + + if (error==NULL || error==Py_None) + handler = PyDict_GetItemString(_PyCodec_UnicodeEncodeErrorHandlerRegistry, "strict"); + else if (PyCallable_Check(error)) + handler = error; + else { + handler = PyDict_GetItem(_PyCodec_UnicodeEncodeErrorHandlerRegistry, error); + if (!handler) + PyErr_SetString(PyExc_LookupError, "unknown error handler name"); + } + Py_XINCREF(handler); /* the dict lookups give borrowed references; the caller receives a new one */ + return handler; +} + +void PyCodec_RaiseUnicodeEncodeError(const char *encoding, Py_UNICODE c, int pos, const char *reason) +{ + PyErr_Format(PyExc_UnicodeError, + "encoding '%.400s' can't encode character '\\u%x' in position %d: %.400s", + encoding, (unsigned int)c, pos, reason); /* %x takes an unsigned int, not a long */ +} + + +PyObject *PyCodec_RaiseUnicodeEncodeErrors(PyObject *self, PyObject *args) +{ + char *encoding; + Py_UNICODE *unicode; + int size; + int pos; + const char *reason; + PyObject *state; + + if (PyArg_ParseTuple(args, "su#isO:raise_unicodeencode_errors", + &encoding, &unicode, &size, &pos, &reason, &state)) + PyCodec_RaiseUnicodeEncodeError(encoding, unicode[pos], pos, reason); /* NOTE(review): pos is not checked against size before indexing -- confirm callers always pass a valid position */ + return NULL; /* always NULL: either argument parsing failed or the UnicodeError was just set */ +} + + +PyObject *PyCodec_IgnoreUnicodeEncodeErrors(PyObject *self, PyObject *args) +{ + PyObject *encoding; + PyObject *unicode; + int pos; + PyObject *reason; + PyObject *state; + + if (!PyArg_ParseTuple(args, "OOiOO:ignore_unicodeencode_errors", + &encoding, &unicode, &pos, &reason, &state)) + return NULL; + /* skip the unencodable character */ + ++pos; + /* ouch: passing NULL, 0, pos gives None instead of u'', so &pos only serves as a non-NULL dummy pointer for the zero-length string */ + return Py_BuildValue("(u#i)", &pos, 0, pos); +} + + +PyObject *PyCodec_ReplaceUnicodeEncodeErrors(PyObject *self, PyObject *args) +{ + PyObject *encoding; + PyObject *unicode; + int pos; + PyObject *reason; + PyObject *state; + Py_UNICODE res = '?'; + + if (!PyArg_ParseTuple(args, "OOiOO:replace_unicodeencode_errors", + &encoding, &unicode, &pos, &reason, &state)) + return NULL; + /* skip the unencodable character 
*/ + ++pos; + return Py_BuildValue("(u#i)", &res, 1, pos); +} + +static Py_UNICODE hexdigits[] = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' +}; + +PyObject *PyCodec_XMLCharRefReplaceUnicodeEncodeErrors(PyObject *self, PyObject *args) +{ + PyObject *encoding; + Py_UNICODE *unicode; + int size; + int pos; + PyObject *reason; + PyObject *state; + Py_UNICODE buf[4 + 2*sizeof(Py_UNICODE)]; /* "&#x", two hex digits per byte of Py_UNICODE, ";" */ + Py_UNICODE *p = buf; + Py_UNICODE c; + int shift; + + if (!PyArg_ParseTuple(args, "Ou#iOO:xmlcharrefreplace_unicodeencode_errors", + &encoding, &unicode, &size, &pos, &reason, &state)) + return NULL; + + c = unicode[pos]; /* NOTE(review): pos is not checked against size before indexing -- confirm callers always pass a valid position */ + *p++ = '&'; + *p++ = '#'; + *p++ = 'x'; + /* hex digits, most significant first, without leading zeros; the start shift + follows the width of Py_UNICODE so wide values can no longer index past hexdigits */ + for (shift = (int)(8*sizeof(c)) - 4; shift > 0 && (c >> shift) == 0; ) + shift -= 4; + for (; shift >= 0; shift -= 4) + *p++ = hexdigits[(c >> shift) & 0xf]; + *p++ = ';'; + + /* skip the unencodable character */ + ++pos; + return Py_BuildValue("(u#i)", buf, (int)(p-buf), pos); /* the # length is read as an int */ +} + +PyObject *PyCodec_EscapeReplaceUnicodeEncodeErrors(PyObject *self, PyObject *args) +{ + PyObject *encoding; + Py_UNICODE *unicode; + int size; + int pos; + PyObject *reason; + PyObject *state; + Py_UNICODE buf[10]; /* longest form: backslash, 'U' and eight hex digits */ + Py_UNICODE *p = buf; + Py_UNICODE c; + + if (!PyArg_ParseTuple(args, "Ou#iOO:escapereplace_unicodeencode_errors", + &encoding, &unicode, &size, &pos, &reason, &state)) + return NULL; + + c = unicode[pos]; /* NOTE(review): pos is not checked against size before indexing -- confirm callers always pass a valid position */ + *p++ = '\\'; + if (c >= 0x00010000) { + *p++ = 'U'; + *p++ = hexdigits[(c>>28)&0xf]; + *p++ = hexdigits[(c>>24)&0xf]; + *p++ = hexdigits[(c>>20)&0xf]; + *p++ = hexdigits[(c>>16)&0xf]; + } + else + *p++ = (c >= 0x100) ? 'u' : 'x'; + /* \U and \u both need the two middle digits (previously dropped for \U); \x does not */ + if (c >= 0x100) { + *p++ = hexdigits[(c>>12)&0xf]; + *p++ = hexdigits[(c>>8)&0xf]; + } + *p++ = hexdigits[(c>>4)&0xf]; + *p++ = hexdigits[c&0xf]; + + /* skip the unencodable character */ + ++pos; + return Py_BuildValue("(u#i)", buf, (int)(p-buf), pos); /* the # length is read as an int */ +} + +static PyObject *_PyCodec_UnicodeDecodeErrorHandlerRegistry; + +int PyCodec_RegisterUnicodeDecodeErrorHandler(char *name, PyObject *error) +{ + 
if (!PyCallable_Check(error)) { + PyErr_SetString(PyExc_TypeError, + "handler must be callable"); + return -1; + } + return PyDict_SetItemString(_PyCodec_UnicodeDecodeErrorHandlerRegistry, name, error); +} + +PyObject *PyCodec_LookupUnicodeDecodeErrorHandler(PyObject *error) +{ + PyObject *handler = NULL; + + if (error==NULL || error==Py_None) + handler = PyDict_GetItemString(_PyCodec_UnicodeDecodeErrorHandlerRegistry, "strict"); + else if (PyCallable_Check(error)) + handler = error; + else { + handler = PyDict_GetItem(_PyCodec_UnicodeDecodeErrorHandlerRegistry, error); + if (!handler) + PyErr_SetString(PyExc_LookupError, "unknown error handler name"); + } + Py_XINCREF(handler); /* the dict lookups give borrowed references; the caller receives a new one */ + return handler; +} + +void PyCodec_RaiseUnicodeDecodeError(const char *encoding, char c, int pos, const char *reason) +{ + PyErr_Format(PyExc_UnicodeError, + "encoding '%.400s' can't decode byte 0x%x in position %d: %.400s", + encoding, ((unsigned int)c)&0xff, pos, reason); /* %x takes an unsigned int, not a long; the mask drops sign extension of char */ +} + + +PyObject *PyCodec_RaiseUnicodeDecodeErrors(PyObject *self, PyObject *args) +{ + char *encoding; + char *str; + int size; + int pos; + const char *reason; + PyObject *state; + + if (PyArg_ParseTuple(args, "ss#isO:raise_unicodedecode_errors", + &encoding, &str, &size, &pos, &reason, &state)) + PyCodec_RaiseUnicodeDecodeError(encoding, str[pos], pos, reason); /* NOTE(review): pos is not checked against size before indexing -- confirm callers always pass a valid position */ + return NULL; /* always NULL: either argument parsing failed or the UnicodeError was just set */ +} + + +PyObject *PyCodec_IgnoreUnicodeDecodeErrors(PyObject *self, PyObject *args) +{ + PyObject *encoding; + PyObject *str; + int pos; + PyObject *reason; + PyObject *state; + + if (!PyArg_ParseTuple(args, "OOiOO:ignore_unicodedecode_errors", + &encoding, &str, &pos, &reason, &state)) + return NULL; + /* skip the undecodable character */ + ++pos; + /* ouch: passing NULL, 0, pos gives None instead of u'', so &pos only serves as a non-NULL dummy pointer for the zero-length string */ + return Py_BuildValue("(u#i)", &pos, 0, pos); +} + + +PyObject *PyCodec_ReplaceUnicodeDecodeErrors(PyObject *self, PyObject *args) +{ + PyObject *encoding; + PyObject *str; + int pos; + PyObject *reason; + PyObject *state; + Py_UNICODE res = 
Py_UNICODE_REPLACEMENT_CHARACTER; + + if (!PyArg_ParseTuple(args, "OOiOO:replace_unicodedecode_errors", + &encoding, &str, &pos, &reason, &state)) + return NULL; + /* skip the undecodable character */ + ++pos; + return Py_BuildValue("(u#i)", &res, 1, pos); +} + void _PyCodecRegistry_Init(void) { + static struct { + char *name; + PyMethodDef def; + } methods[] = /* entries 0-4 are the encode error handlers, entries 5-7 the decode error handlers */ + { + { + "strict", + { + "raise_unicodeencode_errors", + PyCodec_RaiseUnicodeEncodeErrors, + METH_VARARGS + } + }, + { + "ignore", + { + "ignore_unicodeencode_errors", + PyCodec_IgnoreUnicodeEncodeErrors, + METH_VARARGS + } + }, + { + "replace", + { + "replace_unicodeencode_errors", + PyCodec_ReplaceUnicodeEncodeErrors, + METH_VARARGS + } + }, + { + "xmlcharrefreplace", + { + "xmlcharrefreplace_unicodeencode_errors", + PyCodec_XMLCharRefReplaceUnicodeEncodeErrors, + METH_VARARGS + } + }, + { + "escapereplace", + { + "escapereplace_unicodeencode_errors", + PyCodec_EscapeReplaceUnicodeEncodeErrors, + METH_VARARGS + } + }, + { + "strict", + { + "raise_unicodedecode_errors", + PyCodec_RaiseUnicodeDecodeErrors, + METH_VARARGS + } + }, + { + "ignore", + { + "ignore_unicodedecode_errors", + PyCodec_IgnoreUnicodeDecodeErrors, + METH_VARARGS + } + }, + { + "replace", + { + "replace_unicodedecode_errors", + PyCodec_ReplaceUnicodeDecodeErrors, + METH_VARARGS + } + } + }; + if (_PyCodec_SearchPath == NULL) _PyCodec_SearchPath = PyList_New(0); if (_PyCodec_SearchCache == NULL) _PyCodec_SearchCache = PyDict_New(); + if (_PyCodec_UnicodeEncodeErrorHandlerRegistry == NULL) { + int i; + _PyCodec_UnicodeEncodeErrorHandlerRegistry = PyDict_New(); + + if (_PyCodec_UnicodeEncodeErrorHandlerRegistry) { + for (i = 0; i < 5; ++i) { /* indices 0..4: the encode handlers at the start of methods[] */ + PyObject *func = PyCFunction_New(&methods[i].def, NULL); + int res; + if (!func) + Py_FatalError("can't initialize codec registry"); + res = PyCodec_RegisterUnicodeEncodeErrorHandler(methods[i].name, func); + Py_DECREF(func); /* drop the local reference; registration stores the handler in the registry dict */ + if (res) + Py_FatalError("can't initialize codec registry"); + } + } + } + 
if (_PyCodec_UnicodeDecodeErrorHandlerRegistry == NULL) { + int i; + _PyCodec_UnicodeDecodeErrorHandlerRegistry = PyDict_New(); + + if (_PyCodec_UnicodeDecodeErrorHandlerRegistry) { + for (i = 5; i < 8; ++i) { /* indices 5..7: the decode handlers at the end of methods[] */ + PyObject *func = PyCFunction_New(&methods[i].def, NULL); + int res; + if (!func) + Py_FatalError("can't initialize codec registry"); + res = PyCodec_RegisterUnicodeDecodeErrorHandler(methods[i].name, func); + Py_DECREF(func); /* drop the local reference; registration stores the handler in the registry dict */ + if (res) + Py_FatalError("can't initialize codec registry"); + } + } + } if (_PyCodec_SearchPath == NULL || - _PyCodec_SearchCache == NULL) + _PyCodec_SearchCache == NULL || + _PyCodec_UnicodeEncodeErrorHandlerRegistry == NULL || + _PyCodec_UnicodeDecodeErrorHandlerRegistry == NULL) /* a failed PyDict_New for either registry is fatal as well */ Py_FatalError("can't initialize codec registry"); } @@ -433,4 +923,8 @@ _PyCodec_SearchPath = NULL; Py_XDECREF(_PyCodec_SearchCache); _PyCodec_SearchCache = NULL; + Py_XDECREF(_PyCodec_UnicodeEncodeErrorHandlerRegistry); + _PyCodec_UnicodeEncodeErrorHandlerRegistry = NULL; /* reset to NULL, the state in which _PyCodecRegistry_Init creates the registry */ + Py_XDECREF(_PyCodec_UnicodeDecodeErrorHandlerRegistry); + _PyCodec_UnicodeDecodeErrorHandlerRegistry = NULL; }