diff -r a17710e27ea2 Doc/library/codecs.rst --- a/Doc/library/codecs.rst Fri Jun 10 12:48:13 2011 +0200 +++ b/Doc/library/codecs.rst Wed Jun 08 17:26:50 2011 +0200 @@ -1286,11 +1286,14 @@ :synopsis: Windows ANSI codepage Encode operand according to the ANSI codepage (CP_ACP). This codec only -supports ``'strict'`` and ``'replace'`` error handlers to encode, and -``'strict'`` and ``'ignore'`` error handlers to decode. +supports ``'strict'`` and ``'replace'`` error handlers to encode. Availability: Windows only. +.. versionchanged:: 3.3 + Decoding supports any error handler. The ``'ignore'`` error handler ignores + undecodable bytes, instead of replacing them, on Windows Vista and later. + .. versionchanged:: 3.2 Before 3.2, the *errors* argument was ignored; ``'replace'`` was always used to encode, and ``'ignore'`` to decode. diff -r a17710e27ea2 Include/unicodeobject.h --- a/Include/unicodeobject.h Fri Jun 10 12:48:13 2011 +0200 +++ b/Include/unicodeobject.h Wed Jun 08 17:26:50 2011 +0200 @@ -1179,6 +1179,14 @@ Py_ssize_t *consumed /* bytes consumed */ ); +PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful( + int code_page, /* code page number */ + const char *string, /* MBCS encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + Py_ssize_t *consumed /* bytes consumed */ + ); + PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( PyObject *unicode /* Unicode object */ ); @@ -1186,7 +1194,14 @@ #ifndef Py_LIMITED_API PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( const Py_UNICODE *data, /* Unicode char buffer */ - Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ + Py_ssize_t length, /* number of Py_UNICODE chars to encode */ + const char *errors /* error handling */ + ); + +PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( + int code_page, /* code page number */ + const Py_UNICODE *data, /* Unicode char buffer */ + Py_ssize_t length, /* number of Py_UNICODE chars to encode */ const char *errors /* error handling */ ); 
#endif diff -r a17710e27ea2 Modules/_codecsmodule.c --- a/Modules/_codecsmodule.c Fri Jun 10 12:48:13 2011 +0200 +++ b/Modules/_codecsmodule.c Wed Jun 08 17:26:50 2011 +0200 @@ -613,6 +613,32 @@ return codec_tuple(decoded, consumed); } +/* code_page_decode(code_page, data[, errors[, final]]): decode data with the given Windows code page and return codec_tuple(decoded, consumed). */ +static PyObject * +code_page_decode(PyObject *self, + PyObject *args) +{ + Py_buffer pbuf; + const char *errors = NULL; + int final = 0; + Py_ssize_t consumed; + PyObject *decoded = NULL; + int code_page; + + if (!PyArg_ParseTuple(args, "iy*|zi:code_page_decode", + &code_page, &pbuf, &errors, &final)) + return NULL; + consumed = pbuf.len; + + decoded = PyUnicode_DecodeCodePageStateful(code_page, + pbuf.buf, pbuf.len, errors, + final ? NULL : &consumed); + PyBuffer_Release(&pbuf); + if (decoded == NULL) + return NULL; + return codec_tuple(decoded, consumed); +} + #endif /* MS_WINDOWS */ /* --- Encoder ------------------------------------------------------------ */ @@ -1014,6 +1040,32 @@ return v; } +/* code_page_encode(code_page, str[, errors]): encode str with the given Windows code page and return codec_tuple(encoded, len(str)). */ +static PyObject * +code_page_encode(PyObject *self, + PyObject *args) +{ + PyObject *str, *v; + const char *errors = NULL; + int code_page; + + if (!PyArg_ParseTuple(args, "iO|z:code_page_encode", + &code_page, &str, &errors)) + return NULL; + + str = PyUnicode_FromObject(str); + if (str == NULL) + return NULL; + v = codec_tuple(PyUnicode_EncodeCodePage( + code_page, + PyUnicode_AS_UNICODE(str), + PyUnicode_GET_SIZE(str), + errors), + PyUnicode_GET_SIZE(str)); + Py_DECREF(str); + return v; +} + #endif /* MS_WINDOWS */ /* --- Error handler registry --------------------------------------------- */ @@ -1104,6 +1156,8 @@ #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) {"mbcs_encode", mbcs_encode, METH_VARARGS}, {"mbcs_decode", mbcs_decode, METH_VARARGS}, + {"code_page_encode", code_page_encode, METH_VARARGS}, + {"code_page_decode", code_page_decode, METH_VARARGS}, #endif {"register_error", register_error, METH_VARARGS, register_error__doc__}, diff -r a17710e27ea2 Objects/unicodeobject.c --- a/Objects/unicodeobject.c Fri Jun 10 12:48:13 
2011 +0200 +++ b/Objects/unicodeobject.c Wed Jun 08 17:26:50 2011 +0200 @@ -4950,62 +4950,59 @@ #define NEED_RETRY #endif +/* Forward */ +static PyObject * +encode_code_page(int code_page, const Py_UNICODE *p, Py_ssize_t size, + const char *errors); + /* XXX This code is limited to "true" double-byte encodings, as - a) it assumes an incomplete character consists of a single byte, and - b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte - encodings, see IsDBCSLeadByteEx documentation. */ + it assumes an incomplete character consists of a single byte. Note that IsDBCSLeadByteEx() takes the code page first, then the byte to test. */ static int -is_dbcs_lead_byte(const char *s, int offset) +is_dbcs_lead_byte(const char *s, int offset, UINT code_page) { const char *curr = s + offset; - - if (IsDBCSLeadByte(*curr)) { - const char *prev = CharPrev(s, curr); - return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); - } - return 0; + const char *prev; + + if (!IsDBCSLeadByteEx(code_page, *curr)) + return 0; + + prev = CharPrevExA(code_page, s, curr, 0); + return ((prev == curr) + || !IsDBCSLeadByteEx(code_page, *prev) + || (curr - prev == 2)); } /* - * Decode MBCS string into unicode object. If 'final' is set, converts - * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. + * Decode a byte string from a Windows code page into unicode object. If + * 'final' is set, converts trailing lead-byte too. + * + * Returns consumed size if succeed, returns -2 on decode error, or raise a + * WindowsError and returns -1 on other error. 
*/ static int -decode_mbcs(PyUnicodeObject **v, - const char *s, /* MBCS string */ - int size, /* sizeof MBCS string */ - int final, - const char *errors) -{ +decode_code_page_strict(PyUnicodeObject **v, + const char *s, /* MBCS string */ + int size, /* sizeof MBCS string */ + UINT code_page, + int final) +{ + const DWORD flags = MB_ERR_INVALID_CHARS; Py_UNICODE *p; Py_ssize_t n; DWORD usize; - DWORD flags; assert(size >= 0); - /* check and handle 'errors' arg */ - if (errors==NULL || strcmp(errors, "strict")==0) - flags = MB_ERR_INVALID_CHARS; - else if (strcmp(errors, "ignore")==0) - flags = 0; - else { - PyErr_Format(PyExc_ValueError, - "mbcs encoding does not support errors='%s'", - errors); - return -1; - } - /* Skip trailing lead-byte unless 'final' is set */ - if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) + if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1, code_page)) --size; /* First get the size of the result */ if (size > 0) { - usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0); - if (usize==0) - goto mbcs_decode_error; + usize = MultiByteToWideChar(code_page, flags, s, size, NULL, 0); + if (usize == 0) + goto error; } else usize = 0; @@ -5019,40 +5016,228 @@ else { /* Extend unicode object */ n = PyUnicode_GET_SIZE(*v); + /* FIXME: check for n+usize integer overflow */ if (_PyUnicode_Resize(v, n + usize) < 0) return -1; } /* Do the conversion */ - if (usize > 0) { - p = PyUnicode_AS_UNICODE(*v) + n; - if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) { - goto mbcs_decode_error; - } - } - return size; - -mbcs_decode_error: - /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then - we raise a UnicodeDecodeError - else it is a 'generic' - windows error - */ - if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) { - /* Ideally, we should get reason from FormatMessage - this - is the Windows 2000 English version of the message - */ - PyObject *exc = NULL; - const char *reason = "No mapping for the Unicode 
character exists " - "in the target multi-byte code page."; - make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason); + if (usize <= 0) + return size; + + p = PyUnicode_AS_UNICODE(*v) + n; + usize = MultiByteToWideChar(code_page, flags, s, size, p, usize); + if (usize != 0) + return size; + +error: + if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { + PyErr_SetFromWindowsErrWithFilename(0, NULL); + return -1; + } + return -2; +} + +/* + * Decode a byte string from a code page into unicode object with an error + * handler. + * + * Returns consumed size if succeed, or raise a WindowsError or + * UnicodeDecodeError exception and returns -1 on error. + */ +static int +decode_code_page_errors(PyUnicodeObject **v, + const char *in, /* MBCS string */ + int size, /* sizeof MBCS string */ + UINT code_page, + const char *errors) +{ + const char *startin = in; + const char *endin = in + size; + DWORD flags; + /* FIXME: Are 10 characters enough? */ + wchar_t buffer[10], *startout, *out; + int insize; + int outsize; + const char *reason = "No mapping for the Unicode character exists " + "in the target code page."; + Py_ssize_t startinpos; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + OSVERSIONINFOEX ver; + + assert(size >= 0); + + if (errors == NULL || strcmp(errors, "strict") == 0) { + /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a + UnicodeDecodeError. Ideally, we should get reason from + FormatMessage. This is the Windows 2000 English version of the + message. 
*/ + make_decode_exception(&exc, "mbcs", in, size, 0, 0, reason); if (exc != NULL) { PyCodec_StrictErrors(exc); Py_DECREF(exc); } - } else { - PyErr_SetFromWindowsErrWithFilename(0, NULL); - } - return -1; + return -1; + } + + ver.dwOSVersionInfoSize = sizeof(ver); + if (!GetVersionEx((OSVERSIONINFO*)&ver)) { + PyErr_SetFromWindowsErr(0); + return -1; + } + if (strcmp(errors, "replace") == 0 && ver.dwMajorVersion >= 6) + flags = 0; + else if (strcmp(errors, "ignore") == 0 && ver.dwMajorVersion < 6) + flags = 0; + else + flags = MB_ERR_INVALID_CHARS; + + if (*v == NULL) { + /* Create unicode object */ + *v = _PyUnicode_New(size); + if (*v == NULL) + return -1; + startout = PyUnicode_AS_UNICODE(*v); + } + else { + /* Extend unicode object */ + Py_ssize_t n = PyUnicode_GET_SIZE(*v); + /* FIXME: is +size enough? */ + /* FIXME: check for n+size integer overflow */ + if (_PyUnicode_Resize(v, n+ size) < 0) + return -1; + startout = PyUnicode_AS_UNICODE(*v) + n; + } + + out = startout; + while (in < endin) + { + /* Do the conversion */ + insize = 1; + outsize = MultiByteToWideChar(code_page, flags, + in, insize, + buffer, sizeof(buffer)/sizeof(buffer[0])); + if (outsize <= 0) { + Py_ssize_t endinpos; + Py_ssize_t outpos; + + if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { + PyErr_SetFromWindowsErrWithFilename(0, NULL); + goto error; + } + + startinpos = in-startin; + endinpos = startinpos + 1; + outpos = out - PyUnicode_AS_UNICODE(*v); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "mbcs", reason, + &startin, &endin, &startinpos, &endinpos, &exc, &in, + v, &outpos, &out)) + { + goto error; + } + } else { + in += insize; + memcpy(out, buffer, outsize * sizeof(wchar_t)); + out += outsize; + } + } + + /* write a NUL character at the end */ + *out = 0; + + /* Shrink *v to the decoded length (keeping any text it already held) and return the number of input bytes consumed */ + outsize = size; + if (_PyUnicode_Resize(v, out - PyUnicode_AS_UNICODE(*v)) < 0) + goto error; + goto done; + +error: + outsize = -1; +done: + Py_XDECREF(errorHandler); + 
Py_XDECREF(exc); + return outsize; +} + +/* + * Decode a byte string from a Windows code page into unicode object. If + * 'final' is set, converts trailing lead-byte too. + * + * Returns consumed size if succeed, or raise a WindowsError or + * UnicodeDecodeError exception and returns -1 on error. + */ +static int +decode_code_page(UINT code_page, + PyUnicodeObject **v, + const char *s, /* MBCS string */ + int size, /* sizeof MBCS string */ + int final, + const char *errors) +{ + int done; + + done = decode_code_page_strict(v, s, size, code_page, final); + if (done == -2) + done = decode_code_page_errors(v, s, size, code_page, errors); + return done; +} + +static PyObject * +decode_code_page_stateful(int code_page, + const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed) +{ + PyUnicodeObject *v = NULL; + int done; + + if (code_page < 0) { + PyErr_SetString(PyExc_ValueError, "invalid code page number"); + return NULL; + } + + if (consumed) + *consumed = 0; + +#ifdef NEED_RETRY + retry: + if (size > INT_MAX) + done = decode_code_page(code_page, &v, s, INT_MAX, 0, errors); + else +#endif + done = decode_code_page(code_page, &v, s, (int)size, !consumed, errors); + + if (done < 0) { + Py_XDECREF(v); + return NULL; + } + + if (consumed) + *consumed += done; + +#ifdef NEED_RETRY + if (size > INT_MAX) { + s += done; + size -= done; + goto retry; + } +#endif + + return (PyObject *)v; +} + +PyObject * +PyUnicode_DecodeCodePageStateful(int code_page, + const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed) +{ + return decode_code_page_stateful(code_page, s, size, errors, consumed); } PyObject * @@ -5061,37 +5246,7 @@ const char *errors, Py_ssize_t *consumed) { - PyUnicodeObject *v = NULL; - int done; - - if (consumed) - *consumed = 0; - -#ifdef NEED_RETRY - retry: - if (size > INT_MAX) - done = decode_mbcs(&v, s, INT_MAX, 0, errors); - else -#endif - done = decode_mbcs(&v, s, (int)size, !consumed, errors); - - if (done < 0) { - 
Py_XDECREF(v); - return NULL; - } - - if (consumed) - *consumed += done; - -#ifdef NEED_RETRY - if (size > INT_MAX) { - s += done; - size -= done; - goto retry; - } -#endif - - return (PyObject *)v; + return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); } PyObject * @@ -5102,42 +5257,24 @@ return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); } -/* - * Convert unicode into string object (MBCS). - * Returns 0 if succeed, -1 otherwise. - */ static int -encode_mbcs(PyObject **repr, - const Py_UNICODE *p, /* unicode */ - int size, /* size of unicode */ - const char* errors) +encode_code_page_strict(UINT code_page, + PyObject **repr, + const Py_UNICODE *p, /* unicode */ + int size) /* size of unicode */ { BOOL usedDefaultChar = FALSE; - BOOL *pusedDefaultChar; + BOOL *pusedDefaultChar = &usedDefaultChar; int mbcssize; Py_ssize_t n; PyObject *exc = NULL; - DWORD flags; + const DWORD flags = WC_NO_BEST_FIT_CHARS; assert(size >= 0); - /* check and handle 'errors' arg */ - if (errors==NULL || strcmp(errors, "strict")==0) { - flags = WC_NO_BEST_FIT_CHARS; - pusedDefaultChar = &usedDefaultChar; - } else if (strcmp(errors, "replace")==0) { - flags = 0; - pusedDefaultChar = NULL; - } else { - PyErr_Format(PyExc_ValueError, - "mbcs encoding does not support errors='%s'", - errors); - return -1; - } - /* First get the size of the result */ if (size > 0) { - mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0, + mbcssize = WideCharToMultiByte(code_page, flags, p, size, NULL, 0, NULL, pusedDefaultChar); if (mbcssize == 0) { PyErr_SetFromWindowsErrWithFilename(0, NULL); @@ -5145,7 +5282,7 @@ } /* If we used a default char, then we failed! 
*/ if (pusedDefaultChar && *pusedDefaultChar) - goto mbcs_encode_error; + return -2; } else { mbcssize = 0; } @@ -5167,37 +5304,219 @@ /* Do the conversion */ if (size > 0) { char *s = PyBytes_AS_STRING(*repr) + n; - if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize, + if (0 == WideCharToMultiByte(code_page, flags, p, size, s, mbcssize, NULL, pusedDefaultChar)) { PyErr_SetFromWindowsErrWithFilename(0, NULL); return -1; } if (pusedDefaultChar && *pusedDefaultChar) - goto mbcs_encode_error; + return -2; } return 0; - -mbcs_encode_error: - raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character"); +} + +static int +encode_code_page_errors(UINT code_page, + PyObject **repr, + const Py_UNICODE *in, /* unicode */ + int size, /* size of unicode */ + const char* errors) +{ + BOOL usedDefaultChar = FALSE; + BOOL *pusedDefaultChar; + int outsize; + DWORD flags; + /* FIXME: is it enough? */ + char buffer[10]; + const Py_UNICODE *startin = in, *endin = in + size; + char *out, *startout; + int insize; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + /* Ideally, we should get reason from FormatMessage. This is the Windows + 2000 English version of the message. */ + const char *reason = "invalid character"; + PyObject *encoding_obj = NULL; + char *encoding; + int ret = -1; + OSVERSIONINFOEX ver; + + assert(size >= 0); + + if (code_page == CP_ACP) + encoding = "mbcs"; + else { + encoding_obj = PyBytes_FromFormat("cp%u", code_page); + if (encoding_obj == NULL) + return -1; + encoding = PyBytes_AS_STRING(encoding_obj); + } + + if (errors == NULL || strcmp(errors, "strict") == 0) { + /* The last error was ERROR_NO_UNICODE_TRANSLATION, + then we raise a UnicodeEncodeError. 
*/ + make_encode_exception(&exc, encoding, in, size, 0, 0, reason); + if (exc != NULL) { + PyCodec_StrictErrors(exc); + Py_DECREF(exc); + } + Py_XDECREF(encoding_obj); + return ret; + } + + + /* handle 'errors' arg */ + ver.dwOSVersionInfoSize = sizeof(ver); + if (!GetVersionEx((OSVERSIONINFO*)&ver)) { + PyErr_SetFromWindowsErr(0); + goto error; + } + if (strcmp(errors, "replace") == 0 && ver.dwMajorVersion >= 6) { + flags = 0; + pusedDefaultChar = NULL; + } + else if (strcmp(errors, "ignore") == 0 && ver.dwMajorVersion < 6){ + flags = 0; + pusedDefaultChar = NULL; + } + else { + flags = WC_NO_BEST_FIT_CHARS; + pusedDefaultChar = &usedDefaultChar; + } + + /* FIXME: is '*10' enough? */ + outsize = size * 10; + + if (*repr == NULL) { + /* Create string object */ + *repr = PyBytes_FromStringAndSize(NULL, outsize); + if (*repr == NULL) + goto error; + startout = PyBytes_AS_STRING(*repr); + } + else { + /* Extend string object */ + Py_ssize_t n = PyBytes_Size(*repr); + if (_PyBytes_Resize(repr, n + outsize) < 0) + goto error; + startout = PyBytes_AS_STRING(*repr) + n; + } + + /* Do the conversion */ + out = startout; + while (in < endin) + { + /* FIXME: support surrogates */ + insize = 1; + outsize = WideCharToMultiByte(code_page, flags, in, insize, buffer, sizeof(buffer), + NULL, pusedDefaultChar); + if (outsize <= 0) { + PyErr_SetFromWindowsErrWithFilename(0, NULL); + goto error; + } + if (pusedDefaultChar && *pusedDefaultChar) { + Py_ssize_t startpos, newpos; + PyObject *rep; + + startpos = in - startin; + rep = unicode_encode_call_errorhandler( + errors, &errorHandler, encoding, reason, + startin, size, &exc, + startpos, startpos+insize, &newpos); + if (rep == NULL) + goto error; + + in = startin + newpos; + + if (PyBytes_Check(rep)) { + outsize = PyBytes_GET_SIZE(rep); + memcpy(out, PyBytes_AS_STRING(rep), outsize); + out += outsize; + Py_DECREF(rep); + } else { + static int recursive = 0; + PyObject *bytes; + + if 
(recursive) { + PyErr_SetString(PyExc_RuntimeError, "recursive call"); + Py_DECREF(rep); + goto error; + } + recursive = 1; + bytes = encode_code_page(code_page, + PyUnicode_AS_UNICODE(rep), + PyUnicode_GET_SIZE(rep), + errors); + recursive = 0; + Py_DECREF(rep); + if (bytes == NULL) + goto error; + + outsize = PyBytes_GET_SIZE(bytes); + memcpy(out, PyBytes_AS_STRING(bytes), outsize); + out += outsize; + Py_DECREF(bytes); + } + } else { + in += insize; + memcpy(out, buffer, outsize); + out += outsize; + } + } + /* write a NUL byte */ + *out = 0; + outsize = out - PyBytes_AS_STRING(*repr); + if (_PyBytes_Resize(repr, outsize) < 0) + goto error; + ret = 0; + +error: + Py_XDECREF(encoding_obj); + Py_XDECREF(errorHandler); Py_XDECREF(exc); - return -1; -} - -PyObject * -PyUnicode_EncodeMBCS(const Py_UNICODE *p, - Py_ssize_t size, - const char *errors) + return ret; +} + +/* + * Convert unicode into string object (MBCS). + * Returns 0 if succeed, -1 otherwise. + */ +static int +encode_code_page_chunk(UINT code_page, + PyObject **repr, + const Py_UNICODE *p, /* unicode */ + int size, /* size of unicode */ + const char* errors) +{ + int done; + + done = encode_code_page_strict(code_page, repr, p, size); + if (done == -2) + done = encode_code_page_errors(code_page, repr, p, size, errors); + return done; +} + +static PyObject * +encode_code_page(int code_page, + const Py_UNICODE *p, + Py_ssize_t size, + const char *errors) { PyObject *repr = NULL; int ret; + if (code_page < 0) { + PyErr_SetString(PyExc_ValueError, "invalid code page number"); + return NULL; + } + #ifdef NEED_RETRY retry: if (size > INT_MAX) - ret = encode_mbcs(&repr, p, INT_MAX, errors); + ret = encode_code_page_chunk(code_page, &repr, p, INT_MAX, errors); else #endif - ret = encode_mbcs(&repr, p, (int)size, errors); + ret = encode_code_page_chunk(code_page, &repr, p, (int)size, errors); if (ret < 0) { Py_XDECREF(repr); @@ -5216,6 +5535,23 @@ } PyObject * +PyUnicode_EncodeMBCS(const Py_UNICODE *p, + Py_ssize_t size, + const char *errors) +{ + return 
encode_code_page(CP_ACP, p, size, errors); +} + +PyObject * +PyUnicode_EncodeCodePage(int code_page, + const Py_UNICODE *p, + Py_ssize_t size, + const char *errors) +{ + return encode_code_page(code_page, p, size, errors); +} + +PyObject * PyUnicode_AsMBCSString(PyObject *unicode) { if (!PyUnicode_Check(unicode)) {