diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@@ -1286,11 +1286,14 @@ functions can be used directly if desire
    :synopsis: Windows ANSI codepage
 
 Encode operand according to the ANSI codepage (CP_ACP). This codec only
-supports ``'strict'`` and ``'replace'`` error handlers to encode, and
-``'strict'`` and ``'ignore'`` error handlers to decode.
+supports ``'strict'`` and ``'replace'`` error handlers to encode.
 
 Availability: Windows only.
 
+.. versionchanged:: 3.3
+   Decoding supports any error handler. On Windows Vista and later, the
+   ``'ignore'`` error handler drops undecodable bytes instead of replacing them.
+
 .. versionchanged:: 3.2
    Before 3.2, the *errors* argument was ignored; ``'replace'`` was always used
    to encode, and ``'ignore'`` to decode.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4951,25 +4951,216 @@ PyUnicode_AsASCIIString(PyObject *unicod
 #endif
 
 /* XXX This code is limited to "true" double-byte encodings, as
-   a) it assumes an incomplete character consists of a single byte, and
-   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
-      encodings, see IsDBCSLeadByteEx documentation. */
+   it assumes an incomplete character consists of a single byte. */
 
 static int
-is_dbcs_lead_byte(const char *s, int offset)
+is_dbcs_lead_byte(const char *s, int offset, UINT code_page)
 {
     const char *curr = s + offset;
-
-    if (IsDBCSLeadByte(*curr)) {
-        const char *prev = CharPrev(s, curr);
-        return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
-    }
-    return 0;
+    const char *prev;
+    /* Win32 argument order is IsDBCSLeadByteEx(CodePage, TestChar) */
+    if (!IsDBCSLeadByteEx(code_page, *curr))
+        return 0;
+    /* curr starts a character unless it is the trail byte of prev */
+    prev = CharPrevExA(code_page, s, curr, 0);
+    return ((prev == curr)
+            || !IsDBCSLeadByteEx(code_page, *prev)
+            || (curr - prev == 2));
 }
 
 /*
- * Decode MBCS string into unicode object. If 'final' is set, converts
- * trailing lead-byte too. 
Returns consumed size if succeed, -1 otherwise.
+ * Strictly decode a byte string from a Windows code page, creating *v or
+ * extending it. Unless 'final' is set, a trailing lead byte is not consumed.
+ *
+ * Returns the consumed size on success, -2 on a decode error (no exception
+ * set), or sets WindowsError or MemoryError and returns -1 on other errors.
+ */
+static int
+decode_mbcs_strict(PyUnicodeObject **v,
+                   const char *s, /* MBCS string */
+                   int size, /* sizeof MBCS string */
+                   UINT code_page,
+                   int final)
+{
+    const DWORD flags = MB_ERR_INVALID_CHARS;
+    Py_UNICODE *p;
+    Py_ssize_t n;
+    DWORD usize = 0;
+
+    assert(size >= 0);
+    /* Skip trailing lead-byte unless 'final' is set */
+    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1, code_page))
+        --size;
+
+    /* First get the size of the result; 0 means the conversion failed */
+    if (size > 0) {
+        usize = MultiByteToWideChar(code_page, flags, s, size, NULL, 0);
+        if (usize == 0)
+            goto error;
+    }
+
+    if (*v == NULL) {
+        /* Create unicode object */
+        *v = _PyUnicode_New(usize);
+        if (*v == NULL)
+            return -1;
+        n = 0;
+    }
+    else {
+        /* Extend unicode object, refusing a length that would overflow */
+        n = PyUnicode_GET_SIZE(*v);
+        if ((Py_ssize_t)usize > PY_SSIZE_T_MAX - n) {
+            PyErr_NoMemory();
+            return -1;
+        }
+        if (_PyUnicode_Resize(v, n + usize) < 0)
+            return -1;
+    }
+
+    /* Do the conversion (nothing to convert for empty input) */
+    if (usize == 0)
+        return size;
+    p = PyUnicode_AS_UNICODE(*v) + n;
+    usize = MultiByteToWideChar(code_page, flags, s, size, p, usize);
+    if (usize != 0)
+        return size;
+
+error:
+    if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
+        PyErr_SetFromWindowsErrWithFilename(0, NULL);
+        return -1;
+    }
+    return -2;
+}
+
+/*
+ * Decode a byte string from a code page into unicode object with an error
+ * handler.
+ *
+ * Returns consumed size if succeed, or raise a WindowsError or
+ * UnicodeDecodeError exception and returns -1 on error.
+ */
+static int
+decode_mbcs_errors(PyUnicodeObject **v,
+                   const char *in, /* MBCS string */
+                   int size, /* sizeof MBCS string */
+                   UINT code_page,
+                   const char *errors)
+{
+    const char *startin = in;
+    const char *endin = in + size;
+    DWORD flags;
+    /* FIXME: Are 10 characters enough? */
+    wchar_t buffer[10], *out;
+    int insize;
+    int outsize;
+    int ret = -1; /* consumed input size, -1 on error */
+    const char *reason = "No mapping for the Unicode character exists "
+                         "in the target code page.";
+    Py_ssize_t startinpos;
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+    OSVERSIONINFOEX ver;
+
+    assert(size >= 0);
+
+    if (errors == NULL || strcmp(errors, "strict") == 0) {
+        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
+           UnicodeDecodeError. Ideally, we should get reason from
+           FormatMessage. This is the Windows 2000 English version of the
+           message. */
+        make_decode_exception(&exc, "mbcs", in, size, 0, 0, reason);
+        if (exc != NULL) {
+            PyCodec_StrictErrors(exc);
+            Py_DECREF(exc);
+        }
+        return -1;
+    }
+
+    /* With MB_ERR_INVALID_CHARS invalid bytes fail and reach the error
+       handler; it is cleared for the handler/Windows version pairs below. */
+    flags = MB_ERR_INVALID_CHARS;
+    ver.dwOSVersionInfoSize = sizeof(ver);
+    if (!GetVersionEx((OSVERSIONINFO*)&ver)) {
+        PyErr_SetFromWindowsErr(0);
+        return -1;
+    }
+    if (strcmp(errors, "replace") == 0 && ver.dwMajorVersion >= 6)
+        flags = 0;
+    else if (strcmp(errors, "ignore") == 0 && ver.dwMajorVersion < 6)
+        flags = 0;
+
+    if (*v == NULL) {
+        /* Create unicode object */
+        *v = _PyUnicode_New(size);
+        if (*v == NULL)
+            return -1;
+        out = PyUnicode_AS_UNICODE(*v);
+    }
+    else {
+        /* Extend unicode object */
+        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
+        /* FIXME: is +size enough? check for n+size integer overflow */
+        if (_PyUnicode_Resize(v, n+ size) < 0)
+            return -1;
+        out = PyUnicode_AS_UNICODE(*v) + n;
+    }
+
+    while (in < endin)
+    {
+        /* Convert a DBCS lead byte together with its trail byte */
+        insize = 1;
+        if (endin - in >= 2 && IsDBCSLeadByteEx(code_page, *in))
+            insize = 2;
+        outsize = MultiByteToWideChar(code_page, flags,
+                                      in, insize,
+                                      buffer, sizeof(buffer)/sizeof(buffer[0]));
+        if (outsize <= 0) {
+            Py_ssize_t endinpos, outpos;
+
+            if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
+                PyErr_SetFromWindowsErrWithFilename(0, NULL);
+                goto done;
+            }
+            startinpos = in-startin;
+            endinpos = startinpos + 1;
+            outpos = out - PyUnicode_AS_UNICODE(*v);
+            if (unicode_decode_call_errorhandler(
+                    errors, &errorHandler,
+                    "mbcs", reason,
+                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
+                    v, &outpos, &out))
+            {
+                goto done;
+            }
+        } else {
+            in += insize;
+            memcpy(out, buffer, outsize * sizeof(wchar_t));
+            out += outsize;
+        }
+    }
+
+    /* write a NUL character at the end */
+    *out = 0;
+
+    /* Shrink to the characters written; return the input bytes consumed */
+    outsize = out - PyUnicode_AS_UNICODE(*v);
+    if (_PyUnicode_Resize(v, outsize) < 0)
+        goto done;
+    ret = (int)(in - startin);
+
+done:
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return ret;
+}
+
+/*
+ * Decode a byte string from a Windows code page into unicode object. If
+ * 'final' is set, converts trailing lead-byte too.
+ *
+ * Returns consumed size if succeed, or raise a WindowsError or
+ * UnicodeDecodeError exception and returns -1 on error.
*/
 static int
 decode_mbcs(PyUnicodeObject **v,
@@ -4978,81 +5169,20 @@ decode_mbcs(PyUnicodeObject **v,
             int final,
             const char *errors)
 {
-    Py_UNICODE *p;
-    Py_ssize_t n;
-    DWORD usize;
-    DWORD flags;
-
-    assert(size >= 0);
-
-    /* check and handle 'errors' arg */
-    if (errors==NULL || strcmp(errors, "strict")==0)
-        flags = MB_ERR_INVALID_CHARS;
-    else if (strcmp(errors, "ignore")==0)
-        flags = 0;
-    else {
-        PyErr_Format(PyExc_ValueError,
-                     "mbcs encoding does not support errors='%s'",
-                     errors);
-        return -1;
-    }
-
-    /* Skip trailing lead-byte unless 'final' is set */
-    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
-        --size;
-
-    /* First get the size of the result */
-    if (size > 0) {
-        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
-        if (usize==0)
-            goto mbcs_decode_error;
-    } else
-        usize = 0;
-
-    if (*v == NULL) {
-        /* Create unicode object */
-        *v = _PyUnicode_New(usize);
-        if (*v == NULL)
-            return -1;
-        n = 0;
-    }
-    else {
-        /* Extend unicode object */
-        n = PyUnicode_GET_SIZE(*v);
-        if (_PyUnicode_Resize(v, n + usize) < 0)
-            return -1;
-    }
-
-    /* Do the conversion */
-    if (usize > 0) {
-        p = PyUnicode_AS_UNICODE(*v) + n;
-        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
-            goto mbcs_decode_error;
-        }
-    }
-    return size;
-
-mbcs_decode_error:
-    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
-       we raise a UnicodeDecodeError - else it is a 'generic'
-       windows error
-     */
-    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
-        /* Ideally, we should get reason from FormatMessage - this
-           is the Windows 2000 English version of the message
-        */
-        PyObject *exc = NULL;
-        const char *reason = "No mapping for the Unicode character exists "
-                             "in the target multi-byte code page.";
-        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
-        if (exc != NULL) {
-            PyCodec_StrictErrors(exc);
-            Py_DECREF(exc);
-        }
-    } else {
-        PyErr_SetFromWindowsErrWithFilename(0, NULL);
-    }
-    return -1;
+    const UINT code_page = CP_ACP;
+    int done;
+
+    /* First try a strict conversion of the whole input */
+    done = decode_mbcs_strict(v, s, size, code_page, final);
+    if (done == -2) {
+        /* Undecodable input: decode again through the error handler. Like
+           the strict path, leave a trailing lead byte unconsumed unless
+           'final' is set. */
+        if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1, code_page))
+            --size;
+        done = decode_mbcs_errors(v, s, size, code_page, errors);
+    }
+    return done;
 }
 
 PyObject *