diff -r d07d0afea9a7 Doc/library/codecs.rst --- a/Doc/library/codecs.rst Mon Jun 13 16:19:06 2011 +0200 +++ b/Doc/library/codecs.rst Thu Jun 16 02:24:38 2011 +0200 @@ -1285,12 +1285,13 @@ .. module:: encodings.mbcs :synopsis: Windows ANSI codepage -Encode operand according to the ANSI codepage (CP_ACP). This codec only -supports ``'strict'`` and ``'replace'`` error handlers to encode, and -``'strict'`` and ``'ignore'`` error handlers to decode. +Encode operand according to the ANSI codepage (CP_ACP). Availability: Windows only. +.. versionchanged:: 3.3 + Support any error handler. + .. versionchanged:: 3.2 Before 3.2, the *errors* argument was ignored; ``'replace'`` was always used to encode, and ``'ignore'`` to decode. diff -r d07d0afea9a7 Include/unicodeobject.h --- a/Include/unicodeobject.h Mon Jun 13 16:19:06 2011 +0200 +++ b/Include/unicodeobject.h Thu Jun 16 02:24:38 2011 +0200 @@ -1179,6 +1179,14 @@ Py_ssize_t *consumed /* bytes consumed */ ); +PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful( + int code_page, /* code page number */ + const char *string, /* encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + Py_ssize_t *consumed /* bytes consumed */ + ); + PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( PyObject *unicode /* Unicode object */ ); @@ -1186,7 +1194,14 @@ #ifndef Py_LIMITED_API PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( const Py_UNICODE *data, /* Unicode char buffer */ - Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ + Py_ssize_t length, /* number of Py_UNICODE chars to encode */ + const char *errors /* error handling */ + ); + +PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( + int code_page, /* code page number */ + const Py_UNICODE *data, /* Unicode char buffer */ + Py_ssize_t length, /* number of Py_UNICODE chars to encode */ const char *errors /* error handling */ ); #endif diff -r d07d0afea9a7 Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py Mon Jun 13 16:19:06 
2011 +0200 +++ b/Lib/test/test_codecs.py Thu Jun 16 02:24:38 2011 +0200 @@ -1718,6 +1718,142 @@ self.assertEqual(sout, b"\x80") +class CodePageTest(unittest.TestCase): + def test_invalid_code_page(self): + self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a') + self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a') + self.assertRaises(WindowsError, codecs.code_page_encode, 123, 'a') + self.assertRaises(WindowsError, codecs.code_page_decode, 123, b'a') + + def test_code_page_name(self): + self.assertRaisesRegex(UnicodeEncodeError, 'cp932', + codecs.code_page_encode, 932, '\xff') + self.assertRaisesRegex(UnicodeDecodeError, 'cp932', + codecs.code_page_decode, 932, b'\x81\x00') + self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8', + codecs.code_page_decode, 65001, b'\xff') + + def check_decode(self, cp, tests): + for raw, errors, expected in tests: + if expected is not None: + try: + decoded = codecs.code_page_decode(cp, raw, errors) + except UnicodeDecodeError as err: + self.fail('Unable to decode %a from "cp%s" with ' + 'errors=%r: %s' % (raw, cp, errors, err)) + self.assertEqual(decoded[0], expected, + '%a.decode("cp%s", %r)=%a != %a' + % (raw, cp, errors, decoded[0], expected)) + # assert 0 <= decoded[1] <= len(raw) + self.assertGreaterEqual(decoded[1], 0) + self.assertLessEqual(decoded[1], len(raw)) + else: + self.assertRaises(UnicodeDecodeError, + codecs.code_page_decode, cp, raw, errors) + + def check_encode(self, cp, tests): + for text, errors, expected in tests: + if expected is not None: + try: + encoded = codecs.code_page_encode(cp, text, errors) + except UnicodeEncodeError as err: + self.fail('Unable to encode %a to "cp%s" with ' + 'errors=%r: %s' % (text, cp, errors, err)) + self.assertEqual(encoded[0], expected) + self.assertEqual(encoded[1], len(text)) + else: + self.assertRaises(UnicodeEncodeError, + codecs.code_page_encode, cp, text, errors) + + def test_cp932(self): + self.check_encode(932, ( + ('abc', 'strict', b'abc'), + 
('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'), + # not encodable + ('\xff', 'strict', None), + ('[\xff]', 'ignore', b'[]'), + ('[\xff]', 'replace', b'[y]'), + ('[\u20ac]', 'replace', b'[?]'), + )) + self.check_decode(932, ( + (b'abc', 'strict', 'abc'), + (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'), + # invalid bytes + (b'\x81\x00abc', 'strict', None), + (b'\x81\x00abc', 'ignore', '\x00abc'), + (b'\x81\x00abc', 'replace', '\u30fb\x00abc'), + )) + + def test_cp1252(self): + self.check_encode(1252, ( + ('abc', 'strict', b'abc'), + ('\xe9\u20ac', 'strict', b'\xe9\x80'), + ('\xff', 'strict', b'\xff'), + )) + self.check_decode(1252, ( + (b'abc', 'strict', 'abc'), + (b'\xe9\x80', 'strict', '\xe9\u20ac'), + (b'\xff', 'strict', '\xff'), + )) + + def test_cp_utf7(self): + cp = 65000 + self.check_encode(cp, ( + ('abc', 'strict', b'abc'), + ('\xe9\u20ac', 'strict', b'+AOkgrA-'), + ('\U0010ffff', 'strict', b'+2//f/w-'), + ('\udc80', 'strict', b'+3IA-'), + ('\ufffd', 'strict', b'+//0-'), + )) + self.check_decode(cp, ( + (b'abc', 'strict', 'abc'), + (b'+AOkgrA-', 'strict', '\xe9\u20ac'), + (b'+2//f/w-', 'strict', '\U0010ffff'), + (b'+3IA-', 'strict', '\udc80'), + (b'+//0-', 'strict', '\ufffd'), + # invalid bytes + (b'[+/]', 'strict', '[]'), + (b'[\xff]', 'strict', '[\xff]'), + )) + + def test_cp_utf8(self): + vista_or_later = (sys.getwindowsversion().major >= 6) + cp = 65001 + + tests = [ + ('abc', 'strict', b'abc'), + ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'), + ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'), + ] + if vista_or_later: + tests.append(('\udc80', 'strict', None)) + else: + tests.append(('\udc80', 'strict', b'\xef\xbf\xbd')) + self.check_encode(cp, tests) + + tests = [ + (b'abc', 'strict', 'abc'), + (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'), + (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'), + (b'\xef\xbf\xbd', 'strict', '\ufffd'), + # invalid bytes + (b'[\xff]', 'strict', None), + (b'[\xff]', 'ignore', '[]'), + (b'[\xff]', 'replace', '[\ufffd]'), 
+ ] + if vista_or_later: + tests.extend(( + (b'[\xed\xb2\x80]', 'strict', None), + (b'[\xed\xb2\x80]', 'ignore', '[]'), + (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'), + )) + else: + tests.extend(( + (b'[\xed\xb2\x80]', 'strict', '[\udc80]'), + )) + self.check_decode(cp, tests) + + def test_main(): support.run_unittest( UTF32Test, @@ -1746,6 +1882,7 @@ SurrogateEscapeTest, BomTest, TransformCodecTest, + CodePageTest, ) diff -r d07d0afea9a7 Modules/_codecsmodule.c --- a/Modules/_codecsmodule.c Mon Jun 13 16:19:06 2011 +0200 +++ b/Modules/_codecsmodule.c Thu Jun 16 02:24:38 2011 +0200 @@ -613,6 +613,31 @@ return codec_tuple(decoded, consumed); } +static PyObject * +code_page_decode(PyObject *self, + PyObject *args) +{ + Py_buffer pbuf; + const char *errors = NULL; + int final = 0; + Py_ssize_t consumed; + PyObject *decoded = NULL; + int code_page; + + if (!PyArg_ParseTuple(args, "iy*|zi:code_page_decode", + &code_page, &pbuf, &errors, &final)) + return NULL; + consumed = pbuf.len; + + decoded = PyUnicode_DecodeCodePageStateful(code_page, + pbuf.buf, pbuf.len, errors, + final ? 
NULL : &consumed); + PyBuffer_Release(&pbuf); + if (decoded == NULL) + return NULL; + return codec_tuple(decoded, consumed); +} + #endif /* MS_WINDOWS */ /* --- Encoder ------------------------------------------------------------ */ @@ -1014,6 +1039,31 @@ return v; } +static PyObject * +code_page_encode(PyObject *self, + PyObject *args) +{ + PyObject *str, *v; + const char *errors = NULL; + int code_page; + + if (!PyArg_ParseTuple(args, "iO|z:code_page_encode", + &code_page, &str, &errors)) + return NULL; + + str = PyUnicode_FromObject(str); + if (str == NULL) + return NULL; + v = codec_tuple(PyUnicode_EncodeCodePage( + code_page, + PyUnicode_AS_UNICODE(str), + PyUnicode_GET_SIZE(str), + errors), + PyUnicode_GET_SIZE(str)); + Py_DECREF(str); + return v; +} + #endif /* MS_WINDOWS */ /* --- Error handler registry --------------------------------------------- */ @@ -1104,6 +1154,8 @@ #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) {"mbcs_encode", mbcs_encode, METH_VARARGS}, {"mbcs_decode", mbcs_decode, METH_VARARGS}, + {"code_page_encode", code_page_encode, METH_VARARGS}, + {"code_page_decode", code_page_decode, METH_VARARGS}, #endif {"register_error", register_error, METH_VARARGS, register_error__doc__}, diff -r d07d0afea9a7 Objects/unicodeobject.c --- a/Objects/unicodeobject.c Mon Jun 13 16:19:06 2011 +0200 +++ b/Objects/unicodeobject.c Thu Jun 16 02:24:38 2011 +0200 @@ -196,6 +196,10 @@ #endif } +#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) +static OSVERSIONINFOEX winver; +#endif + /* --- Bloom Filters ----------------------------------------------------- */ /* stuff to implement simple "bloom filters" for Unicode characters. 
@@ -4950,109 +4954,329 @@ #define NEED_RETRY #endif +#ifndef WC_ERR_INVALID_CHARS +# define WC_ERR_INVALID_CHARS 0x0080 +#endif + +static char* +code_page_name(UINT code_page, PyObject **obj) +{ + *obj = NULL; + if (code_page == CP_ACP) + return "mbcs"; + if (code_page == CP_UTF7) + return "CP_UTF7"; + if (code_page == CP_UTF8) + return "CP_UTF8"; + + *obj = PyBytes_FromFormat("cp%u", code_page); + if (*obj == NULL) + return NULL; + return PyBytes_AS_STRING(*obj); +} + +/* Forward */ +static PyObject * +encode_code_page(int code_page, const Py_UNICODE *p, Py_ssize_t size, + const char *errors); + /* XXX This code is limited to "true" double-byte encodings, as - a) it assumes an incomplete character consists of a single byte, and - b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte - encodings, see IsDBCSLeadByteEx documentation. */ + it assumes an incomplete character consists of a single byte. */ static int -is_dbcs_lead_byte(const char *s, int offset) +is_dbcs_lead_byte(const char *s, int offset, UINT code_page) { const char *curr = s + offset; - - if (IsDBCSLeadByte(*curr)) { - const char *prev = CharPrev(s, curr); - return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); - } - return 0; + const char *prev; + + if (!IsDBCSLeadByteEx(*curr, code_page)) + return 0; + + prev = CharPrevExA(code_page, s, curr, 0); + return ((prev == curr) + || !IsDBCSLeadByteEx(*prev, code_page) + || (curr - prev == 2)); } /* - * Decode MBCS string into unicode object. If 'final' is set, converts - * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. + * Decode a byte string from a Windows code page into unicode object in strict + * mode. + * + * Returns consumed size if succeed, returns -2 on decode error, or raise a + * WindowsError and returns -1 on other error. 
*/ static int -decode_mbcs(PyUnicodeObject **v, - const char *s, /* MBCS string */ - int size, /* sizeof MBCS string */ - int final, - const char *errors) -{ - Py_UNICODE *p; - Py_ssize_t n; - DWORD usize; +decode_code_page_strict(UINT code_page, + PyUnicodeObject **v, + const char *in, /* MBCS string */ + int insize) /* sizeof MBCS string */ +{ DWORD flags; - - assert(size >= 0); - - /* check and handle 'errors' arg */ - if (errors==NULL || strcmp(errors, "strict")==0) + Py_UNICODE *out; + DWORD outsize; + + assert(insize > 0); + + if (code_page == 65000) { + /* The CP_UTF7 decoder only supports flags=0 */ + flags = 0; + } + else flags = MB_ERR_INVALID_CHARS; - else if (strcmp(errors, "ignore")==0) - flags = 0; - else { - PyErr_Format(PyExc_ValueError, - "mbcs encoding does not support errors='%s'", - errors); - return -1; - } - - /* Skip trailing lead-byte unless 'final' is set */ - if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) - --size; /* First get the size of the result */ - if (size > 0) { - usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0); - if (usize==0) - goto mbcs_decode_error; - } else - usize = 0; + outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); + if (outsize <= 0) + goto error; if (*v == NULL) { /* Create unicode object */ - *v = _PyUnicode_New(usize); + *v = _PyUnicode_New(outsize); if (*v == NULL) return -1; - n = 0; + out = PyUnicode_AS_UNICODE(*v); } else { /* Extend unicode object */ - n = PyUnicode_GET_SIZE(*v); - if (_PyUnicode_Resize(v, n + usize) < 0) + Py_ssize_t n = PyUnicode_GET_SIZE(*v); + /* FIXME: check for n+outsize integer overflow */ + if (_PyUnicode_Resize(v, n + outsize) < 0) return -1; + out = PyUnicode_AS_UNICODE(*v) + n; } /* Do the conversion */ - if (usize > 0) { - p = PyUnicode_AS_UNICODE(*v) + n; - if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) { - goto mbcs_decode_error; - } - } - return size; - -mbcs_decode_error: - /* If the last error was 
ERROR_NO_UNICODE_TRANSLATION, then - we raise a UnicodeDecodeError - else it is a 'generic' - windows error - */ - if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) { - /* Ideally, we should get reason from FormatMessage - this - is the Windows 2000 English version of the message - */ - PyObject *exc = NULL; - const char *reason = "No mapping for the Unicode character exists " - "in the target multi-byte code page."; - make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason); + outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); + if (outsize <= 0) + goto error; + return insize; + +error: + if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) + return -2; + PyErr_SetFromWindowsErrWithFilename(0, NULL); + return -1; +} + +/* + * Decode a byte string from a code page into unicode object with an error + * handler. + * + * Returns consumed size if succeed, or raise a WindowsError or + * UnicodeDecodeError exception and returns -1 on error. + */ +static int +decode_code_page_errors(UINT code_page, + PyUnicodeObject **v, + const char *in, /* MBCS string */ + int size, /* sizeof MBCS string */ + const char *errors) +{ + const char *startin = in; + const char *endin = in + size; + DWORD flags; + /* FIXME: Are 10 characters enough? */ + wchar_t buffer[10], *startout, *out; + int insize; + int outsize; + const char *reason = "No mapping for the Unicode character exists " + "in the target code page."; + Py_ssize_t startinpos; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + PyObject *encoding_obj = NULL; + char *encoding; + int ret = -1; + + assert(size >= 0); + + encoding = code_page_name(code_page, &encoding_obj); + if (encoding == NULL) + return -1; + + if (errors == NULL || strcmp(errors, "strict") == 0) { + /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a + UnicodeDecodeError. Ideally, we should get reason from + FormatMessage. This is the Windows 2000 English version of the + message. 
*/ + make_decode_exception(&exc, encoding, in, size, 0, 0, reason); if (exc != NULL) { PyCodec_StrictErrors(exc); - Py_DECREF(exc); - } - } else { - PyErr_SetFromWindowsErrWithFilename(0, NULL); - } - return -1; + Py_CLEAR(exc); + } + goto error; + } + + if ((strcmp(errors, "replace") == 0 && winver.dwMajorVersion >= 6) + || (strcmp(errors, "ignore") == 0 && winver.dwMajorVersion < 6) + || code_page == 65000) + { + /* flags=0 replaces undecodable bytes on Windows Vista and later, + but ignores undecodable bytes on Windows older than Vista. */ + flags = 0; + } + else + flags = MB_ERR_INVALID_CHARS; + + if (*v == NULL) { + /* Create unicode object */ + *v = _PyUnicode_New(size); + if (*v == NULL) + goto error; + startout = PyUnicode_AS_UNICODE(*v); + } + else { + /* Extend unicode object */ + Py_ssize_t n = PyUnicode_GET_SIZE(*v); + /* FIXME: is +size enough? */ + /* FIXME: check for n+size integer overflow */ + if (_PyUnicode_Resize(v, n+ size) < 0) + goto error; + startout = PyUnicode_AS_UNICODE(*v) + n; + } + + out = startout; + while (in < endin) + { + /* Do the conversion */ + /* FIXME: support multibyte encodings */ + insize = 1; + outsize = MultiByteToWideChar(code_page, flags, + in, insize, + buffer, sizeof(buffer)/sizeof(buffer[0])); + if (outsize <= 0) { + Py_ssize_t endinpos; + Py_ssize_t outpos; + + if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { + PyErr_SetFromWindowsErrWithFilename(0, NULL); + goto error; + } + + startinpos = in-startin; + endinpos = startinpos + 1; + outpos = out - PyUnicode_AS_UNICODE(*v); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + encoding, reason, + &startin, &endin, &startinpos, &endinpos, &exc, &in, + v, &outpos, &out)) + { + goto error; + } + } + else { + in += insize; + memcpy(out, buffer, outsize * sizeof(wchar_t)); + out += outsize; + } + } + + /* write a NUL character at the end */ + *out = 0; + + /* Extend unicode object */ + outsize = out - startout; + if (_PyUnicode_Resize(v, outsize) < 0) + 
goto error; + ret = (int)(in - startin); /* bytes consumed, as documented above */ + +error: + Py_XDECREF(encoding_obj); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + return ret; +} + +/* + * Decode a byte string from a Windows code page into unicode object. If + * 'final' is set, converts trailing lead-byte too. + * + * Returns consumed size if succeed, or raise a WindowsError or + * UnicodeDecodeError exception and returns -1 on error. + */ +static int +decode_code_page(UINT code_page, + PyUnicodeObject **v, + const char *s, /* MBCS string */ + int size, /* sizeof MBCS string */ + int final, + const char *errors) +{ + int done; + + /* Skip trailing lead-byte unless 'final' is set; may leave nothing to decode */ + if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1, code_page)) + --size; + + if (size == 0) { + if (*v == NULL) { + *v = _PyUnicode_New(0); + if (*v == NULL) + return -1; + } + return 0; + } + + done = decode_code_page_strict(code_page, v, s, size); + if (done == -2) + done = decode_code_page_errors(code_page, v, s, size, errors); + return done; +} + +static PyObject * +decode_code_page_stateful(int code_page, + const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed) +{ + PyUnicodeObject *v = NULL; + int done; + + if (code_page < 0) { + PyErr_SetString(PyExc_ValueError, "invalid code page number"); + return NULL; + } + + if (consumed) + *consumed = 0; + +#ifdef NEED_RETRY + retry: + if (size > INT_MAX) + done = decode_code_page(code_page, &v, s, INT_MAX, 0, errors); + else +#endif + done = decode_code_page(code_page, &v, s, (int)size, !consumed, errors); + + if (done < 0) { + Py_XDECREF(v); + return NULL; + } + + if (consumed) + *consumed += done; + +#ifdef NEED_RETRY + if (size > INT_MAX) { + s += done; + size -= done; + goto retry; + } +#endif + + return (PyObject *)v; +} + +PyObject * +PyUnicode_DecodeCodePageStateful(int code_page, + const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed) +{ + return decode_code_page_stateful(code_page, s, size, errors, consumed); } PyObject * @@ -5061,37 
+5285,7 @@ const char *errors, Py_ssize_t *consumed) { - PyUnicodeObject *v = NULL; - int done; - - if (consumed) - *consumed = 0; - -#ifdef NEED_RETRY - retry: - if (size > INT_MAX) - done = decode_mbcs(&v, s, INT_MAX, 0, errors); - else -#endif - done = decode_mbcs(&v, s, (int)size, !consumed, errors); - - if (done < 0) { - Py_XDECREF(v); - return NULL; - } - - if (consumed) - *consumed += done; - -#ifdef NEED_RETRY - if (size > INT_MAX) { - s += done; - size -= done; - goto retry; - } -#endif - - return (PyObject *)v; + return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); } PyObject * @@ -5102,102 +5296,310 @@ return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); } +static DWORD +encode_code_page_flags(UINT code_page) +{ + if (code_page == CP_UTF8) { + if (winver.dwMajorVersion >= 6) + /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista and later */ + return WC_ERR_INVALID_CHARS; + else + return WC_NO_BEST_FIT_CHARS; + } + else if (code_page == CP_UTF7) { + /* CP_UTF7 only accepts flags=0 */ + return 0; + } + else { + return WC_NO_BEST_FIT_CHARS; + } +} + /* - * Convert unicode into string object (MBCS). - * Returns 0 if succeed, -1 otherwise. + * Encode a Unicode string to a Windows code page into a byte string in strict + * mode. + * + * Returns consumed characters if succeed, returns -2 on encode error, or raise + * a WindowsError and returns -1 on other error. 
*/ static int -encode_mbcs(PyObject **repr, - const Py_UNICODE *p, /* unicode */ - int size, /* size of unicode */ - const char* errors) +encode_code_page_strict(UINT code_page, + PyObject **repr, + const Py_UNICODE *p, /* unicode */ + int size) /* size of unicode */ +{ + BOOL usedDefaultChar = FALSE; + BOOL *pusedDefaultChar = &usedDefaultChar; + int outsize; + PyObject *exc = NULL; + DWORD flags; + char *out; + + assert(size > 0); + + flags = encode_code_page_flags(code_page); + if (code_page != 65001 && code_page != 65000) + pusedDefaultChar = &usedDefaultChar; + else + pusedDefaultChar = NULL; + + /* First get the size of the result */ + + outsize = WideCharToMultiByte(code_page, flags, p, size, NULL, 0, + NULL, pusedDefaultChar); + if (outsize == 0) + goto error; + /* If we used a default char, then we failed! */ + if (pusedDefaultChar && *pusedDefaultChar) + return -2; + + if (*repr == NULL) { + /* Create string object */ + *repr = PyBytes_FromStringAndSize(NULL, outsize); + if (*repr == NULL) + return -1; + out = PyBytes_AS_STRING(*repr); + } + else { + /* Extend string object */ + Py_ssize_t n = PyBytes_Size(*repr); + if (_PyBytes_Resize(repr, n + outsize) < 0) + return -1; + out = PyBytes_AS_STRING(*repr) + n; + } + + /* Do the conversion */ + outsize = WideCharToMultiByte(code_page, flags, p, size, + out, outsize, + NULL, pusedDefaultChar); + if (outsize == 0) + goto error; + if (pusedDefaultChar && *pusedDefaultChar) + return -2; + return 0; + +error: + if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) + return -2; + PyErr_SetFromWindowsErrWithFilename(0, NULL); + return -1; +} + +/* + * Encode a Unicode string to a Windows code page into a byte string using a + * error handler. + * + * Returns consumed characters if succeed, or raise a WindowsError and returns + * -1 on other error. 
+ */ +static int +encode_code_page_errors(UINT code_page, + PyObject **repr, + const Py_UNICODE *in, /* unicode */ + int insize, /* size of unicode */ + const char* errors) { BOOL usedDefaultChar = FALSE; BOOL *pusedDefaultChar; - int mbcssize; - Py_ssize_t n; + int outsize; + DWORD flags; + /* FIXME: is it enough? */ + char buffer[10]; + const Py_UNICODE *startin = in, *endin = in + insize; + char *out, *startout; + int charsize; + PyObject *errorHandler = NULL; PyObject *exc = NULL; - DWORD flags; - - assert(size >= 0); - - /* check and handle 'errors' arg */ - if (errors==NULL || strcmp(errors, "strict")==0) { - flags = WC_NO_BEST_FIT_CHARS; + /* Ideally, we should get reason from FormatMessage. This is the Windows + 2000 English version of the message. */ + const char *reason = "invalid character"; + PyObject *encoding_obj = NULL; + char *encoding; + int ret = -1; + int err; + + assert(insize > 0); + + encoding = code_page_name(code_page, &encoding_obj); + if (encoding == NULL) + return -1; + + if (errors == NULL || strcmp(errors, "strict") == 0) { + /* The last error was ERROR_NO_UNICODE_TRANSLATION, + then we raise a UnicodeEncodeError. */ + make_encode_exception(&exc, encoding, in, insize, 0, 0, reason); + if (exc != NULL) { + PyCodec_StrictErrors(exc); + Py_DECREF(exc); + } + Py_XDECREF(encoding_obj); + return ret; + } + + /* handle 'errors' arg */ + flags = encode_code_page_flags(code_page); + if ((strcmp(errors, "replace") == 0 && winver.dwMajorVersion >= 6) + || (strcmp(errors, "ignore") == 0 && winver.dwMajorVersion < 6)) + { + /* flags=0 replaces unencodable characters on Windows Vista and later, + but ignores unencodable characters on Windows older than Vista. 
*/ + flags = 0; + } + if (code_page != 65001 && code_page != 65000) pusedDefaultChar = &usedDefaultChar; - } else if (strcmp(errors, "replace")==0) { - flags = 0; - pusedDefaultChar = NULL; - } else { - PyErr_Format(PyExc_ValueError, - "mbcs encoding does not support errors='%s'", - errors); - return -1; - } - - /* First get the size of the result */ - if (size > 0) { - mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0, - NULL, pusedDefaultChar); - if (mbcssize == 0) { - PyErr_SetFromWindowsErrWithFilename(0, NULL); - return -1; - } - /* If we used a default char, then we failed! */ - if (pusedDefaultChar && *pusedDefaultChar) - goto mbcs_encode_error; - } else { - mbcssize = 0; - } + else + pusedDefaultChar = NULL; + + /* FIXME: is '*10' enough? */ + outsize = insize * 10; if (*repr == NULL) { /* Create string object */ - *repr = PyBytes_FromStringAndSize(NULL, mbcssize); + *repr = PyBytes_FromStringAndSize(NULL, outsize); if (*repr == NULL) - return -1; - n = 0; + goto error; + startout = PyBytes_AS_STRING(*repr); } else { /* Extend string object */ - n = PyBytes_Size(*repr); - if (_PyBytes_Resize(repr, n + mbcssize) < 0) - return -1; + Py_ssize_t n = PyBytes_Size(*repr); + /* FIXME: n is wrong */ + if (_PyBytes_Resize(repr, n + outsize) < 0) + goto error; + startout = PyBytes_AS_STRING(*repr) + n; } /* Do the conversion */ - if (size > 0) { - char *s = PyBytes_AS_STRING(*repr) + n; - if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize, - NULL, pusedDefaultChar)) { - PyErr_SetFromWindowsErrWithFilename(0, NULL); - return -1; - } - if (pusedDefaultChar && *pusedDefaultChar) - goto mbcs_encode_error; - } - return 0; - -mbcs_encode_error: - raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character"); + out = startout; + while (in < endin) + { + /* FIXME: support surrogates */ + charsize = 1; + outsize = WideCharToMultiByte(code_page, flags, in, charsize, buffer, sizeof(buffer), + NULL, pusedDefaultChar); + if (outsize <= 0) { + 
if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { + PyErr_SetFromWindowsErrWithFilename(0, NULL); + goto error; + } + else + err = 1; + } + else + err = (pusedDefaultChar && *pusedDefaultChar); + if (err) { + Py_ssize_t startpos, newpos; + PyObject *rep; + + startpos = in - startin; + rep = unicode_encode_call_errorhandler( + errors, &errorHandler, encoding, reason, + startin, insize, &exc, + startpos, startpos+charsize, &newpos); + if (rep == NULL) + goto error; + + in = startin + newpos; + + if (PyBytes_Check(rep)) { + outsize = PyBytes_GET_SIZE(rep); + memcpy(out, PyBytes_AS_STRING(rep), outsize); + out += outsize; + } + else { + static int recursive = 0; + PyObject *bytes; + + if (recursive) { + PyErr_SetString(PyExc_RuntimeError, "recursive call"); + goto error; + } + recursive = 1; + bytes = encode_code_page(code_page, + PyUnicode_AS_UNICODE(rep), + PyUnicode_GET_SIZE(rep), + errors); + recursive = 0; + if (bytes == NULL) + goto error; + + outsize = PyBytes_GET_SIZE(bytes); + memcpy(out, PyBytes_AS_STRING(bytes), outsize); + out += outsize; + Py_DECREF(bytes); + } + } + else { + in += charsize; + memcpy(out, buffer, outsize); + out += outsize; + } + } + /* write a NUL byte */ + *out = 0; + outsize = out - startout; + if (_PyBytes_Resize(repr, outsize) < 0) + goto error; + ret = 0; + +error: + Py_XDECREF(encoding_obj); + Py_XDECREF(errorHandler); Py_XDECREF(exc); - return -1; -} - -PyObject * -PyUnicode_EncodeMBCS(const Py_UNICODE *p, - Py_ssize_t size, - const char *errors) + return ret; +} + +/* + * Encode a Unicode string to a Windows code page into a byte string. + * + * Returns consumed characters if succeed, or raise a WindowsError and returns + * -1 on other error. 
+ */ +static int +encode_code_page_chunk(UINT code_page, + PyObject **repr, + const Py_UNICODE *p, /* unicode */ + int size, /* size of unicode */ + const char* errors) +{ + int done; + + if (size == 0) { + if (*repr == NULL) { + *repr = PyBytes_FromStringAndSize(NULL, 0); + if (*repr == NULL) + return -1; + } + return 0; + } + + done = encode_code_page_strict(code_page, repr, p, size); + if (done == -2) + done = encode_code_page_errors(code_page, repr, p, size, errors); + return done; +} + +static PyObject * +encode_code_page(int code_page, + const Py_UNICODE *p, + Py_ssize_t size, + const char *errors) { PyObject *repr = NULL; int ret; + if (code_page < 0) { + PyErr_SetString(PyExc_ValueError, "invalid code page number"); + return NULL; + } + #ifdef NEED_RETRY retry: if (size > INT_MAX) - ret = encode_mbcs(&repr, p, INT_MAX, errors); + ret = encode_code_page_chunk(code_page, &repr, p, INT_MAX, errors); else #endif - ret = encode_mbcs(&repr, p, (int)size, errors); + ret = encode_code_page_chunk(code_page, &repr, p, (int)size, errors); if (ret < 0) { Py_XDECREF(repr); @@ -5216,6 +5618,23 @@ } PyObject * +PyUnicode_EncodeMBCS(const Py_UNICODE *p, + Py_ssize_t size, + const char *errors) +{ + return encode_code_page(CP_ACP, p, size, errors); +} + +PyObject * +PyUnicode_EncodeCodePage(int code_page, + const Py_UNICODE *p, + Py_ssize_t size, + const char *errors) +{ + return encode_code_page(code_page, p, size, errors); +} + +PyObject * PyUnicode_AsMBCSString(PyObject *unicode) { if (!PyUnicode_Check(unicode)) { @@ -10047,7 +10466,7 @@ /* Initialize the Unicode implementation */ -void _PyUnicode_Init(void) +int _PyUnicode_Init(void) { int i; @@ -10067,13 +10486,13 @@ free_list = NULL; numfree = 0; unicode_empty = _PyUnicode_New(0); - if (!unicode_empty) - return; + if (unicode_empty == NULL) + return -1; for (i = 0; i < 256; i++) unicode_latin1[i] = NULL; if (PyType_Ready(&PyUnicode_Type) < 0) - Py_FatalError("Can't initialize 'unicode'"); + return -1; /* initialize 
the linebreak bloom filter */ bloom_linebreak = make_bloom_mask( @@ -10081,6 +10500,15 @@ ); PyType_Ready(&EncodingMapType); + +#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) + winver.dwOSVersionInfoSize = sizeof(winver); + if (!GetVersionEx((OSVERSIONINFO*)&winver)) { + PyErr_SetFromWindowsErr(0); + return -1; + } +#endif + return 0; } /* Finalize the Unicode implementation */ diff -r d07d0afea9a7 Python/pythonrun.c --- a/Python/pythonrun.c Mon Jun 13 16:19:06 2011 +0200 +++ b/Python/pythonrun.c Thu Jun 16 02:24:38 2011 +0200 @@ -67,7 +67,7 @@ static void call_py_exitfuncs(void); static void wait_for_thread_shutdown(void); static void call_ll_exitfuncs(void); -extern void _PyUnicode_Init(void); +extern int _PyUnicode_Init(void); extern void _PyUnicode_Fini(void); extern int _PyLong_Init(void); extern void PyLong_Fini(void); @@ -253,7 +253,8 @@ Py_FatalError("Py_Initialize: can't make modules_reloading dictionary"); /* Init Unicode implementation; relies on the codec registry */ - _PyUnicode_Init(); + if (_PyUnicode_Init() < 0) + Py_FatalError("Py_Initialize: can't initialize unicode"); bimod = _PyBuiltin_Init(); if (bimod == NULL)