Convert wide characters outside the BMP to UTF-16 surrogate pairs in
PyUnicode_FromWideChar().

Summary of the changes below:

  * Objects/unicodeobject.c
      When Py_UNICODE_SIZE == 2 and SIZEOF_WCHAR_T is defined and > 2, the
      helper macro CONVERT_WCHAR_TO_SURROGATES is defined for the duration
      of PyUnicode_FromWideChar().  The function then makes a first pass
      over the input to count wchar_t values above 0xFFFF (each one needs
      one extra Py_UNICODE slot), allocates `alloc` units instead of
      `size`, and in the copy loop writes every such value as the pair
      0xD800 | (v >> 10), 0xDC00 | (v & 0x3FF) with v = value - 0x10000.
  * PC/pyconfig.h
      Defines SIZEOF_WCHAR_T as 2.
  * Modules/_testcapimodule.c
      Adds test_widechar(), which builds a string from the wide-character
      form of 0x10ABCD (one wchar_t when SIZEOF_WCHAR_T == 4, otherwise the
      pair 0xDBEA 0xDFCD) and checks that it has the same length and
      contents as the string built from the bytes f4 8a af 8d, and
      registers it in the method table.

Review notes:

  * NOTE(review): `i` is now declared at function scope, but it is only
    used under CONVERT_WCHAR_TO_SURROGATES and inside the `#else` branch
    of the copy code; builds that take the other branch without the macro
    will presumably warn about an unused variable -- confirm.
  * NOTE(review): values above 0x10FFFF are not rejected before being
    split into surrogates -- confirm whether callers can pass such input.
  * Indentation of the lines below was reconstructed; if the context does
    not match the target tree byte-for-byte, apply with `patch -l`.

Index: Objects/unicodeobject.c
===================================================================
--- Objects/unicodeobject.c (revision 69939)
+++ Objects/unicodeobject.c (working copy)
@@ -561,10 +561,19 @@
 
 #ifdef HAVE_WCHAR_H
 
+#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T > 2)
+# define CONVERT_WCHAR_TO_SURROGATES
+#endif
+
 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
                                  Py_ssize_t size)
 {
     PyUnicodeObject *unicode;
+    register Py_ssize_t i;
+    Py_ssize_t alloc;
+#ifdef CONVERT_WCHAR_TO_SURROGATES
+    const wchar_t *orig_w;
+#endif
 
     if (w == NULL) {
         if (size == 0)
@@ -577,7 +586,17 @@
         size = wcslen(w);
     }
 
-    unicode = _PyUnicode_New(size);
+    alloc = size;
+#ifdef CONVERT_WCHAR_TO_SURROGATES
+    orig_w = w;
+    for (i = size; i > 0; i--) {
+        if (*w > 0xFFFF)
+            alloc++;
+        w++;
+    }
+    w = orig_w;
+#endif
+    unicode = _PyUnicode_New(alloc);
     if (!unicode)
         return NULL;
 
@@ -587,16 +606,27 @@
 #else
     {
         register Py_UNICODE *u;
-        register Py_ssize_t i;
         u = PyUnicode_AS_UNICODE(unicode);
-        for (i = size; i > 0; i--)
+        for (i = size; i > 0; i--) {
+#ifdef CONVERT_WCHAR_TO_SURROGATES
+            if (*w > 0xFFFF) {
+                wchar_t ordinal = *w++;
+                ordinal -= 0x10000;
+                *u++ = 0xD800 | (ordinal >> 10);
+                *u++ = 0xDC00 | (ordinal & 0x3FF);
+                continue;
+            }
+#endif
             *u++ = *w++;
+        }
     }
 #endif
 
     return (PyObject *)unicode;
 }
 
+#undef CONVERT_WCHAR_TO_SURROGATES
+
 static void
 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
 {
Index: PC/pyconfig.h
===================================================================
--- PC/pyconfig.h (revision 69939)
+++ PC/pyconfig.h (working copy)
@@ -366,6 +366,9 @@
 # endif
 #endif
 
+/* The size of `wchar_t', as computed by sizeof. */
+#define SIZEOF_WCHAR_T 2
+
 #ifdef _DEBUG
 # define Py_DEBUG
 #endif
Index: Modules/_testcapimodule.c
===================================================================
--- Modules/_testcapimodule.c (revision 69939)
+++ Modules/_testcapimodule.c (working copy)
@@ -708,6 +708,48 @@
 }
 
 static PyObject *
+test_widechar(PyObject *self)
+{
+#if SIZEOF_WCHAR_T == 4
+    const wchar_t wtext[2] = {(wchar_t)0x10ABCDu};
+    size_t wtextlen = 1;
+#else
+    const wchar_t wtext[3] = {(wchar_t)0xDBEAu, (wchar_t)0xDFCDu};
+    size_t wtextlen = 2;
+#endif
+    PyObject *wide, *utf8;
+
+    wide = PyUnicode_FromWideChar(wtext, wtextlen);
+    if (wide == NULL)
+        return NULL;
+
+    utf8 = PyUnicode_FromString("\xf4\x8a\xaf\x8d");
+    if (utf8 == NULL) {
+        Py_DECREF(wide);
+        return NULL;
+    }
+
+    if (PyUnicode_GET_SIZE(wide) != PyUnicode_GET_SIZE(utf8)) {
+        Py_DECREF(wide);
+        Py_DECREF(utf8);
+        return raiseTestError("test_widechar",
+            "wide string and utf8 string have different length");
+    }
+    if (PyUnicode_Compare(wide, utf8)) {
+        Py_DECREF(wide);
+        Py_DECREF(utf8);
+        if (PyErr_Occurred())
+            return NULL;
+        return raiseTestError("test_widechar",
+            "wide string and utf8 string are different");
+    }
+
+    Py_DECREF(wide);
+    Py_DECREF(utf8);
+    Py_RETURN_NONE;
+}
+
+static PyObject *
 test_empty_argparse(PyObject *self)
 {
     /* Test that formats can begin with '|'. See issue #4720. */
@@ -1206,6 +1248,7 @@
     {"test_s_code", (PyCFunction)test_s_code, METH_NOARGS},
     {"test_u_code", (PyCFunction)test_u_code, METH_NOARGS},
     {"test_Z_code", (PyCFunction)test_Z_code, METH_NOARGS},
+    {"test_widechar", (PyCFunction)test_widechar, METH_NOARGS},
 #ifdef WITH_THREAD
     {"_test_thread_state", test_thread_state, METH_VARARGS},
     {"_pending_threadfunc", pending_threadfunc, METH_VARARGS},