diff -r 4d89d03690ef Objects/unicodeobject.c
--- a/Objects/unicodeobject.c	Sun Aug 31 15:48:55 2014 +0200
+++ b/Objects/unicodeobject.c	Mon Sep 01 23:22:18 2014 +0200
@@ -2792,37 +2792,139 @@ PyUnicode_FromFormat(const char *format,
 
 #ifdef HAVE_WCHAR_H
 
-/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
-   convert a Unicode object to a wide character string.
-
-   - If w is NULL: return the number of wide characters (including the null
-     character) required to convert the unicode object. Ignore size argument.
-
-   - Otherwise: return the number of wide characters (excluding the null
-     character) written into w. Write at most size wide characters (including
-     the null character). */
-static Py_ssize_t
-unicode_aswidechar(PyObject *unicode,
-                   wchar_t *w,
-                   Py_ssize_t size)
-{
-    Py_ssize_t res;
-    const wchar_t *wstr;
-
-    wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
-    if (wstr == NULL)
-        return -1;
-
-    if (w != NULL) {
-        if (size > res)
-            size = res + 1;
-        else
-            res = size;
-        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
-        return res;
+/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString().
+   Return the number of wide characters (excluding the null character) required
+   to convert the unicode object. Raise an exception and return -1 on error. */
+Py_ssize_t
+unicode_aswidechar_len(PyObject *unicode)
+{
+    assert(PyUnicode_Check(unicode));
+    assert(PyUnicode_IS_READY(unicode));
+
+#if SIZEOF_WCHAR_T == 2
+    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
+        Py_ssize_t num_surrogates;
+        const Py_UCS4 *four_bytes;
+        const Py_UCS4 *ucs4_end;
+
+        four_bytes = PyUnicode_4BYTE_DATA(unicode);
+        ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
+        num_surrogates = 0;
+        for (; four_bytes < ucs4_end; ++four_bytes) {
+            if (*four_bytes > 0xFFFF)
+                ++num_surrogates;
+        }
+
+        return _PyUnicode_LENGTH(unicode) + num_surrogates;
+    }
+#endif
+    return _PyUnicode_LENGTH(unicode);
+}
+
+/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString().
+   Convert a Unicode object to a wide character string. Return the number of
+   wide characters (excluding the null character) written into buffer. Write at
+   most buflen wide characters (including the null character); the output is
+   null-terminated only if room is left after the copied characters. Return 0
+   without touching buffer if buflen < 1. */
+Py_ssize_t
+unicode_aswidechar(PyObject *unicode, wchar_t* buffer, Py_ssize_t buflen)
+{
+    Py_ssize_t copy; /* include null character */
+    Py_ssize_t written; /* exclude null character */
+    wchar_t *w, *wchar_end;
+
+    assert(PyUnicode_Check(unicode));
+    assert(PyUnicode_IS_READY(unicode));
+    assert(buffer != NULL);
+
+    /* PyUnicode_AsWideChar() forwards the caller's size unchanged, so
+       buflen < 1 is valid input: there is no room to write anything. */
+    if (buflen < 1)
+        return 0;
+
+    /* Fast-path: characters are stored with the same width as wchar_t, so
+       the data can be copied as is with memcpy() */
+    if (PyUnicode_KIND(unicode) == SIZEOF_WCHAR_T) {
+        copy = _PyUnicode_LENGTH(unicode) + 1;
+        copy = Py_MIN(copy, buflen);
+
+        written = _PyUnicode_LENGTH(unicode);
+        written = Py_MIN(written, copy);
+
+        Py_MEMCPY(buffer, PyUnicode_DATA(unicode), copy * sizeof(wchar_t));
+        return written;
+    }
+
+    w = buffer;
+    wchar_end = w + buflen;
+
+#if SIZEOF_WCHAR_T == 2
+    /* Slow-path: need to create UTF-16 surrogate pairs */
+    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
+        const Py_UCS4 *four_bytes;
+        const Py_UCS4 *ucs4_end;
+
+        four_bytes = PyUnicode_4BYTE_DATA(unicode);
+        ucs4_end = four_bytes + _PyUnicode_GET_LENGTH(unicode);
+
+        /* Stop as soon as the buffer is full: every store below must stay
+           inside [buffer, wchar_end). */
+        for (; four_bytes < ucs4_end && w < wchar_end; ++four_bytes, ++w) {
+            if (*four_bytes > 0xFFFF) {
+                assert(*four_bytes <= MAX_UNICODE);
+
+                /* never write only one half of a surrogate pair */
+                if (wchar_end - w < 2)
+                    break;
+
+                /* encode surrogate pair in this case */
+                *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
+                *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
+            }
+            else
+                *w = *four_bytes;
+        }
+
+        written = w - buffer;
     }
     else
-        return res + 1;
+#endif
+    {
+        assert(PyUnicode_KIND(unicode) != PyUnicode_4BYTE_KIND);
+        written = _PyUnicode_GET_LENGTH(unicode);
+        written = Py_MIN(written, buflen);
+
+#if SIZEOF_WCHAR_T == 4
+        if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
+            const Py_UCS2 *two_bytes, *end;
+            two_bytes = PyUnicode_2BYTE_DATA(unicode);
+            end = two_bytes + written;
+            for (; two_bytes < end; ++two_bytes, ++w)
+                *w = *two_bytes;
+        }
+        else if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
+#endif
+        {
+            const unsigned char *one_byte, *end;
+
+            assert(PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND);
+
+            one_byte = PyUnicode_1BYTE_DATA(unicode);
+            end = one_byte + written;
+            for (; one_byte < end; ++one_byte, ++w)
+                *w = *one_byte;
+        }
+
+        assert((w - buffer) == written);
+    }
+
+    assert(w <= wchar_end);
+    if (w < wchar_end) {
+        /* null-terminate the wstr */
+        *w = L'\0';
+    }
+    return written;
 }
 
 Py_ssize_t
@@ -2834,6 +2936,18 @@ PyUnicode_AsWideChar(PyObject *unicode,
         PyErr_BadInternalCall();
         return -1;
     }
+    if (!PyUnicode_Check(unicode)) {
+        PyErr_BadArgument();
+        return -1;
+    }
+    if (PyUnicode_READY(unicode) < 0)
+        return -1;
+
+    /* Keep the behaviour of the removed helper: with w == NULL, report the
+       number of wide characters needed, including the null character. */
+    if (w == NULL)
+        return unicode_aswidechar_len(unicode) + 1;
+
     return unicode_aswidechar(unicode, w, size);
 }
 
@@ -2842,33 +2956,40 @@ PyUnicode_AsWideCharString(PyObject *uni
                            Py_ssize_t *size)
 {
     wchar_t* buffer;
-    Py_ssize_t buflen;
+    Py_ssize_t len, buflen;
 
     if (unicode == NULL) {
         PyErr_BadInternalCall();
         return NULL;
     }
-
-    buflen = unicode_aswidechar(unicode, NULL, 0);
-    if (buflen == -1)
-        return NULL;
-    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
+    if (!PyUnicode_Check(unicode)) {
+        PyErr_BadArgument();
+        return NULL;
+    }
+    if (PyUnicode_READY(unicode) < 0)
+        return NULL;
+
+    len = unicode_aswidechar_len(unicode);
+    if (len == -1)
+        return NULL;
+    if (PY_SSIZE_T_MAX / sizeof(wchar_t) - 1 < len) {
         PyErr_NoMemory();
         return NULL;
     }
 
+    buflen = len + 1;
     buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
     if (buffer == NULL) {
         PyErr_NoMemory();
         return NULL;
     }
-    buflen = unicode_aswidechar(unicode, buffer, buflen);
-    if (buflen == -1) {
+    len = unicode_aswidechar(unicode, buffer, buflen);
+    if (len == -1) {
         PyMem_FREE(buffer);
         return NULL;
     }
     if (size != NULL)
-        *size = buflen;
+        *size = len;
     return buffer;
 }
 
@@ -3809,6 +3930,7 @@ PyUnicode_AsUnicodeAndSize(PyObject *uni
         PyErr_BadArgument();
         return NULL;
     }
+
     if (_PyUnicode_WSTR(unicode) == NULL) {
         /* Non-ASCII compact unicode object */
         assert(_PyUnicode_KIND(unicode) != 0);