# Replace uses of the deprecated Py_UNICODE accessors (PyUnicode_AsUnicode(),
# PyUnicode_AsUnicodeAndSize()) with PyUnicode_AsWideCharString() in several
# modules, and make the wide character helpers in Objects/unicodeobject.c
# convert directly from the unicode object's character data.
#
# A buffer returned by PyUnicode_AsWideCharString() belongs to the caller and
# has to be released with PyMem_Free().

diff -r 4d89d03690ef Modules/_codecsmodule.c
--- a/Modules/_codecsmodule.c	Sun Aug 31 15:48:55 2014 +0200
+++ b/Modules/_codecsmodule.c	Tue Sep 02 00:38:02 2014 +0200
@@ -691,18 +691,25 @@ unicode_internal_encode(PyObject *self,
         return NULL;
 
     if (PyUnicode_Check(obj)) {
-        Py_UNICODE *u;
+        wchar_t *u;
+        PyObject *encoded;
 
         if (PyUnicode_READY(obj) < 0)
             return NULL;
 
-        u = PyUnicode_AsUnicodeAndSize(obj, &len);
+        u = PyUnicode_AsWideCharString(obj, &len);
         if (u == NULL)
             return NULL;
-        if ((size_t)len > (size_t)PY_SSIZE_T_MAX / sizeof(Py_UNICODE))
+        if ((size_t)len > (size_t)PY_SSIZE_T_MAX / sizeof(wchar_t)) {
+            PyMem_Free(u);
             return PyErr_NoMemory();
-        size = len * sizeof(Py_UNICODE);
-        return codec_tuple(PyBytes_FromStringAndSize((const char*)u, size),
+        }
+        encoded = PyBytes_FromStringAndSize((const char*)u,
+                                            len * sizeof(wchar_t));
+        PyMem_Free(u);
+        if (encoded == NULL)
+            return NULL;
+        return codec_tuple(encoded,
                            PyUnicode_GET_LENGTH(obj));
     }
     else {
diff -r 4d89d03690ef Modules/_ctypes/_ctypes.c
--- a/Modules/_ctypes/_ctypes.c	Sun Aug 31 15:48:55 2014 +0200
+++ b/Modules/_ctypes/_ctypes.c	Tue Sep 02 00:38:02 2014 +0200
@@ -1169,9 +1169,11 @@ WCharArray_get_value(CDataObject *self)
 static int
 WCharArray_set_value(CDataObject *self, PyObject *value)
 {
-    Py_ssize_t result = 0;
-    Py_UNICODE *wstr;
+    Py_ssize_t result = -1;
+    wchar_t *wstr;
     Py_ssize_t len;
+    Py_ssize_t buflen;
+    Py_ssize_t copy;
 
     if (value == NULL) {
         PyErr_SetString(PyExc_TypeError,
@@ -1183,23 +1185,29 @@ WCharArray_set_value(CDataObject *self,
                      "unicode string expected instead of %s instance",
                      Py_TYPE(value)->tp_name);
         return -1;
-    } else
-        Py_INCREF(value);
-
-    wstr = PyUnicode_AsUnicodeAndSize(value, &len);
+    }
+    Py_INCREF(value);
+
+    if (PyUnicode_READY(value) < 0)
+        goto done;
+
+    wstr = PyUnicode_AsWideCharString(value, &len);
     if (wstr == NULL)
-        return -1;
-    if ((unsigned)len > self->b_size/sizeof(wchar_t)) {
+        goto done;
+
+    buflen = self->b_size / (Py_ssize_t)sizeof(wchar_t);
+    if (len > buflen) {
         PyErr_SetString(PyExc_ValueError,
                         "string too long");
-        result = -1;
+        PyMem_Free(wstr);
         goto done;
     }
-    result = PyUnicode_AsWideChar(value,
-                                  (wchar_t *)self->b_ptr,
-                                  self->b_size/sizeof(wchar_t));
-    if (result >= 0 && (size_t)result < self->b_size/sizeof(wchar_t))
-        ((wchar_t *)self->b_ptr)[result] = (wchar_t)0;
+
+    copy = Py_MIN(buflen, len + 1);
+    Py_MEMCPY(self->b_ptr, wstr, copy * sizeof(wchar_t));
+    PyMem_Free(wstr);
+    result = len;
+
   done:
     Py_DECREF(value);
 
@@ -3154,7 +3162,9 @@ static int
-        char *name;
+        PyObject *name = Py_None;
         PyObject *defval;
         PyObject *typ;
-        if (!PyArg_ParseTuple(item, "i|ZO", &flag, &name, &defval)) {
+        /* "O" plus an explicit check still accepts None, as "Z" did */
+        if (!PyArg_ParseTuple(item, "i|OO", &flag, &name, &defval) ||
+            !(name == Py_None || PyUnicode_Check(name))) {
             PyErr_SetString(PyExc_TypeError,
                    "paramflags must be a sequence of (int [,string [,value]]) tuples");
             return 0;
diff -r 4d89d03690ef Modules/_ctypes/callproc.c
--- a/Modules/_ctypes/callproc.c	Sun Aug 31 15:48:55 2014 +0200
+++ b/Modules/_ctypes/callproc.c	Tue Sep 02 00:38:02 2014 +0200
@@ -1243,11 +1243,12 @@ static PyObject *load_library(PyObject *
     if (!PyArg_ParseTuple(args, "O|O:LoadLibrary", &nameobj, &ignored))
         return NULL;
 
-    name = PyUnicode_AsUnicode(nameobj);
+    name = PyUnicode_AsWideCharString(nameobj, NULL);
     if (!name)
         return NULL;
 
     hMod = LoadLibraryW(name);
+    PyMem_Free(name);
     if (!hMod)
         return PyErr_SetFromWindowsErr(GetLastError());
 #ifdef _WIN64
diff -r 4d89d03690ef Modules/_ctypes/cfield.c
--- a/Modules/_ctypes/cfield.c	Sun Aug 31 15:48:55 2014 +0200
+++ b/Modules/_ctypes/cfield.c	Tue Sep 02 00:38:02 2014 +0200
@@ -1235,7 +1235,7 @@ static PyObject *
 U_set(void *ptr, PyObject *value, Py_ssize_t length)
 {
-    Py_UNICODE *wstr;
-    Py_ssize_t size;
+    wchar_t *wstr;
+    Py_ssize_t size, copy;
 
     /* It's easier to calculate in characters than in bytes */
     length /= sizeof(wchar_t);
@@ -1248,7 +1248,7 @@ U_set(void *ptr, PyObject *value, Py_ssi
     } else
         Py_INCREF(value);
 
-    wstr = PyUnicode_AsUnicodeAndSize(value, &size);
+    wstr = PyUnicode_AsWideCharString(value, &size);
     if (wstr == NULL)
         return NULL;
     if (size > length) {
@@ -1256,14 +1256,16 @@ U_set(void *ptr, PyObject *value, Py_ssi
                      "string too long (%zd, maximum length %zd)",
                      size, length);
         Py_DECREF(value);
+        PyMem_Free(wstr);
         return NULL;
-    } else if (size < length-1)
+    }
+    if (size < length-1)
         /* copy terminating NUL character if there is space */
         size += 1;
 
-    if (PyUnicode_AsWideChar(value, (wchar_t *)ptr, size) == -1) {
-        return NULL;
-    }
+    copy = Py_MIN(length, size);
+    Py_MEMCPY(ptr, wstr, copy * sizeof(wchar_t));
+    PyMem_Free(wstr);
 
     return value;
 }
diff -r 4d89d03690ef Modules/_io/fileio.c
--- a/Modules/_io/fileio.c	Sun Aug 31 15:48:55 2014 +0200
+++ b/Modules/_io/fileio.c	Tue Sep 02 00:38:02 2014 +0200
@@ -202,7 +202,7 @@ fileio_init(PyObject *oself, PyObject *a
     char *mode = "r";
     char *s;
 #ifdef MS_WINDOWS
-    Py_UNICODE *widename = NULL;
+    wchar_t *widename = NULL;
 #endif
     int ret = 0;
     int rwa = 0, plus = 0;
@@ -259,7 +259,7 @@ fileio_init(PyObject *oself, PyObject *a
                 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
             return -1;
         }
-        widename = PyUnicode_AsUnicode(nameobj);
+        widename = PyUnicode_AsWideCharString(nameobj, NULL);
         if (widename == NULL)
             return -1;
     } else
@@ -455,6 +455,9 @@ fileio_init(PyObject *oself, PyObject *a
         internal_close(self);
 
  done:
+#ifdef MS_WINDOWS
+    PyMem_Free(widename);
+#endif
     Py_CLEAR(stringobj);
     return ret;
 }
diff -r 4d89d03690ef Modules/_winapi.c
--- a/Modules/_winapi.c	Sun Aug 31 15:48:55 2014 +0200
+++ b/Modules/_winapi.c	Tue Sep 02 00:38:02 2014 +0200
@@ -770,7 +770,7 @@ winapi_CreateProcess(PyObject* self, PyO
         environment = getenvironment(env_mapping);
         if (! environment)
             return NULL;
-        wenvironment = PyUnicode_AsUnicode(environment);
+        wenvironment = PyUnicode_AsWideCharString(environment, NULL);
         if (wenvironment == NULL)
         {
             Py_XDECREF(environment);
@@ -796,6 +796,7 @@ winapi_CreateProcess(PyObject* self, PyO
     Py_END_ALLOW_THREADS
 
     Py_XDECREF(environment);
+    PyMem_Free(wenvironment);
 
     if (! result)
         return PyErr_SetFromWindowsErr(GetLastError());
diff -r 4d89d03690ef Modules/arraymodule.c
--- a/Modules/arraymodule.c	Sun Aug 31 15:48:55 2014 +0200
+++ b/Modules/arraymodule.c	Tue Sep 02 00:38:02 2014 +0200
@@ -2602,31 +2602,32 @@ array_new(PyTypeObject *type, PyObject *
                 Py_DECREF(v);
             }
             else if (initial != NULL && PyUnicode_Check(initial)) {
-                Py_UNICODE *ustr;
+                wchar_t *ustr;
                 Py_ssize_t n;
 
-                ustr = PyUnicode_AsUnicode(initial);
+                ustr = PyUnicode_AsWideCharString(initial, &n);
                 if (ustr == NULL) {
                     PyErr_NoMemory();
                     Py_DECREF(a);
                     return NULL;
                 }
 
-                n = PyUnicode_GET_DATA_SIZE(initial);
                 if (n > 0) {
                     arrayobject *self = (arrayobject *)a;
                     char *item = self->ob_item;
-                    item = (char *)PyMem_Realloc(item, n);
+                    item = (char *)PyMem_Realloc(item, n * sizeof(wchar_t));
                     if (item == NULL) {
                         PyErr_NoMemory();
+                        PyMem_Free(ustr);
                         Py_DECREF(a);
                         return NULL;
                     }
                     self->ob_item = item;
-                    Py_SIZE(self) = n / sizeof(Py_UNICODE);
-                    memcpy(item, ustr, n);
+                    Py_SIZE(self) = n;
+                    memcpy(item, ustr, n * sizeof(wchar_t));
                     self->allocated = Py_SIZE(self);
                 }
+                PyMem_Free(ustr);
             }
             else if (initial != NULL && array_Check(initial)) {
                 arrayobject *self = (arrayobject *)a;
diff -r 4d89d03690ef Objects/unicodeobject.c
--- a/Objects/unicodeobject.c	Sun Aug 31 15:48:55 2014 +0200
+++ b/Objects/unicodeobject.c	Tue Sep 02 00:38:02 2014 +0200
@@ -2792,37 +2792,143 @@ PyUnicode_FromFormat(const char *format,
 
 #ifdef HAVE_WCHAR_H
 
-/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
-   convert a Unicode object to a wide character string.
-
-   - If w is NULL: return the number of wide characters (including the null
-     character) required to convert the unicode object. Ignore size argument.
-
-   - Otherwise: return the number of wide characters (excluding the null
-     character) written into w. Write at most size wide characters (including
-     the null character). */
-static Py_ssize_t
-unicode_aswidechar(PyObject *unicode,
-                   wchar_t *w,
-                   Py_ssize_t size)
-{
-    Py_ssize_t res;
-    const wchar_t *wstr;
-
-    wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
-    if (wstr == NULL)
-        return -1;
-
-    if (w != NULL) {
-        if (size > res)
-            size = res + 1;
-        else
-            res = size;
-        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
-        return res;
+/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString().
+   Return the number of wide characters (excluding the null character) required
+   to convert the unicode object, which must be ready. The current
+   implementation never fails; -1 stays reserved for errors. */
+static Py_ssize_t
+unicode_aswidechar_len(PyObject *unicode)
+{
+    assert(PyUnicode_Check(unicode));
+    assert(PyUnicode_IS_READY(unicode));
+
+#if SIZEOF_WCHAR_T == 2
+    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
+        Py_ssize_t num_surrogates;
+        const Py_UCS4 *four_bytes;
+        const Py_UCS4 *ucs4_end;
+
+        four_bytes = PyUnicode_4BYTE_DATA(unicode);
+        ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
+        num_surrogates = 0;
+        /* each non-BMP character needs two UTF-16 code units */
+        for (; four_bytes < ucs4_end; ++four_bytes) {
+            if (*four_bytes > 0xFFFF)
+                ++num_surrogates;
+        }
+
+        return _PyUnicode_LENGTH(unicode) + num_surrogates;
+    }
+#endif
+    return _PyUnicode_LENGTH(unicode);
+}
+
+/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString().
+   Convert a Unicode object to a wide character string. Return the number of
+   wide characters (excluding the null character) written into buffer. Write at
+   most buflen wide characters (including the null character); the output is
+   null-terminated only if room is left after the copied characters. */
+static Py_ssize_t
+unicode_aswidechar(PyObject *unicode, wchar_t *buffer, Py_ssize_t buflen)
+{
+    Py_ssize_t copy; /* include null character */
+    Py_ssize_t written; /* exclude null character */
+    wchar_t *w, *wchar_end;
+
+    assert(PyUnicode_Check(unicode));
+    assert(PyUnicode_IS_READY(unicode));
+    assert(buffer != NULL);
+
+    /* PyUnicode_AsWideChar() passes the caller's size through unchecked, so
+       a buffer without any room is a valid request: nothing is written */
+    if (buflen < 1)
+        return 0;
+
+    /* Fast-path: use memcpy() */
+    if (PyUnicode_KIND(unicode) == SIZEOF_WCHAR_T) {
+        copy = _PyUnicode_LENGTH(unicode) + 1;
+        copy = Py_MIN(copy, buflen);
+
+        written = _PyUnicode_LENGTH(unicode);
+        written = Py_MIN(written, copy);
+
+        /* the character kind matches wchar_t here, whatever its width */
+        Py_MEMCPY(buffer, PyUnicode_DATA(unicode),
+                  copy * sizeof(wchar_t));
+        return written;
+    }
+
+    w = buffer;
+    wchar_end = w + buflen;
+
+#if SIZEOF_WCHAR_T == 2
+    /* Slow-path: need to create UTF-16 surrogate pairs */
+    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
+        const Py_UCS4 *four_bytes;
+        const Py_UCS4 *ucs4_end;
+
+        four_bytes = PyUnicode_4BYTE_DATA(unicode);
+        ucs4_end = four_bytes + _PyUnicode_GET_LENGTH(unicode);
+
+        for (; four_bytes < ucs4_end; ++four_bytes) {
+            if (*four_bytes > 0xFFFF) {
+                assert(*four_bytes <= MAX_UNICODE);
+
+                /* never write only half of a surrogate pair */
+                if (wchar_end - w < 2)
+                    break;
+
+                *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
+                *w++ = Py_UNICODE_LOW_SURROGATE(*four_bytes);
+            }
+            else {
+                /* stop at the end of the buffer instead of overflowing */
+                if (w >= wchar_end)
+                    break;
+
+                *w++ = (wchar_t)*four_bytes;
+            }
+        }
+
+        written = w - buffer;
     }
     else
-        return res + 1;
+#endif
+    {
+        assert(PyUnicode_KIND(unicode) != PyUnicode_4BYTE_KIND);
+        written = _PyUnicode_GET_LENGTH(unicode);
+        written = Py_MIN(written, buflen);
+
+#if SIZEOF_WCHAR_T == 4
+        if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
+            const Py_UCS2 *two_bytes, *end;
+            two_bytes = PyUnicode_2BYTE_DATA(unicode);
+            end = two_bytes + written;
+            for (; two_bytes < end; ++two_bytes, ++w)
+                *w = *two_bytes;
+        }
+        else if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
+#endif
+        {
+            const unsigned char *one_byte, *end;
+
+            assert(PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND);
+
+            one_byte = PyUnicode_1BYTE_DATA(unicode);
+            end = one_byte + written;
+            for (; one_byte < end; ++one_byte, ++w)
+                *w = *one_byte;
+        }
+
+        assert((w - buffer) == written);
+    }
+
+    assert(w <= wchar_end);
+    if (w < wchar_end) {
+        /* null-terminate the wstr */
+        *w = L'\0';
+    }
+    return written;
 }
 
 Py_ssize_t
@@ -2834,6 +2940,18 @@ PyUnicode_AsWideChar(PyObject *unicode,
         PyErr_BadInternalCall();
         return -1;
     }
+    if (!PyUnicode_Check(unicode)) {
+        PyErr_BadArgument();
+        return -1;
+    }
+    if (PyUnicode_READY(unicode) < 0)
+        return -1;
+
+    /* Historical behavior: a NULL buffer asks for the required size
+       (including the null character); the size argument is ignored */
+    if (w == NULL)
+        return unicode_aswidechar_len(unicode) + 1;
+
     return unicode_aswidechar(unicode, w, size);
 }
 
@@ -2842,33 +2960,40 @@ PyUnicode_AsWideCharString(PyObject *uni
                            Py_ssize_t *size)
 {
     wchar_t* buffer;
-    Py_ssize_t buflen;
+    Py_ssize_t len, buflen;
 
     if (unicode == NULL) {
         PyErr_BadInternalCall();
         return NULL;
     }
-
-    buflen = unicode_aswidechar(unicode, NULL, 0);
-    if (buflen == -1)
-        return NULL;
-    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
+    if (!PyUnicode_Check(unicode)) {
+        PyErr_BadArgument();
+        return NULL;
+    }
+    if (PyUnicode_READY(unicode) < 0)
+        return NULL;
+
+    len = unicode_aswidechar_len(unicode);
+    if (len == -1)
+        return NULL;
+    if (PY_SSIZE_T_MAX / sizeof(wchar_t) -1 < len) {
         PyErr_NoMemory();
         return NULL;
     }
 
+    buflen = len + 1;
     buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
     if (buffer == NULL) {
         PyErr_NoMemory();
         return NULL;
     }
-    buflen = unicode_aswidechar(unicode, buffer, buflen);
-    if (buflen == -1) {
+    len = unicode_aswidechar(unicode, buffer, buflen);
+    if (len == -1) {
         PyMem_FREE(buffer);
         return NULL;
     }
     if (size != NULL)
-        *size = buflen;
+        *size = len;
     return buffer;
 }
 
@@ -3809,6 +3934,11 @@ PyUnicode_AsUnicodeAndSize(PyObject *uni
         PyErr_BadArgument();
         return NULL;
     }
+
+    if (PyErr_WarnEx(PyExc_DeprecationWarning,
+                     "PyUnicode_AsUnicode is deprecated", 0) < 0)
+        return NULL;
+
     if (_PyUnicode_WSTR(unicode) == NULL) {
         /* Non-ASCII compact unicode object */
         assert(_PyUnicode_KIND(unicode) != 0);
diff -r 4d89d03690ef Python/fileutils.c
--- a/Python/fileutils.c	Sun Aug 31 15:48:55 2014 +0200
+++ b/Python/fileutils.c	Tue Sep 02 00:38:02 2014 +0200
@@ -550,10 +550,11 @@ int
     struct _stat wstatbuf;
     wchar_t *wpath;
 
-    wpath = PyUnicode_AsUnicode(path);
+    wpath = PyUnicode_AsWideCharString(path, NULL);
     if (wpath == NULL)
         return -2;
     err = _wstat(wpath, &wstatbuf);
+    PyMem_Free(wpath);
     if (!err)
         statbuf->st_mode = wstatbuf.st_mode;
     return err;
@@ -839,15 +840,18 @@ FILE*
                      Py_TYPE(path));
         return NULL;
     }
-    wpath = PyUnicode_AsUnicode(path);
+    wpath = PyUnicode_AsWideCharString(path, NULL);
     if (wpath == NULL)
         return NULL;
 
     usize = MultiByteToWideChar(CP_ACP, 0, mode, -1, wmode, sizeof(wmode));
-    if (usize == 0)
+    if (usize == 0) {
+        PyMem_Free(wpath);
         return NULL;
+    }
 
     f = _wfopen(wpath, wmode);
+    PyMem_Free(wpath);
 #else
     PyObject *bytes;
     if (!PyUnicode_FSConverter(path, &bytes))