diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -706,6 +706,32 @@ 3.x, but need to be aware that their use throughout the interpreter whenever coercion to Unicode is needed. +Locale Encoding +""""""""""""""" + +The locale encoding should be used to decode text from the operating system. + +.. c:function:: PyObject* PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, int surrogateescape) + + Decode a string from the locale encoding using the ``'surrogateescape'`` + error handler (:pep:`383`) if *surrogateescape* is 1, or ``'strict'`` + otherwise. *str* must not contain any null character. + + .. seealso:: + + :c:func:`PyUnicode_DecodeFSDefaultAndSize` + + .. versionadded:: 3.3 + + +.. c:function:: PyObject* PyUnicode_DecodeLocale(const char *str, int surrogateescape) + + Similar to :c:func:`PyUnicode_DecodeLocaleAndSize`, but compute the string + length using :c:func:`strlen`. + + .. versionadded:: 3.3 + + File System Encoding """""""""""""""""""" @@ -746,6 +772,10 @@ used, passing :c:func:`PyUnicode_FSDecod If :c:data:`Py_FileSystemDefaultEncoding` is not set, fall back to the locale encoding. + .. seealso:: + + :c:func:`PyUnicode_DecodeLocaleAndSize` + .. versionchanged:: 3.2 Use ``'strict'`` error handler on Windows. diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -1587,6 +1587,20 @@ PyAPI_FUNC(PyObject*) _PyUnicode_Transfo ); #endif +/* --- Locale encoding --------------------------------------------------- */ + +/* Decode a byte string to a Unicode object from the current locale encoding. + The input string cannot contain embedded null character. */ + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize( + const char *str, + Py_ssize_t len, + int surrogateescape); + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale( + const char *str, + int surrogateescape); + /* --- File system encoding ---------------------------------------------- */ /* ParseTuple converter: encode str objects to bytes using diff --git a/Modules/_localemodule.c b/Modules/_localemodule.c --- a/Modules/_localemodule.c +++ b/Modules/_localemodule.c @@ -42,43 +42,6 @@ PyDoc_STRVAR(locale__doc__, "Support for static PyObject *Error; -/* Convert a char* to a Unicode object according to the current locale */ -static PyObject* -str2uni(const char* s) -{ -#ifdef HAVE_BROKEN_MBSTOWCS - size_t needed = strlen(s); -#else - size_t needed = mbstowcs(NULL, s, 0); -#endif - size_t res1; - wchar_t smallbuf[30]; - wchar_t *dest; - PyObject *res2; - if (needed == (size_t)-1) { - PyErr_SetString(PyExc_ValueError, "Cannot convert byte to string"); - return NULL; - } - if (needed*sizeof(wchar_t) < sizeof(smallbuf)) - dest = smallbuf; - else { - dest = PyMem_Malloc((needed+1)*sizeof(wchar_t)); - if (!dest) - return PyErr_NoMemory(); - } - /* This shouldn't fail now */ - res1 = mbstowcs(dest, s, needed+1); -#ifdef HAVE_BROKEN_MBSTOWCS - assert(res1 != (size_t)-1); -#else - assert(res1 == needed); -#endif - res2 = PyUnicode_FromWideChar(dest, res1); - if (dest != smallbuf) - PyMem_Free(dest); - return res2; -} - /* support functions for formatting floating point numbers */ PyDoc_STRVAR(setlocale__doc__, @@ -149,7 +112,7 @@ PyLocale_setlocale(PyObject* self, PyObj PyErr_SetString(Error, "unsupported locale setting"); return NULL; } - result_object = str2uni(result); + result_object = PyUnicode_DecodeLocale(result, 0); if (!result_object) return NULL; } else { @@ -159,7 +122,7 @@ PyLocale_setlocale(PyObject* self, PyObj PyErr_SetString(Error, "locale query failed"); return NULL; } - result_object = str2uni(result); + result_object = PyUnicode_DecodeLocale(result, 0); } return result_object; } @@ -185,7 +148,7 @@ PyLocale_localeconv(PyObject* self) involved herein */ #define RESULT_STRING(s)\ - x = str2uni(l->s); \ + x = PyUnicode_DecodeLocale(l->s, 0); \ if (!x) goto failed;\ PyDict_SetItemString(result, #s, x);\ Py_XDECREF(x) @@ -476,7 +439,7 @@ PyLocale_nl_langinfo(PyObject* self, PyO instead of an empty string for nl_langinfo(ERA). */ const char *result = nl_langinfo(item); result = result != NULL ? result : ""; - return str2uni(result); + return PyUnicode_DecodeLocale(result, 0); } PyErr_SetString(PyExc_ValueError, "unsupported langinfo constant"); return NULL; @@ -495,7 +458,7 @@ PyIntl_gettext(PyObject* self, PyObject char *in; if (!PyArg_ParseTuple(args, "s", &in)) return 0; - return str2uni(gettext(in)); + return PyUnicode_DecodeLocale(gettext(in), 0); } PyDoc_STRVAR(dgettext__doc__, @@ -508,7 +471,7 @@ PyIntl_dgettext(PyObject* self, PyObject char *domain, *in; if (!PyArg_ParseTuple(args, "zs", &domain, &in)) return 0; - return str2uni(dgettext(domain, in)); + return PyUnicode_DecodeLocale(dgettext(domain, in), 0); } PyDoc_STRVAR(dcgettext__doc__, @@ -522,7 +485,7 @@ PyIntl_dcgettext(PyObject *self, PyObjec int category; if (!PyArg_ParseTuple(args, "zsi", &domain, &msgid, &category)) return 0; - return str2uni(dcgettext(domain,msgid,category)); + return PyUnicode_DecodeLocale(dcgettext(domain,msgid,category), 0); } PyDoc_STRVAR(textdomain__doc__, @@ -540,7 +503,7 @@ PyIntl_textdomain(PyObject* self, PyObje PyErr_SetFromErrno(PyExc_OSError); return NULL; } - return str2uni(domain); + return PyUnicode_DecodeLocale(domain, 0); } PyDoc_STRVAR(bindtextdomain__doc__, @@ -572,7 +535,7 @@ PyIntl_bindtextdomain(PyObject* self,PyO PyErr_SetFromErrno(PyExc_OSError); return NULL; } - result = str2uni(current_dirname); + result = PyUnicode_DecodeLocale(current_dirname, 0); Py_XDECREF(dirname_bytes); return result; } @@ -590,7 +553,7 @@ PyIntl_bind_textdomain_codeset(PyObject* return NULL; codeset = bind_textdomain_codeset(domain, codeset); if (codeset) - return str2uni(codeset); + return PyUnicode_DecodeLocale(codeset, 0); Py_RETURN_NONE; } #endif diff --git a/Modules/main.c b/Modules/main.c --- a/Modules/main.c +++ b/Modules/main.c @@ -495,16 +495,13 @@ Py_Main(int argc, wchar_t **argv) /* Use utf-8 on Mac OS X */ unicode = PyUnicode_FromString(p); #else - wchar_t *wchar; - size_t len; - wchar = _Py_char2wchar(p, &len); - if (wchar == NULL) + unicode = PyUnicode_DecodeLocale(p, 1); +#endif + if (unicode == NULL) { + /* ignore errors */ + PyErr_Clear(); continue; - unicode = PyUnicode_FromWideChar(wchar, len); - PyMem_Free(wchar); -#endif - if (unicode == NULL) - continue; + } PySys_AddWarnOptionUnicode(unicode); Py_DECREF(unicode); } diff --git a/Modules/python.c b/Modules/python.c --- a/Modules/python.c +++ b/Modules/python.c @@ -50,8 +50,12 @@ main(int argc, char **argv) #else argv_copy[i] = _Py_char2wchar(argv[i], NULL); #endif - if (!argv_copy[i]) + if (!argv_copy[i]) { + fprintf(stderr, "Fatal Python error: " + "unable to decode the command line argument #%i\n", + i + 1); return 1; + } argv_copy2[i] = argv_copy[i]; } setlocale(LC_ALL, oldloc); diff --git a/Modules/timemodule.c b/Modules/timemodule.c --- a/Modules/timemodule.c +++ b/Modules/timemodule.c @@ -530,7 +530,7 @@ time_strftime(PyObject *self, PyObject * #ifdef HAVE_WCSFTIME ret = PyUnicode_FromWideChar(outbuf, buflen); #else - ret = PyUnicode_DecodeFSDefaultAndSize(outbuf, buflen); + ret = PyUnicode_DecodeLocaleAndSize(outbuf, buflen, 1); #endif PyMem_Free(outbuf); break; @@ -762,8 +762,8 @@ PyInit_timezone(PyObject *m) { #endif /* PYOS_OS2 */ #endif PyModule_AddIntConstant(m, "daylight", daylight); - otz0 = PyUnicode_DecodeFSDefaultAndSize(tzname[0], strlen(tzname[0])); - otz1 = PyUnicode_DecodeFSDefaultAndSize(tzname[1], strlen(tzname[1])); + otz0 = PyUnicode_DecodeLocale(tzname[0], 1); + otz1 = PyUnicode_DecodeLocale(tzname[1], 1); PyModule_AddObject(m, "tzname", Py_BuildValue("(NN)", otz0, otz1)); #else /* !HAVE_TZNAME || __GLIBC__ || __CYGWIN__*/ #ifdef HAVE_STRUCT_TM_TM_ZONE diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3242,6 +3242,85 @@ PyUnicode_AsEncodedUnicode(PyObject *uni } PyObject* +PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, + int surrogateescape) +{ + wchar_t smallbuf[256]; + size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf); + wchar_t *wstr; + size_t wlen, wlen2; + PyObject *unicode; + + if (str[len] != '\0' || len != strlen(str)) { + PyErr_SetString(PyExc_TypeError, "embedded null character"); + return NULL; + } + + if (surrogateescape) + { + wstr = _Py_char2wchar(str, &wlen); + if (wstr == NULL) { + if (wlen == (size_t)-1) + PyErr_NoMemory(); + else + PyErr_SetFromErrno(PyExc_OSError); + return NULL; + } + + unicode = PyUnicode_FromWideChar(wstr, wlen); + PyMem_Free(wstr); + } + else { +#ifdef HAVE_BROKEN_MBSTOWCS + wlen = len; +#else + wlen = mbstowcs(NULL, str, 0); +#endif + if (wlen == (size_t)-1) { + PyErr_SetFromErrno(PyExc_OSError); + return NULL; + } + if (wlen+1 <= smallbuf_len) { + wstr = smallbuf; + } + else { + if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) + return PyErr_NoMemory(); + + wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t)); + if (!wstr) + return PyErr_NoMemory(); + } + + /* This shouldn't fail now */ + wlen2 = mbstowcs(wstr, str, wlen+1); +#ifdef HAVE_BROKEN_MBSTOWCS + assert(wlen2 != (size_t)-1); +#else + assert(wlen2 == wlen); +#endif + if (wlen2 == (size_t)-1) { + if (wstr != smallbuf) + PyMem_Free(wstr); + PyErr_SetFromErrno(PyExc_OSError); + return NULL; + } + unicode = PyUnicode_FromWideChar(wstr, wlen2); + if (wstr != smallbuf) + PyMem_Free(wstr); + } + return unicode; +} + +PyObject* +PyUnicode_DecodeLocale(const char *str, int surrogateescape) +{ + Py_ssize_t size = (Py_ssize_t)strlen(str); + return PyUnicode_DecodeLocaleAndSize(str, size, surrogateescape); +} + + +PyObject* PyUnicode_DecodeFSDefault(const char *s) { Py_ssize_t size = (Py_ssize_t)strlen(s); return PyUnicode_DecodeFSDefaultAndSize(s, size); @@ -3271,23 +3350,7 @@ PyUnicode_DecodeFSDefaultAndSize(const c "surrogateescape"); } else { - /* locale encoding with surrogateescape */ - wchar_t *wchar; - PyObject *unicode; - size_t len; - - if (s[size] != '\0' || size != strlen(s)) { - PyErr_SetString(PyExc_TypeError, "embedded NUL character"); - return NULL; - } - - wchar = _Py_char2wchar(s, &len); - if (wchar == NULL) - return PyErr_NoMemory(); - - unicode = PyUnicode_FromWideChar(wchar, len); - PyMem_Free(wchar); - return unicode; + return PyUnicode_DecodeLocaleAndSize(s, size, 1); } #endif } diff --git a/Python/fileutils.c b/Python/fileutils.c --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -16,7 +16,9 @@ Return a pointer to a newly allocated wide character string (use PyMem_Free() to free the memory) and write the number of written wide characters excluding the null character into *size if size is not NULL, or - NULL on error (conversion or memory allocation error). + NULL on error (conversion or memory allocation error). If size is not NULL, + *size is set to (size_t)-1 on memory error and (size_t)-2 on conversion + error. Conversion errors should never happen, unless there is a bug in the C library. */ @@ -82,8 +84,9 @@ _Py_char2wchar(const char* arg, size_t * since we provide everything that we have - unless there is a bug in the C library, or I misunderstood how mbrtowc works. */ - fprintf(stderr, "unexpected mbrtowc result -2\n"); PyMem_Free(res); + if (size != NULL) + *size = (size_t)-2; return NULL; } if (converted == (size_t)-1) { @@ -126,7 +129,8 @@ _Py_char2wchar(const char* arg, size_t * *size = out - res; return res; oom: - fprintf(stderr, "out of memory\n"); + if (size != NULL) + *size = (size_t)-1; return NULL; } @@ -328,7 +332,7 @@ _Py_fopen(PyObject *path, const char *mo #ifdef HAVE_READLINK /* Read value of symbolic link. Encode the path to the locale encoding, decode - the result from the locale encoding. */ + the result from the locale encoding. Return -1 on error. */ int _Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz) @@ -372,7 +376,8 @@ _Py_wreadlink(const wchar_t *path, wchar #ifdef HAVE_REALPATH /* Return the canonicalized absolute pathname. Encode path to the locale - encoding, decode the result from the locale encoding. */ + encoding, decode the result from the locale encoding. + Return NULL on error. */ wchar_t* _Py_wrealpath(const wchar_t *path, @@ -410,7 +415,8 @@ _Py_wrealpath(const wchar_t *path, #endif /* Get the current directory. size is the buffer size in wide characters - including the null character. Decode the path from the locale encoding. */ + including the null character. Decode the path from the locale encoding. + Return NULL on error. */ wchar_t* _Py_wgetcwd(wchar_t *buf, size_t size)