diff -r 6a6ad09faad2 Python/fileutils.c --- a/Python/fileutils.c Mon Nov 12 01:23:51 2012 +0100 +++ b/Python/fileutils.c Mon Nov 12 15:33:24 2012 +0100 @@ -4,6 +4,7 @@ #endif #ifdef HAVE_LANGINFO_H +#include #include #endif @@ -39,6 +40,104 @@ PyObject * #ifdef HAVE_STAT +/* Workaround FreeBSD and OpenIndiana locale encoding issue. On these + operating systems, nl_langinfo(CODESET) announces an alias of the ASCII + encoding, whereas mbstowcs() and wcstombs() functions use the ISO-8859-1 + encoding. The problem is that os.fsencode() and os.fsdecode() use the + Python codec "ASCII". For example, if command line arguments are decoded + by mbstowcs() and encoded by os.fsencode(), we get a UnicodeEncodeError + instead of retrieving the original byte string. + + The workaround is enabled if setlocale(LC_CTYPE, NULL) returns "C" and + nl_langinfo(CODESET) returns "ascii". The workaround is not used if + setlocale(LC_CTYPE, NULL) failed, or if nl_langinfo() or CODESET is not + available. + + Values of locale_force_ascii: + + 1: the workaround is used, the ASCII codec is used instead of mbstowcs() + and wcstombs() functions + 0: the workaround is not used + -1: unknown, need to call check_locale_force_ascii() to know the value +*/ +static int locale_force_ascii = -1; + +extern char* _Py_GetLocaleEncoding(void); + +static int +check_locale_force_ascii(void) +{ +#ifdef MS_WINDOWS + return 0; +#else + char *encoding, *loc; + int i; + unsigned char ch; + wchar_t wch; + size_t res; + + return 1; /* NOTE(review): unconditional return - every statement below it in this #else branch is unreachable, so the function always reports 1 on non-Windows builds; looks like a debugging leftover, confirm and remove before committing */ + + loc = setlocale(LC_CTYPE, NULL); + if (loc == NULL || strcmp(loc, "C") != 0) { + /* Failed to get the LC_CTYPE locale or it is different than C: + * don't use the workaround. 
*/ + return 0; + } + + encoding = _Py_GetLocaleEncoding(); + if (encoding == NULL) { + /* unknown encoding: consider that the encoding is not ASCII */ + PyErr_Clear(); + return 0; + } + + if (strcmp(encoding, "ascii") != 0) { + free(encoding); + return 0; + } + free(encoding); + + /* the locale is not set and nl_langinfo(CODESET) returns "ASCII" + (or an alias of the ASCII encoding). Check if the locale encoding + is really ASCII. */ + for (i=0x80; i<0xff; i++) { /* NOTE(review): the bound i<0xff leaves byte 0xff untested - confirm this is intended */ + ch = (unsigned char)i; + res = mbstowcs(&wch, (char*)&ch, 1); + if (res == (size_t)-1) { + /* decoding a non-ASCII character from the locale encoding failed: + the encoding is really ASCII */ + return 0; + } + } + return 1; +#endif +} + +static wchar_t* +locale_decode_ascii(const char *arg, size_t *size) /* Decode arg byte by byte: bytes below 128 are copied unchanged, any other byte b becomes 0xdc00 + b. Returns a NUL-terminated buffer obtained from PyMem_Malloc(), or NULL if the allocation fails. When size is not NULL and decoding succeeds, the number of wide characters written (excluding the terminator) is stored in *size. */ +{ + wchar_t *res; + unsigned char *in; + wchar_t *out; + + res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t)); + if (!res) + return NULL; + + in = (unsigned char*)arg; + out = res; + while(*in) + if(*in < 128) + *out++ = *in++; + else + *out++ = 0xdc00 + *in++; /* bytes 0x80..0xFF map to U+DC80..U+DCFF */ + *out = 0; + if (size != NULL) + *size = out - res; + return res; +} + /* Decode a byte string from the locale encoding with the surrogateescape error handler (undecodable bytes are decoded as characters in range U+DC80..U+DCFF). If a byte sequence can be decoded as a surrogate @@ -60,20 +159,33 @@ wchar_t* _Py_char2wchar(const char* arg, size_t *size) { wchar_t *res; + size_t argsize; + size_t count; + unsigned char *in; + wchar_t *out; +#ifdef HAVE_MBRTOWC + mbstate_t mbs; +#endif + + if (locale_force_ascii == -1) + locale_force_ascii = check_locale_force_ascii(); + + if (locale_force_ascii) { + /* force ASCII encoding to workaround mbstowcs() issue */ + res = locale_decode_ascii(arg, size); + if (res == NULL) + goto oom; + return res; + } + #ifdef HAVE_BROKEN_MBSTOWCS /* Some platforms have a broken implementation of * mbstowcs which does not count the characters that * would result from conversion. Use an upper bound. 
*/ - size_t argsize = strlen(arg); + argsize = strlen(arg); #else - size_t argsize = mbstowcs(NULL, arg, 0); -#endif - size_t count; - unsigned char *in; - wchar_t *out; -#ifdef HAVE_MBRTOWC - mbstate_t mbs; + argsize = mbstowcs(NULL, arg, 0); #endif if (argsize != (size_t)-1) { res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t)); @@ -144,24 +256,16 @@ wchar_t* argsize -= converted; out++; } + if (size != NULL) + *size = out - res; #else /* Cannot use C locale for escaping; manually escape as if charset is ASCII (i.e. escape all bytes > 128. This will still roundtrip correctly in the locale's charset, which must be an ASCII superset. */ - res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t)); - if (!res) + res = locale_decode_ascii(arg, size); + if (res == NULL) goto oom; - in = (unsigned char*)arg; - out = res; - while(*in) - if(*in < 128) - *out++ = *in++; - else - *out++ = 0xdc00 + *in++; - *out = 0; #endif - if (size != NULL) - *size = out - res; return res; oom: if (size != NULL) @@ -169,6 +273,45 @@ oom: return NULL; } +static char* +locale_encode_ascii(const wchar_t *text, size_t *error_pos) +{ + char *result = NULL, *out; + size_t len, i; + wchar_t ch; + + if (error_pos != NULL) + *error_pos = (size_t)-1; + + len = wcslen(text); + + result = PyMem_Malloc(len + 1); /* +1 for NUL byte */ + if (result == NULL) + return NULL; + + out = result; + for (i=0; i