Index: Objects/unicodeobject.c
===================================================================
--- Objects/unicodeobject.c	(revision 88538)
+++ Objects/unicodeobject.c	(working copy)
@@ -1452,6 +1452,61 @@
     return 1;
 }
 
+/* Normalize an encoding name for the shortcut lookup in PyUnicode_Decode().
+
+   The result written to `lower` is lower-cased, every run of non-alphanumeric
+   characters is collapsed to a single space, a space is inserted at every
+   letter/digit boundary, and leading/trailing separators are dropped, so
+   "UTF-8", "utf_8", "utf8" and " Utf 8 " all become "utf 8".
+
+   Return 1 on success, 0 if the normalized name (including the trailing
+   NUL) does not fit in lower_len bytes; `lower` is unspecified in that
+   case. */
+static int
+normalize_encoding2(const char *encoding,
+                    char *lower,
+                    size_t lower_len)
+{
+    const char *e = encoding;
+    char *l = lower;
+    char *l_end;
+    char pc = '\0';   /* last alphanumeric char copied, '\0' before any */
+    int sep = 0;      /* a separator is pending before the next char */
+
+    if (lower_len == 0)
+        return 0;
+    l_end = lower + lower_len - 1;   /* last slot is reserved for the NUL */
+
+    while (*e) {
+        char c = *e++;
+
+        if (!Py_ISALNUM(c)) {
+            /* Collapse the run into one pending separator; nothing is
+               pending while the output is still empty, which strips
+               leading junk. */
+            sep = (l != lower);
+            continue;
+        }
+        /* A letter/digit boundary separates as well ("utf8" -> "utf 8"). */
+        if (Py_ISALPHA(c) ? Py_ISDIGIT(pc) : Py_ISALPHA(pc))
+            sep = 1;
+        /* The separator is only emitted once another alphanumeric char
+           follows, so trailing junk never produces a trailing space. */
+        if (sep) {
+            if (l == l_end)
+                return 0;
+            *l++ = ' ';
+            sep = 0;
+        }
+        if (l == l_end)
+            return 0;
+        *l++ = Py_TOLOWER(c);
+        pc = c;
+    }
+    *l = '\0';
+    return 1;
+}
+
 PyObject *PyUnicode_Decode(const char *s,
                            Py_ssize_t size,
                            const char *encoding,
@@ -1459,17 +1514,17 @@
 {
     PyObject *buffer = NULL, *unicode;
     Py_buffer info;
     char lower[11];  /* Enough for any encoding shortcut */
 
     if (encoding == NULL)
         encoding = PyUnicode_GetDefaultEncoding();
 
     /* Shortcuts for common default encodings */
-    if (normalize_encoding(encoding, lower, sizeof(lower))) {
-        if (strcmp(lower, "utf-8") == 0)
+    if (normalize_encoding2(encoding, lower, sizeof(lower))) {
+        if (strcmp(lower, "utf 8") == 0)
             return PyUnicode_DecodeUTF8(s, size, errors);
-        else if ((strcmp(lower, "latin-1") == 0) ||
-                 (strcmp(lower, "iso-8859-1") == 0))
+        else if ((strcmp(lower, "latin 1") == 0) ||
+                 (strcmp(lower, "iso 8859 1") == 0))
             return PyUnicode_DecodeLatin1(s, size, errors);
 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
         else if (strcmp(lower, "mbcs") == 0)
@@ -1477,9 +1532,9 @@
 #endif
         else if (strcmp(lower, "ascii") == 0)
             return PyUnicode_DecodeASCII(s, size, errors);
-        else if (strcmp(lower, "utf-16") == 0)
+        else if (strcmp(lower, "utf 16") == 0)
             return PyUnicode_DecodeUTF16(s, size, errors, 0);
-        else if (strcmp(lower, "utf-32") == 0)
+        else if (strcmp(lower, "utf 32") == 0)
             return PyUnicode_DecodeUTF32(s, size, errors, 0);
     }
 