Normalize codec names the same way in Python and in C.

Encoding names are reduced to lower-case ASCII letters and digits; every
other character is dropped.  The same rule is applied by
encodings.normalize_encoding(), by normalize_encoding() in
Objects/unicodeobject.c and by normalizestring() in Python/codecs.c, so that
"UTF-8", "utf_8" and "utf8" all normalize to "utf8".  The fast paths in
PyUnicode_Decode() and PyUnicode_AsEncodedString() compare against the
normalized spellings.

normalizestring() now frees its scratch buffer even when the creation of the
result string fails.

Note: the "index" lines below are kept from the original patch; the
post-image hashes of Python/codecs.c and Lib/test/test_encodings.py were not
recomputed.

diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py
index b189bd9..469c5cf 100644
--- a/Lib/encodings/__init__.py
+++ b/Lib/encodings/__init__.py
@@ -55,15 +55,12 @@ def normalize_encoding(encoding):
     if isinstance(encoding, bytes):
         encoding = str(encoding, "ascii")
     chars = []
-    punct = False
     for c in encoding:
-        if c.isalnum() or c == '.':
-            if punct and chars:
-                chars.append('_')
+        if '0' <= c <= '9' or 'a' <= c <= 'z':
             chars.append(c)
-            punct = False
-        else:
-            punct = True
+        elif 'A' <= c <= 'Z':
+            chars.append(c.lower())
+        # else: skip characters
     return ''.join(chars)
 
 def search_function(encoding):
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index e7bbd80..84b257c 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1419,34 +1419,32 @@ PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
     return v;
 }
 
-/* Convert encoding to lower case and replace '_' with '-' in order to
-   catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1),
-   1 on success. */
+/* Convert encoding to lower case and skip every character other than a-z and
+   0-9. Return 0 on error (encoding is longer than lower_len-1), 1 on success.
+
+   See also normalizestring() in codecs and normalize_encoding() in encodings.
+ */
 static
 int normalize_encoding(const char *encoding,
                        char *lower,
                        size_t lower_len)
 {
     const char *e;
-    char *l;
+    char *l, c;
     char *l_end;
 
     e = encoding;
     l = lower;
     l_end = &lower[lower_len - 1];
-    while (*e) {
+    for (; *e; e++) {
         if (l == l_end)
             return 0;
-        if (Py_ISUPPER(*e)) {
-            *l++ = Py_TOLOWER(*e++);
-        }
-        else if (*e == '_') {
-            *l++ = '-';
-            e++;
-        }
-        else {
-            *l++ = *e++;
-        }
+        c = *e;
+        if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9'))
+            *l++ = c;
+        else if ('A' <= c && c <= 'Z')
+            *l++ = c + ('a' - 'A');
+        /* else: ignore the character */
     }
     *l = '\0';
     return 1;
@@ -1466,10 +1464,10 @@ PyObject *PyUnicode_Decode(const char *s,
 
     /* Shortcuts for common default encodings */
     if (normalize_encoding(encoding, lower, sizeof(lower))) {
-        if (strcmp(lower, "utf-8") == 0)
+        if (strcmp(lower, "utf8") == 0)
             return PyUnicode_DecodeUTF8(s, size, errors);
-        else if ((strcmp(lower, "latin-1") == 0) ||
-                 (strcmp(lower, "iso-8859-1") == 0))
+        else if ((strcmp(lower, "latin1") == 0) ||
+                 (strcmp(lower, "iso88591") == 0))
             return PyUnicode_DecodeLatin1(s, size, errors);
 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
         else if (strcmp(lower, "mbcs") == 0)
@@ -1477,9 +1475,9 @@ PyObject *PyUnicode_Decode(const char *s,
 #endif
         else if (strcmp(lower, "ascii") == 0)
             return PyUnicode_DecodeASCII(s, size, errors);
-        else if (strcmp(lower, "utf-16") == 0)
+        else if (strcmp(lower, "utf16") == 0)
             return PyUnicode_DecodeUTF16(s, size, errors, 0);
-        else if (strcmp(lower, "utf-32") == 0)
+        else if (strcmp(lower, "utf32") == 0)
             return PyUnicode_DecodeUTF32(s, size, errors, 0);
     }
 
@@ -1674,12 +1672,12 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
 
     /* Shortcuts for common default encodings */
     if (normalize_encoding(encoding, lower, sizeof(lower))) {
-        if (strcmp(lower, "utf-8") == 0)
+        if (strcmp(lower, "utf8") == 0)
             return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
                                         PyUnicode_GET_SIZE(unicode),
                                         errors);
-        else if ((strcmp(lower, "latin-1") == 0) ||
-                 (strcmp(lower, "iso-8859-1") == 0))
+        else if ((strcmp(lower, "latin1") == 0) ||
+                 (strcmp(lower, "iso88591") == 0))
             return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
                                           PyUnicode_GET_SIZE(unicode),
                                           errors);
diff --git a/Python/codecs.c b/Python/codecs.c
index 45d9929..856200c 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -45,15 +45,18 @@ int PyCodec_Register(PyObject *search_function)
     return -1;
 }
 
-/* Convert a string to a normalized Python string: all characters are
-   converted to lower case, spaces are replaced with underscores. */
+/* Convert encoding to lower case and skip every character other than a-z and
+   0-9.
+
+   See also normalize_encoding() in unicodeobject.c and in encodings.
+ */
 
 static
 PyObject *normalizestring(const char *string)
 {
     register size_t i;
     size_t len = strlen(string);
-    char *p;
+    char *buffer, *p;
     PyObject *v;
 
     if (len > PY_SSIZE_T_MAX) {
@@ -61,22 +64,21 @@ PyObject *normalizestring(const char *string)
         return NULL;
     }
 
-    p = PyMem_Malloc(len + 1);
-    if (p == NULL)
+    buffer = PyMem_Malloc(len + 1);
+    if (buffer == NULL)
         return NULL;
+    p = buffer;
     for (i = 0; i < len; i++) {
         register char ch = string[i];
-        if (ch == ' ')
-            ch = '-';
-        else
-            ch = tolower(Py_CHARMASK(ch));
-        p[i] = ch;
+
+        if (('a' <= ch && ch <= 'z') || ('0' <= ch && ch <= '9'))
+            *p++ = ch;
+        else if ('A' <= ch && ch <= 'Z')
+            *p++ = ch + ('a' - 'A');
     }
-    p[i] = '\0';
-    v = PyUnicode_FromString(p);
-    if (v == NULL)
-        return NULL;
-    PyMem_Free(p);
+    v = PyUnicode_FromStringAndSize(buffer, p - buffer);
+    /* Free the scratch buffer whether or not the conversion succeeded. */
+    PyMem_Free(buffer);
     return v;
 }
 
diff --git a/Lib/test/test_encodings.py b/Lib/test/test_encodings.py
new file mode 100644
index 0000000..eb9b823
--- /dev/null
+++ b/Lib/test/test_encodings.py
@@ -0,0 +1,27 @@
+from test import support
+import unittest
+import encodings
+
+class EncodingTests(unittest.TestCase):
+
+    def test_normalize(self):
+        def test(name, expected):
+            self.assertEqual(encodings.normalize_encoding(name), expected)
+
+        test(b'utf8', 'utf8')
+        test('utf8', 'utf8')
+        test('utf-8', 'utf8')
+        test('UTF-8', 'utf8')
+        test('fr_FR.UTF-8', 'frfrutf8')
+        test('UTF-8\xe9', 'utf8')
+        test('[ utf-8 ]', 'utf8')
+
+
+def test_main():
+    support.run_unittest(
+        EncodingTests,
+    )
+
+
+if __name__ == "__main__":
+    test_main()