diff -r 5c716437a83a Lib/email/charset.py --- a/Lib/email/charset.py Tue May 24 12:05:19 2011 +0200 +++ b/Lib/email/charset.py Tue May 24 18:27:41 2011 +0200 @@ -10,6 +10,7 @@ ] from functools import partial +from encodings import normalize_encoding import email.base64mime import email.quoprimime @@ -63,33 +64,20 @@ 'utf-8': (SHORTEST, BASE64, 'utf-8'), } -# Aliases for other commonly-used names for character sets. Map -# them to the real ones used in email. +# Maps normalized codecs with their corresponding IANA aliases ALIASES = { - 'latin_1': 'iso-8859-1', + 'not-a-charset': 'not a charset', 'latin-1': 'iso-8859-1', - 'latin_2': 'iso-8859-2', 'latin-2': 'iso-8859-2', - 'latin_3': 'iso-8859-3', 'latin-3': 'iso-8859-3', - 'latin_4': 'iso-8859-4', 'latin-4': 'iso-8859-4', - 'latin_5': 'iso-8859-9', 'latin-5': 'iso-8859-9', - 'latin_6': 'iso-8859-10', 'latin-6': 'iso-8859-10', - 'latin_7': 'iso-8859-13', 'latin-7': 'iso-8859-13', - 'latin_8': 'iso-8859-14', 'latin-8': 'iso-8859-14', - 'latin_9': 'iso-8859-15', 'latin-9': 'iso-8859-15', - 'latin_10':'iso-8859-16', 'latin-10':'iso-8859-16', - 'cp949': 'ks_c_5601-1987', - 'euc_jp': 'euc-jp', - 'euc_kr': 'euc-kr', - 'ascii': 'us-ascii', + 'shift-jis': 'shift_jis' } @@ -220,7 +208,9 @@ input_charset = str(input_charset, 'ascii') except UnicodeError: raise errors.CharsetError(input_charset) + input_charset = input_charset.lower() + input_charset = normalize_encoding(input_charset).replace('_', '-') # Set the input charset after filtering through the aliases self.input_charset = ALIASES.get(input_charset, input_charset) # We can try to guess which encoding and conversion to use by the diff -r 5c716437a83a Lib/encodings/aliases.py --- a/Lib/encodings/aliases.py Tue May 24 12:05:19 2011 +0200 +++ b/Lib/encodings/aliases.py Tue May 24 18:27:41 2011 +0200 @@ -254,7 +254,7 @@ # hp_roman8 codec 'roman8' : 'hp_roman8', 'r8' : 'hp_roman8', - 'csHPRoman8' : 'hp_roman8', + 'cshproman8' : 'hp_roman8', # hz codec 'hzgb' : 'hz', @@ -298,6 +298,7 @@ 'iso_ir_157' : 'iso8859_10', 'l6' : 'iso8859_10', 'latin6' : 'iso8859_10', + 'latin_6' : 'iso8859_10', # iso8859_11 codec 'thai' : 'iso8859_11', @@ -308,6 +309,7 @@ 'iso_8859_13' : 'iso8859_13', 'l7' : 'iso8859_13', 'latin7' : 'iso8859_13', + 'latin_7' : 'iso8859_13', # iso8859_14 codec 'iso_8859_14' : 'iso8859_14', @@ -316,11 +318,13 @@ 'iso_ir_199' : 'iso8859_14', 'l8' : 'iso8859_14', 'latin8' : 'iso8859_14', + 'latin_8' : 'iso8859_14', # iso8859_15 codec 'iso_8859_15' : 'iso8859_15', 'l9' : 'iso8859_15', 'latin9' : 'iso8859_15', + 'latin_9' : 'iso8859_15', # iso8859_16 codec 'iso_8859_16' : 'iso8859_16', @@ -328,6 +332,7 @@ 'iso_ir_226' : 'iso8859_16', 'l10' : 'iso8859_16', 'latin10' : 'iso8859_16', + 'latin_10' : 'iso8859_16', # iso8859_2 codec 'csisolatin2' : 'iso8859_2', @@ -336,6 +341,7 @@ 'iso_ir_101' : 'iso8859_2', 'l2' : 'iso8859_2', 'latin2' : 'iso8859_2', + 'latin_2' : 'iso8859_2', # iso8859_3 codec 'csisolatin3' : 'iso8859_3', @@ -344,6 +350,7 @@ 'iso_ir_109' : 'iso8859_3', 'l3' : 'iso8859_3', 'latin3' : 'iso8859_3', + 'latin_3' : 'iso8859_3', # iso8859_4 codec 'csisolatin4' : 'iso8859_4', @@ -352,6 +359,7 @@ 'iso_ir_110' : 'iso8859_4', 'l4' : 'iso8859_4', 'latin4' : 'iso8859_4', + 'latin_4' : 'iso8859_4', # iso8859_5 codec 'csisolatincyrillic' : 'iso8859_5', @@ -393,6 +401,7 @@ 'iso_ir_148' : 'iso8859_9', 'l5' : 'iso8859_9', 'latin5' : 'iso8859_9', + 'latin_5' : 'iso8859_9', # johab codec 'cp1361' : 'johab', @@ -474,9 +483,6 @@ 'sjisx0213' : 'shift_jisx0213', 's_jisx0213' : 'shift_jisx0213', - # tactis codec - 'tis260' : 'tactis', - # tis_620 codec 'tis620' : 'tis_620', 'tis_620_0' : 'tis_620', diff -r 5c716437a83a Lib/test/test_codeccallbacks.py --- a/Lib/test/test_codeccallbacks.py Tue May 24 12:05:19 2011 +0200 +++ b/Lib/test/test_codeccallbacks.py Tue May 24 18:27:41 2011 +0200 @@ -1,5 +1,13 @@ -import test.support, unittest -import sys, codecs, html.entities, unicodedata +import test.support +import unittest + +from encodings.aliases import aliases +import codecs +import unicodedata +import html.entities +import importlib +import sys + class PosReturn: # this can be used for configurable callbacks @@ -629,7 +637,16 @@ "test.badhandler" ) - def test_lookup(self): + def test_lookup_aliases(self): + for alias, module_name in aliases.items(): + if module_name == 'mbcs' and not sys.platform.startswith('win'): + continue + + module = importlib.import_module('encodings.' + module_name) + codec_name = module.getregentry().name + self.assertEqual(codecs.lookup(alias).name, codec_name) + + def test_lookup_error(self): self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict")) self.assertEqual(codecs.ignore_errors, codecs.lookup_error("ignore")) self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict")) @@ -664,7 +681,7 @@ self.assertRaises(TypeError, codecs.register_error, 42) self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42) - def test_badlookupcall(self): + def test_badlookup_errorcall(self): # enhance coverage of: # Modules/_codecsmodule.c::lookup_error() self.assertRaises(TypeError, codecs.lookup_error)