diff -r da65e1dcdda9 Lib/email/charset.py --- a/Lib/email/charset.py Thu May 26 17:06:47 2011 +0200 +++ b/Lib/email/charset.py Fri May 27 07:25:05 2011 +0200 @@ -16,6 +16,7 @@ from email import errors from email.encoders import encode_7or8bit +from email.mime.aliases import normalize_encoding @@ -63,36 +64,6 @@ 'utf-8': (SHORTEST, BASE64, 'utf-8'), } -# Aliases for other commonly-used names for character sets. Map -# them to the real ones used in email. -ALIASES = { - 'latin_1': 'iso-8859-1', - 'latin-1': 'iso-8859-1', - 'latin_2': 'iso-8859-2', - 'latin-2': 'iso-8859-2', - 'latin_3': 'iso-8859-3', - 'latin-3': 'iso-8859-3', - 'latin_4': 'iso-8859-4', - 'latin-4': 'iso-8859-4', - 'latin_5': 'iso-8859-9', - 'latin-5': 'iso-8859-9', - 'latin_6': 'iso-8859-10', - 'latin-6': 'iso-8859-10', - 'latin_7': 'iso-8859-13', - 'latin-7': 'iso-8859-13', - 'latin_8': 'iso-8859-14', - 'latin-8': 'iso-8859-14', - 'latin_9': 'iso-8859-15', - 'latin-9': 'iso-8859-15', - 'latin_10':'iso-8859-16', - 'latin-10':'iso-8859-16', - 'cp949': 'ks_c_5601-1987', - 'euc_jp': 'euc-jp', - 'euc_kr': 'euc-kr', - 'ascii': 'us-ascii', - } - - # Map charsets to their Unicode codec strings. 
CODEC_MAP = { 'gb2312': 'eucgb2312_cn', @@ -103,6 +74,8 @@ 'us-ascii': None, } +# User-defined MIME encoding aliases +ALIASES = dict() # Convenience functions for extending the above mappings @@ -220,7 +193,8 @@ input_charset = str(input_charset, 'ascii') except UnicodeError: raise errors.CharsetError(input_charset) - input_charset = input_charset.lower() + + input_charset = normalize_encoding(input_charset) # Set the input charset after filtering through the aliases self.input_charset = ALIASES.get(input_charset, input_charset) # We can try to guess which encoding and conversion to use by the diff -r da65e1dcdda9 Lib/email/mime/aliases.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Lib/email/mime/aliases.py Fri May 27 07:25:05 2011 +0200 @@ -0,0 +1,29 @@ +from encodings import normalize_encoding as _normalize_encoding + +__all__ = ['normalize_encoding'] + + +# Maps normalized codec names to their corresponding IANA aliases +aliases = { + 'not-a-charset': 'not a charset', + 'latin-1': 'iso-8859-1', + 'latin-2': 'iso-8859-2', + 'latin-3': 'iso-8859-3', + 'latin-4': 'iso-8859-4', + 'latin-5': 'iso-8859-9', + 'latin-6': 'iso-8859-10', + 'latin-7': 'iso-8859-13', + 'latin-8': 'iso-8859-14', + 'latin-9': 'iso-8859-15', + 'latin-10':'iso-8859-16', + 'shift-jis': 'shift_jis' + } + + +def normalize_encoding(encoding): + encoding = _normalize_encoding(encoding.lower()) + # Since most IANA-defined aliases use '-' instead of '_', replace. 
+ encoding = encoding.replace('_', '-') + + return aliases.get(encoding, encoding) + diff -r da65e1dcdda9 Lib/encodings/aliases.py --- a/Lib/encodings/aliases.py Thu May 26 17:06:47 2011 +0200 +++ b/Lib/encodings/aliases.py Fri May 27 07:25:05 2011 +0200 @@ -254,7 +254,7 @@ # hp_roman8 codec 'roman8' : 'hp_roman8', 'r8' : 'hp_roman8', - 'csHPRoman8' : 'hp_roman8', + 'cshproman8' : 'hp_roman8', # hz codec 'hzgb' : 'hz', @@ -298,6 +298,7 @@ 'iso_ir_157' : 'iso8859_10', 'l6' : 'iso8859_10', 'latin6' : 'iso8859_10', + 'latin_6' : 'iso8859_10', # iso8859_11 codec 'thai' : 'iso8859_11', @@ -308,6 +309,7 @@ 'iso_8859_13' : 'iso8859_13', 'l7' : 'iso8859_13', 'latin7' : 'iso8859_13', + 'latin_7' : 'iso8859_13', # iso8859_14 codec 'iso_8859_14' : 'iso8859_14', @@ -316,11 +318,13 @@ 'iso_ir_199' : 'iso8859_14', 'l8' : 'iso8859_14', 'latin8' : 'iso8859_14', + 'latin_8' : 'iso8859_14', # iso8859_15 codec 'iso_8859_15' : 'iso8859_15', 'l9' : 'iso8859_15', 'latin9' : 'iso8859_15', + 'latin_9' : 'iso8859_15', # iso8859_16 codec 'iso_8859_16' : 'iso8859_16', @@ -328,6 +332,7 @@ 'iso_ir_226' : 'iso8859_16', 'l10' : 'iso8859_16', 'latin10' : 'iso8859_16', + 'latin_10' : 'iso8859_16', # iso8859_2 codec 'csisolatin2' : 'iso8859_2', @@ -336,6 +341,7 @@ 'iso_ir_101' : 'iso8859_2', 'l2' : 'iso8859_2', 'latin2' : 'iso8859_2', + 'latin_2' : 'iso8859_2', # iso8859_3 codec 'csisolatin3' : 'iso8859_3', @@ -344,6 +350,7 @@ 'iso_ir_109' : 'iso8859_3', 'l3' : 'iso8859_3', 'latin3' : 'iso8859_3', + 'latin_3' : 'iso8859_3', # iso8859_4 codec 'csisolatin4' : 'iso8859_4', @@ -352,6 +359,7 @@ 'iso_ir_110' : 'iso8859_4', 'l4' : 'iso8859_4', 'latin4' : 'iso8859_4', + 'latin_4' : 'iso8859_4', # iso8859_5 codec 'csisolatincyrillic' : 'iso8859_5', @@ -393,6 +401,7 @@ 'iso_ir_148' : 'iso8859_9', 'l5' : 'iso8859_9', 'latin5' : 'iso8859_9', + 'latin_5' : 'iso8859_9', # johab codec 'cp1361' : 'johab', @@ -474,9 +483,6 @@ 'sjisx0213' : 'shift_jisx0213', 's_jisx0213' : 'shift_jisx0213', - # tactis codec - 
'tis260' : 'tactis', - # tis_620 codec 'tis620' : 'tis_620', 'tis_620_0' : 'tis_620', diff -r da65e1dcdda9 Lib/test/test_codeccallbacks.py --- a/Lib/test/test_codeccallbacks.py Thu May 26 17:06:47 2011 +0200 +++ b/Lib/test/test_codeccallbacks.py Fri May 27 07:25:05 2011 +0200 @@ -1,5 +1,13 @@ -import test.support, unittest -import sys, codecs, html.entities, unicodedata +import test.support +import unittest + +from encodings.aliases import aliases +import codecs +import unicodedata +import html.entities +import importlib +import sys + class PosReturn: # this can be used for configurable callbacks @@ -629,7 +637,16 @@ "test.badhandler" ) - def test_lookup(self): + def test_lookup_aliases(self): + for alias, module_name in aliases.items(): + if module_name == 'mbcs' and not sys.platform.startswith('win'): + continue + + module = importlib.import_module('encodings.' + module_name) + codec_name = module.getregentry().name + self.assertEqual(codecs.lookup(alias).name, codec_name) + + def test_lookup_error(self): self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict")) self.assertEqual(codecs.ignore_errors, codecs.lookup_error("ignore")) self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict")) @@ -664,7 +681,7 @@ self.assertRaises(TypeError, codecs.register_error, 42) self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42) - def test_badlookupcall(self): + def test_badlookup_errorcall(self): # enhance coverage of: # Modules/_codecsmodule.c::lookup_error() self.assertRaises(TypeError, codecs.lookup_error)