# Summary of the changes visible in this part of the patch:
#
# Lib/test/test_ucn.py
#   * test_bmp_characters: the local `count` is removed, both its
#     initialisation and its increment inside the loop.
#   * test_aliases (new): for each (alias, code point) pair in a hard-coded
#     list, checks that the alias resolves to the expected character via
#     checkletter, that the alias differs from unicodedata.name() for that
#     character, and that unicodedata.lookup() returns the same result for
#     the alias and for the name.
#
# Modules/unicodedata.c
#   * _getcode: declares an additional local `k`; the second hunk starts an
#     added alias check after an existing `return 1; }`.  The comment that
#     opens that check is split across the end of this section and the
#     start of the next one.
diff --git a/Lib/test/test_ucn.py b/Lib/test/test_ucn.py
--- a/Lib/test/test_ucn.py
+++ b/Lib/test/test_ucn.py
@@ -98,13 +98,11 @@
 
     def test_bmp_characters(self):
         import unicodedata
-        count = 0
         for code in range(0x10000):
             char = chr(code)
             name = unicodedata.name(char, None)
             if name is not None:
                 self.assertEqual(unicodedata.lookup(name), char)
-                count += 1
 
     def test_misc_symbols(self):
         self.checkletter("PILCROW SIGN", "\u00b6")
@@ -112,6 +110,31 @@
         self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
         self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")
 
+    def test_aliases(self):
+        import unicodedata
+        # check that the aliases defined in the NameAliases.txt file work.
+        # This should be updated when new aliases are added or the file
+        # should be downloaded and parsed instead
+        aliases = [
+            ('LATIN CAPITAL LETTER GHA', 0x01A2),
+            ('LATIN SMALL LETTER GHA', 0x01A3),
+            ('KANNADA LETTER LLLA', 0x0CDE),
+            ('LAO LETTER FO FON', 0x0E9D),
+            ('LAO LETTER FO FAY', 0x0E9F),
+            ('LAO LETTER RO', 0x0EA3),
+            ('LAO LETTER LO', 0x0EA5),
+            ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
+            ('YI SYLLABLE ITERATION MARK', 0xA015),
+            ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
+            ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
+        ]
+        for alias, codepoint in aliases:
+            self.checkletter(alias, chr(codepoint))
+            name = unicodedata.name(chr(codepoint))
+            self.assertNotEqual(alias, name)
+            self.assertEqual(unicodedata.lookup(alias),
+                             unicodedata.lookup(name))
+
     def test_errors(self):
         import unicodedata
         self.assertRaises(TypeError, unicodedata.name)
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -1054,7 +1054,7 @@
 static int
 _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
 {
-    unsigned int h, v;
+    unsigned int h, v, k;
     unsigned int mask = code_size-1;
     unsigned int i, incr;
 
@@ -1100,6 +1100,17 @@
         return 1;
     }
 
+    // check for aliases
defined in NameAliases.txt + for (k=0; kcode dictionary */", file=fp) codehash.dump(fp, trace) + print(""" +typedef struct Alias { + char *name; + int namelen; + int codepoint; +} alias; +""", file=fp) + + print('int aliases_count = %d;' % len(unicode.aliases), file=fp) + + print('alias name_aliases[] = {', file=fp) + for name, codepoint in unicode.aliases: + print(' {"%s", %d, 0x%04X},' % (name, len(name), codepoint), file=fp) + print('};', file=fp) + fp.close() @@ -855,6 +871,16 @@ self.table = table self.chars = list(range(0x110000)) # unicode 3.2 + self.aliases = [] + with open_data(NAME_ALIASES, version) as file: + for s in file: + s = s.strip() + if not s or s.startswith('#'): + continue + char, name = s.split(';') + char = int(char, 16) + self.aliases.append((name, char)) + self.exclusions = {} with open_data(COMPOSITION_EXCLUSIONS, version) as file: for s in file: