diff --git a/Doc/library/unicodedata.rst b/Doc/library/unicodedata.rst --- a/Doc/library/unicodedata.rst +++ b/Doc/library/unicodedata.rst @@ -29,6 +29,9 @@ Look up character by name. If a character with the given name is found, return the corresponding character. If not found, :exc:`KeyError` is raised. + .. versionchanged:: 3.3 + Support for name aliases [#]_ and named sequences [#]_ has been added. + .. function:: name(chr[, default]) @@ -160,3 +163,9 @@ >>> unicodedata.bidirectional('\u0660') # 'A'rabic, 'N'umber 'AN' + +.. rubric:: Footnotes + +.. [#] http://www.unicode.org/Public/6.0.0/ucd/NameAliases.txt + +.. [#] http://www.unicode.org/Public/6.0.0/ucd/NamedSequences.txt diff --git a/Doc/reference/lexical_analysis.rst b/Doc/reference/lexical_analysis.rst --- a/Doc/reference/lexical_analysis.rst +++ b/Doc/reference/lexical_analysis.rst @@ -492,13 +492,13 @@ +-----------------+---------------------------------+-------+ | Escape Sequence | Meaning | Notes | +=================+=================================+=======+ -| ``\N{name}`` | Character named *name* in the | | +| ``\N{name}`` | Character named *name* in the | \(4) | | | Unicode database | | +-----------------+---------------------------------+-------+ -| ``\uxxxx`` | Character with 16-bit hex value | \(4) | +| ``\uxxxx`` | Character with 16-bit hex value | \(5) | | | *xxxx* | | +-----------------+---------------------------------+-------+ -| ``\Uxxxxxxxx`` | Character with 32-bit hex value | \(5) | +| ``\Uxxxxxxxx`` | Character with 32-bit hex value | \(6) | | | *xxxxxxxx* | | +-----------------+---------------------------------+-------+ @@ -516,10 +516,14 @@ with the given value. (4) + .. versionchanged:: 3.3 + Support for name aliases [#]_ has been added. + +(5) Individual code units which form parts of a surrogate pair can be encoded using this escape sequence. Exactly four hex digits are required. 
-(5) +(6) Any Unicode character can be encoded this way, but characters outside the Basic Multilingual Plane (BMP) will be encoded using a surrogate pair if Python is compiled to use 16-bit code units (the default). Exactly eight hex digits @@ -706,3 +710,8 @@ occurrence outside string literals and comments is an unconditional error:: $ ? ` + + +.. rubric:: Footnotes + +.. [#] http://www.unicode.org/Public/6.0.0/ucd/NameAliases.txt diff --git a/Lib/test/test_ucn.py b/Lib/test/test_ucn.py --- a/Lib/test/test_ucn.py +++ b/Lib/test/test_ucn.py @@ -8,8 +8,11 @@ """#" import unittest +import unicodedata from test import support +from http.client import HTTPException +from test.test_normalization import check_version class UnicodeNamesTest(unittest.TestCase): @@ -59,8 +62,6 @@ ) def test_ascii_letters(self): - import unicodedata - for char in "".join(map(chr, range(ord("a"), ord("z")))): name = "LATIN SMALL LETTER %s" % char.upper() code = unicodedata.lookup(name) @@ -81,7 +82,6 @@ self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8") self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3") - import unicodedata self.assertRaises(ValueError, unicodedata.name, "\ud7a4") def test_cjk_unified_ideographs(self): @@ -97,14 +97,11 @@ self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D") def test_bmp_characters(self): - import unicodedata - count = 0 for code in range(0x10000): char = chr(code) name = unicodedata.name(char, None) if name is not None: self.assertEqual(unicodedata.lookup(name), char) - count += 1 def test_misc_symbols(self): self.checkletter("PILCROW SIGN", "\u00b6") @@ -112,8 +109,65 @@ self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F") self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41") + def test_aliases(self): + # Check that the aliases defined in the NameAliases.txt file work. + # This should be updated when new aliases are added or the file + # should be downloaded and parsed instead. See #12753. 
+ aliases = [ + ('LATIN CAPITAL LETTER GHA', 0x01A2), + ('LATIN SMALL LETTER GHA', 0x01A3), + ('KANNADA LETTER LLLA', 0x0CDE), + ('LAO LETTER FO FON', 0x0E9D), + ('LAO LETTER FO FAY', 0x0E9F), + ('LAO LETTER RO', 0x0EA3), + ('LAO LETTER LO', 0x0EA5), + ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0), + ('YI SYLLABLE ITERATION MARK', 0xA015), + ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18), + ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5) + ] + for alias, codepoint in aliases: + self.checkletter(alias, chr(codepoint)) + name = unicodedata.name(chr(codepoint)) + self.assertNotEqual(name, alias) + self.assertEqual(unicodedata.lookup(alias), + unicodedata.lookup(name)) + + def test_named_sequences_sample(self): + # Check a few named sequences. See #12753. + sequences = [ + ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'), + ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'), + ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'), + ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'), + ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'), + ] + for seqname, codepoints in sequences: + self.assertEqual(unicodedata.lookup(seqname), codepoints) + with self.assertRaises(SyntaxError): + self.checkletter(seqname, None) + + def test_named_sequences_full(self): + # Check all the named sequences + url = ("http://www.unicode.org/Public/%s/ucd/NamedSequences.txt" % + unicodedata.unidata_version) + try: + testdata = support.open_urlresource(url, encoding="utf-8", + check=check_version) + except (IOError, HTTPException): + self.skipTest("Could not retrieve " + url) + self.addCleanup(testdata.close) + for line in testdata: + line = line.strip() + if not line or line.startswith('#'): + continue + seqname, codepoints = line.split(';') + codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split()) + self.assertEqual(unicodedata.lookup(seqname), codepoints) + with self.assertRaises(SyntaxError): + self.checkletter(seqname, None) + def test_errors(self): - import 
unicodedata self.assertRaises(TypeError, unicodedata.name) self.assertRaises(TypeError, unicodedata.name, 'xx') self.assertRaises(TypeError, unicodedata.lookup) diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -1054,7 +1054,7 @@ static int _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code) { - unsigned int h, v; + unsigned int h, v, k; unsigned int mask = code_size-1; unsigned int i, incr; @@ -1100,6 +1100,17 @@ return 1; } + /* check for aliases defined in NameAliases.txt */ + for (k=0; k 0) + low = mid + 1; + else + return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, + named_sequences[mid].seq, + named_sequences[mid].seqlen); + } + return NULL; +} + PyDoc_STRVAR(unicodedata_lookup__doc__, "lookup(name)\n\ \n\ @@ -1187,6 +1218,7 @@ unicodedata_lookup(PyObject* self, PyObject* args) { Py_UCS4 code; + PyObject *codes; /* for named sequences */ char* name; int namelen; @@ -1194,9 +1226,13 @@ return NULL; if (!_getcode(self, name, namelen, &code)) { - PyErr_Format(PyExc_KeyError, "undefined character name '%s'", - name); - return NULL; + /* if the normal lookup fails try with named sequences */ + codes = _lookup_named_sequences(name); + if (codes == NULL) { + PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name); + return NULL; + } + return codes; } return PyUnicode_FromOrdinal(code); diff --git a/Modules/unicodename_db.h b/Modules/unicodename_db.h --- a/Modules/unicodename_db.h +++ b/Modules/unicodename_db.h @@ -18811,3 +18811,452 @@ #define code_magic 47 #define code_size 32768 #define code_poly 32771 + +typedef struct Alias { + char *name; + int namelen; + int codepoint; +} alias; + +static const int aliases_count = 11; +static const alias name_aliases[] = { + {"LATIN CAPITAL LETTER GHA", 24, 0x01A2}, + {"LATIN SMALL LETTER GHA", 22, 0x01A3}, + {"KANNADA LETTER LLLA", 19, 0x0CDE}, + {"LAO LETTER FO FON", 17, 0x0E9D}, + {"LAO LETTER FO FAY", 17, 0x0E9F}, + {"LAO 
LETTER RO", 13, 0x0EA3}, + {"LAO LETTER LO", 13, 0x0EA5}, + {"TIBETAN MARK BKA- SHOG GI MGO RGYAN", 35, 0x0FD0}, + {"YI SYLLABLE ITERATION MARK", 26, 0xA015}, + {"PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET", 61, 0xFE18}, + {"BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS", 52, 0x1D0C5}, +}; + +typedef struct NamedSequence { + char *name; + int seqlen; + Py_UCS2 seq[4]; +} named_sequence; + +static const int named_sequences_count = 418; +static const named_sequence named_sequences[] = { + {"BENGALI LETTER KHINYA", 3, {0x0995, 0x09CD, 0x09B7}}, + {"GEORGIAN LETTER U-BRJGU", 2, {0x10E3, 0x0302}}, + {"HIRAGANA LETTER BIDAKUON NGA", 2, {0x304B, 0x309A}}, + {"HIRAGANA LETTER BIDAKUON NGE", 2, {0x3051, 0x309A}}, + {"HIRAGANA LETTER BIDAKUON NGI", 2, {0x304D, 0x309A}}, + {"HIRAGANA LETTER BIDAKUON NGO", 2, {0x3053, 0x309A}}, + {"HIRAGANA LETTER BIDAKUON NGU", 2, {0x304F, 0x309A}}, + {"KATAKANA LETTER AINU CE", 2, {0x30BB, 0x309A}}, + {"KATAKANA LETTER AINU P", 2, {0x31F7, 0x309A}}, + {"KATAKANA LETTER AINU TO", 2, {0x30C8, 0x309A}}, + {"KATAKANA LETTER AINU TU", 2, {0x30C4, 0x309A}}, + {"KATAKANA LETTER BIDAKUON NGA", 2, {0x30AB, 0x309A}}, + {"KATAKANA LETTER BIDAKUON NGE", 2, {0x30B1, 0x309A}}, + {"KATAKANA LETTER BIDAKUON NGI", 2, {0x30AD, 0x309A}}, + {"KATAKANA LETTER BIDAKUON NGO", 2, {0x30B3, 0x309A}}, + {"KATAKANA LETTER BIDAKUON NGU", 2, {0x30AF, 0x309A}}, + {"KHMER CONSONANT SIGN COENG BA", 2, {0x17D2, 0x1794}}, + {"KHMER CONSONANT SIGN COENG CA", 2, {0x17D2, 0x1785}}, + {"KHMER CONSONANT SIGN COENG CHA", 2, {0x17D2, 0x1786}}, + {"KHMER CONSONANT SIGN COENG CHO", 2, {0x17D2, 0x1788}}, + {"KHMER CONSONANT SIGN COENG CO", 2, {0x17D2, 0x1787}}, + {"KHMER CONSONANT SIGN COENG DA", 2, {0x17D2, 0x178A}}, + {"KHMER CONSONANT SIGN COENG DO", 2, {0x17D2, 0x178C}}, + {"KHMER CONSONANT SIGN COENG HA", 2, {0x17D2, 0x17A0}}, + {"KHMER CONSONANT SIGN COENG KA", 2, {0x17D2, 0x1780}}, + {"KHMER CONSONANT SIGN COENG KHA", 2, {0x17D2, 0x1781}}, + {"KHMER 
CONSONANT SIGN COENG KHO", 2, {0x17D2, 0x1783}}, + {"KHMER CONSONANT SIGN COENG KO", 2, {0x17D2, 0x1782}}, + {"KHMER CONSONANT SIGN COENG LA", 2, {0x17D2, 0x17A1}}, + {"KHMER CONSONANT SIGN COENG LO", 2, {0x17D2, 0x179B}}, + {"KHMER CONSONANT SIGN COENG MO", 2, {0x17D2, 0x1798}}, + {"KHMER CONSONANT SIGN COENG NA", 2, {0x17D2, 0x178E}}, + {"KHMER CONSONANT SIGN COENG NGO", 2, {0x17D2, 0x1784}}, + {"KHMER CONSONANT SIGN COENG NO", 2, {0x17D2, 0x1793}}, + {"KHMER CONSONANT SIGN COENG NYO", 2, {0x17D2, 0x1789}}, + {"KHMER CONSONANT SIGN COENG PHA", 2, {0x17D2, 0x1795}}, + {"KHMER CONSONANT SIGN COENG PHO", 2, {0x17D2, 0x1797}}, + {"KHMER CONSONANT SIGN COENG PO", 2, {0x17D2, 0x1796}}, + {"KHMER CONSONANT SIGN COENG RO", 2, {0x17D2, 0x179A}}, + {"KHMER CONSONANT SIGN COENG SA", 2, {0x17D2, 0x179F}}, + {"KHMER CONSONANT SIGN COENG SHA", 2, {0x17D2, 0x179D}}, + {"KHMER CONSONANT SIGN COENG SSA", 2, {0x17D2, 0x179E}}, + {"KHMER CONSONANT SIGN COENG TA", 2, {0x17D2, 0x178F}}, + {"KHMER CONSONANT SIGN COENG THA", 2, {0x17D2, 0x1790}}, + {"KHMER CONSONANT SIGN COENG THO", 2, {0x17D2, 0x1792}}, + {"KHMER CONSONANT SIGN COENG TO", 2, {0x17D2, 0x1791}}, + {"KHMER CONSONANT SIGN COENG TTHA", 2, {0x17D2, 0x178B}}, + {"KHMER CONSONANT SIGN COENG TTHO", 2, {0x17D2, 0x178D}}, + {"KHMER CONSONANT SIGN COENG VO", 2, {0x17D2, 0x179C}}, + {"KHMER CONSONANT SIGN COENG YO", 2, {0x17D2, 0x1799}}, + {"KHMER INDEPENDENT VOWEL SIGN COENG QE", 2, {0x17D2, 0x17AF}}, + {"KHMER INDEPENDENT VOWEL SIGN COENG QU", 2, {0x17D2, 0x17A7}}, + {"KHMER INDEPENDENT VOWEL SIGN COENG RY", 2, {0x17D2, 0x17AB}}, + {"KHMER INDEPENDENT VOWEL SIGN COENG RYY", 2, {0x17D2, 0x17AC}}, + {"KHMER VOWEL SIGN AAM", 2, {0x17B6, 0x17C6}}, + {"KHMER VOWEL SIGN COENG QA", 2, {0x17D2, 0x17A2}}, + {"KHMER VOWEL SIGN OM", 2, {0x17BB, 0x17C6}}, + {"LATIN CAPITAL LETTER A WITH MACRON AND GRAVE", 2, {0x0100, 0x0300}}, + {"LATIN CAPITAL LETTER A WITH OGONEK AND ACUTE", 2, {0x0104, 0x0301}}, + {"LATIN CAPITAL LETTER A WITH OGONEK AND 
TILDE", 2, {0x0104, 0x0303}}, + {"LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND CARON", 2, {0x00CA, 0x030C}}, + {"LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND MACRON", 2, {0x00CA, 0x0304}}, + {"LATIN CAPITAL LETTER E WITH DOT ABOVE AND ACUTE", 2, {0x0116, 0x0301}}, + {"LATIN CAPITAL LETTER E WITH DOT ABOVE AND TILDE", 2, {0x0116, 0x0303}}, + {"LATIN CAPITAL LETTER E WITH OGONEK AND ACUTE", 2, {0x0118, 0x0301}}, + {"LATIN CAPITAL LETTER E WITH OGONEK AND TILDE", 2, {0x0118, 0x0303}}, + {"LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW", 2, {0x0045, 0x0329}}, + {"LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW AND ACUTE", 2, {0x00C9, 0x0329}}, + {"LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW AND GRAVE", 2, {0x00C8, 0x0329}}, + {"LATIN CAPITAL LETTER I WITH MACRON AND GRAVE", 2, {0x012A, 0x0300}}, + {"LATIN CAPITAL LETTER I WITH OGONEK AND ACUTE", 2, {0x012E, 0x0301}}, + {"LATIN CAPITAL LETTER I WITH OGONEK AND TILDE", 2, {0x012E, 0x0303}}, + {"LATIN CAPITAL LETTER J WITH TILDE", 2, {0x004A, 0x0303}}, + {"LATIN CAPITAL LETTER L WITH TILDE", 2, {0x004C, 0x0303}}, + {"LATIN CAPITAL LETTER M WITH TILDE", 2, {0x004D, 0x0303}}, + {"LATIN CAPITAL LETTER O WITH VERTICAL LINE BELOW", 2, {0x004F, 0x0329}}, + {"LATIN CAPITAL LETTER O WITH VERTICAL LINE BELOW AND ACUTE", 2, {0x00D3, 0x0329}}, + {"LATIN CAPITAL LETTER O WITH VERTICAL LINE BELOW AND GRAVE", 2, {0x00D2, 0x0329}}, + {"LATIN CAPITAL LETTER R WITH TILDE", 2, {0x0052, 0x0303}}, + {"LATIN CAPITAL LETTER S WITH VERTICAL LINE BELOW", 2, {0x0053, 0x0329}}, + {"LATIN CAPITAL LETTER U WITH MACRON AND ACUTE", 2, {0x016A, 0x0301}}, + {"LATIN CAPITAL LETTER U WITH MACRON AND GRAVE", 2, {0x016A, 0x0300}}, + {"LATIN CAPITAL LETTER U WITH MACRON AND TILDE", 2, {0x016A, 0x0303}}, + {"LATIN CAPITAL LETTER U WITH OGONEK AND ACUTE", 2, {0x0172, 0x0301}}, + {"LATIN CAPITAL LETTER U WITH OGONEK AND TILDE", 2, {0x0172, 0x0303}}, + {"LATIN SMALL LETTER A WITH MACRON AND GRAVE", 2, {0x0101, 0x0300}}, + {"LATIN SMALL LETTER A WITH OGONEK AND 
ACUTE", 2, {0x0105, 0x0301}}, + {"LATIN SMALL LETTER A WITH OGONEK AND TILDE", 2, {0x0105, 0x0303}}, + {"LATIN SMALL LETTER AE WITH GRAVE", 2, {0x00E6, 0x0300}}, + {"LATIN SMALL LETTER E WITH CIRCUMFLEX AND CARON", 2, {0x00EA, 0x030C}}, + {"LATIN SMALL LETTER E WITH CIRCUMFLEX AND MACRON", 2, {0x00EA, 0x0304}}, + {"LATIN SMALL LETTER E WITH DOT ABOVE AND ACUTE", 2, {0x0117, 0x0301}}, + {"LATIN SMALL LETTER E WITH DOT ABOVE AND TILDE", 2, {0x0117, 0x0303}}, + {"LATIN SMALL LETTER E WITH OGONEK AND ACUTE", 2, {0x0119, 0x0301}}, + {"LATIN SMALL LETTER E WITH OGONEK AND TILDE", 2, {0x0119, 0x0303}}, + {"LATIN SMALL LETTER E WITH VERTICAL LINE BELOW", 2, {0x0065, 0x0329}}, + {"LATIN SMALL LETTER E WITH VERTICAL LINE BELOW AND ACUTE", 2, {0x00E9, 0x0329}}, + {"LATIN SMALL LETTER E WITH VERTICAL LINE BELOW AND GRAVE", 2, {0x00E8, 0x0329}}, + {"LATIN SMALL LETTER HOOKED SCHWA WITH ACUTE", 2, {0x025A, 0x0301}}, + {"LATIN SMALL LETTER HOOKED SCHWA WITH GRAVE", 2, {0x025A, 0x0300}}, + {"LATIN SMALL LETTER I WITH DOT ABOVE AND ACUTE", 3, {0x0069, 0x0307, 0x0301}}, + {"LATIN SMALL LETTER I WITH DOT ABOVE AND GRAVE", 3, {0x0069, 0x0307, 0x0300}}, + {"LATIN SMALL LETTER I WITH DOT ABOVE AND TILDE", 3, {0x0069, 0x0307, 0x0303}}, + {"LATIN SMALL LETTER I WITH MACRON AND GRAVE", 2, {0x012B, 0x0300}}, + {"LATIN SMALL LETTER I WITH OGONEK AND DOT ABOVE AND ACUTE", 3, {0x012F, 0x0307, 0x0301}}, + {"LATIN SMALL LETTER I WITH OGONEK AND DOT ABOVE AND TILDE", 3, {0x012F, 0x0307, 0x0303}}, + {"LATIN SMALL LETTER J WITH DOT ABOVE AND TILDE", 3, {0x006A, 0x0307, 0x0303}}, + {"LATIN SMALL LETTER L WITH TILDE", 2, {0x006C, 0x0303}}, + {"LATIN SMALL LETTER M WITH TILDE", 2, {0x006D, 0x0303}}, + {"LATIN SMALL LETTER NG WITH TILDE ABOVE", 3, {0x006E, 0x0360, 0x0067}}, + {"LATIN SMALL LETTER O WITH VERTICAL LINE BELOW", 2, {0x006F, 0x0329}}, + {"LATIN SMALL LETTER O WITH VERTICAL LINE BELOW AND ACUTE", 2, {0x00F3, 0x0329}}, + {"LATIN SMALL LETTER O WITH VERTICAL LINE BELOW AND GRAVE", 2, {0x00F2, 
0x0329}}, + {"LATIN SMALL LETTER OPEN O WITH ACUTE", 2, {0x0254, 0x0301}}, + {"LATIN SMALL LETTER OPEN O WITH GRAVE", 2, {0x0254, 0x0300}}, + {"LATIN SMALL LETTER R WITH TILDE", 2, {0x0072, 0x0303}}, + {"LATIN SMALL LETTER S WITH VERTICAL LINE BELOW", 2, {0x0073, 0x0329}}, + {"LATIN SMALL LETTER SCHWA WITH ACUTE", 2, {0x0259, 0x0301}}, + {"LATIN SMALL LETTER SCHWA WITH GRAVE", 2, {0x0259, 0x0300}}, + {"LATIN SMALL LETTER TURNED V WITH ACUTE", 2, {0x028C, 0x0301}}, + {"LATIN SMALL LETTER TURNED V WITH GRAVE", 2, {0x028C, 0x0300}}, + {"LATIN SMALL LETTER U WITH MACRON AND ACUTE", 2, {0x016B, 0x0301}}, + {"LATIN SMALL LETTER U WITH MACRON AND GRAVE", 2, {0x016B, 0x0300}}, + {"LATIN SMALL LETTER U WITH MACRON AND TILDE", 2, {0x016B, 0x0303}}, + {"LATIN SMALL LETTER U WITH OGONEK AND ACUTE", 2, {0x0173, 0x0301}}, + {"LATIN SMALL LETTER U WITH OGONEK AND TILDE", 2, {0x0173, 0x0303}}, + {"MODIFIER LETTER EXTRA-HIGH EXTRA-LOW CONTOUR TONE BAR", 2, {0x02E5, 0x02E9}}, + {"MODIFIER LETTER EXTRA-LOW EXTRA-HIGH CONTOUR TONE BAR", 2, {0x02E9, 0x02E5}}, + {"TAMIL CONSONANT C", 2, {0x0B9A, 0x0BCD}}, + {"TAMIL CONSONANT H", 2, {0x0BB9, 0x0BCD}}, + {"TAMIL CONSONANT J", 2, {0x0B9C, 0x0BCD}}, + {"TAMIL CONSONANT K", 2, {0x0B95, 0x0BCD}}, + {"TAMIL CONSONANT KSS", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BCD}}, + {"TAMIL CONSONANT L", 2, {0x0BB2, 0x0BCD}}, + {"TAMIL CONSONANT LL", 2, {0x0BB3, 0x0BCD}}, + {"TAMIL CONSONANT LLL", 2, {0x0BB4, 0x0BCD}}, + {"TAMIL CONSONANT M", 2, {0x0BAE, 0x0BCD}}, + {"TAMIL CONSONANT N", 2, {0x0BA8, 0x0BCD}}, + {"TAMIL CONSONANT NG", 2, {0x0B99, 0x0BCD}}, + {"TAMIL CONSONANT NN", 2, {0x0BA3, 0x0BCD}}, + {"TAMIL CONSONANT NNN", 2, {0x0BA9, 0x0BCD}}, + {"TAMIL CONSONANT NY", 2, {0x0B9E, 0x0BCD}}, + {"TAMIL CONSONANT P", 2, {0x0BAA, 0x0BCD}}, + {"TAMIL CONSONANT R", 2, {0x0BB0, 0x0BCD}}, + {"TAMIL CONSONANT RR", 2, {0x0BB1, 0x0BCD}}, + {"TAMIL CONSONANT S", 2, {0x0BB8, 0x0BCD}}, + {"TAMIL CONSONANT SH", 2, {0x0BB6, 0x0BCD}}, + {"TAMIL CONSONANT SS", 2, {0x0BB7, 
0x0BCD}}, + {"TAMIL CONSONANT T", 2, {0x0BA4, 0x0BCD}}, + {"TAMIL CONSONANT TT", 2, {0x0B9F, 0x0BCD}}, + {"TAMIL CONSONANT V", 2, {0x0BB5, 0x0BCD}}, + {"TAMIL CONSONANT Y", 2, {0x0BAF, 0x0BCD}}, + {"TAMIL SYLLABLE CAA", 2, {0x0B9A, 0x0BBE}}, + {"TAMIL SYLLABLE CAI", 2, {0x0B9A, 0x0BC8}}, + {"TAMIL SYLLABLE CAU", 2, {0x0B9A, 0x0BCC}}, + {"TAMIL SYLLABLE CE", 2, {0x0B9A, 0x0BC6}}, + {"TAMIL SYLLABLE CEE", 2, {0x0B9A, 0x0BC7}}, + {"TAMIL SYLLABLE CI", 2, {0x0B9A, 0x0BBF}}, + {"TAMIL SYLLABLE CII", 2, {0x0B9A, 0x0BC0}}, + {"TAMIL SYLLABLE CO", 2, {0x0B9A, 0x0BCA}}, + {"TAMIL SYLLABLE COO", 2, {0x0B9A, 0x0BCB}}, + {"TAMIL SYLLABLE CU", 2, {0x0B9A, 0x0BC1}}, + {"TAMIL SYLLABLE CUU", 2, {0x0B9A, 0x0BC2}}, + {"TAMIL SYLLABLE HAA", 2, {0x0BB9, 0x0BBE}}, + {"TAMIL SYLLABLE HAI", 2, {0x0BB9, 0x0BC8}}, + {"TAMIL SYLLABLE HAU", 2, {0x0BB9, 0x0BCC}}, + {"TAMIL SYLLABLE HE", 2, {0x0BB9, 0x0BC6}}, + {"TAMIL SYLLABLE HEE", 2, {0x0BB9, 0x0BC7}}, + {"TAMIL SYLLABLE HI", 2, {0x0BB9, 0x0BBF}}, + {"TAMIL SYLLABLE HII", 2, {0x0BB9, 0x0BC0}}, + {"TAMIL SYLLABLE HO", 2, {0x0BB9, 0x0BCA}}, + {"TAMIL SYLLABLE HOO", 2, {0x0BB9, 0x0BCB}}, + {"TAMIL SYLLABLE HU", 2, {0x0BB9, 0x0BC1}}, + {"TAMIL SYLLABLE HUU", 2, {0x0BB9, 0x0BC2}}, + {"TAMIL SYLLABLE JAA", 2, {0x0B9C, 0x0BBE}}, + {"TAMIL SYLLABLE JAI", 2, {0x0B9C, 0x0BC8}}, + {"TAMIL SYLLABLE JAU", 2, {0x0B9C, 0x0BCC}}, + {"TAMIL SYLLABLE JE", 2, {0x0B9C, 0x0BC6}}, + {"TAMIL SYLLABLE JEE", 2, {0x0B9C, 0x0BC7}}, + {"TAMIL SYLLABLE JI", 2, {0x0B9C, 0x0BBF}}, + {"TAMIL SYLLABLE JII", 2, {0x0B9C, 0x0BC0}}, + {"TAMIL SYLLABLE JO", 2, {0x0B9C, 0x0BCA}}, + {"TAMIL SYLLABLE JOO", 2, {0x0B9C, 0x0BCB}}, + {"TAMIL SYLLABLE JU", 2, {0x0B9C, 0x0BC1}}, + {"TAMIL SYLLABLE JUU", 2, {0x0B9C, 0x0BC2}}, + {"TAMIL SYLLABLE KAA", 2, {0x0B95, 0x0BBE}}, + {"TAMIL SYLLABLE KAI", 2, {0x0B95, 0x0BC8}}, + {"TAMIL SYLLABLE KAU", 2, {0x0B95, 0x0BCC}}, + {"TAMIL SYLLABLE KE", 2, {0x0B95, 0x0BC6}}, + {"TAMIL SYLLABLE KEE", 2, {0x0B95, 0x0BC7}}, + {"TAMIL SYLLABLE KI", 2, 
{0x0B95, 0x0BBF}}, + {"TAMIL SYLLABLE KII", 2, {0x0B95, 0x0BC0}}, + {"TAMIL SYLLABLE KO", 2, {0x0B95, 0x0BCA}}, + {"TAMIL SYLLABLE KOO", 2, {0x0B95, 0x0BCB}}, + {"TAMIL SYLLABLE KSSA", 3, {0x0B95, 0x0BCD, 0x0BB7}}, + {"TAMIL SYLLABLE KSSAA", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BBE}}, + {"TAMIL SYLLABLE KSSAI", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BC8}}, + {"TAMIL SYLLABLE KSSAU", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BCC}}, + {"TAMIL SYLLABLE KSSE", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BC6}}, + {"TAMIL SYLLABLE KSSEE", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BC7}}, + {"TAMIL SYLLABLE KSSI", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BBF}}, + {"TAMIL SYLLABLE KSSII", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BC0}}, + {"TAMIL SYLLABLE KSSO", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BCA}}, + {"TAMIL SYLLABLE KSSOO", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BCB}}, + {"TAMIL SYLLABLE KSSU", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BC1}}, + {"TAMIL SYLLABLE KSSUU", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BC2}}, + {"TAMIL SYLLABLE KU", 2, {0x0B95, 0x0BC1}}, + {"TAMIL SYLLABLE KUU", 2, {0x0B95, 0x0BC2}}, + {"TAMIL SYLLABLE LAA", 2, {0x0BB2, 0x0BBE}}, + {"TAMIL SYLLABLE LAI", 2, {0x0BB2, 0x0BC8}}, + {"TAMIL SYLLABLE LAU", 2, {0x0BB2, 0x0BCC}}, + {"TAMIL SYLLABLE LE", 2, {0x0BB2, 0x0BC6}}, + {"TAMIL SYLLABLE LEE", 2, {0x0BB2, 0x0BC7}}, + {"TAMIL SYLLABLE LI", 2, {0x0BB2, 0x0BBF}}, + {"TAMIL SYLLABLE LII", 2, {0x0BB2, 0x0BC0}}, + {"TAMIL SYLLABLE LLAA", 2, {0x0BB3, 0x0BBE}}, + {"TAMIL SYLLABLE LLAI", 2, {0x0BB3, 0x0BC8}}, + {"TAMIL SYLLABLE LLAU", 2, {0x0BB3, 0x0BCC}}, + {"TAMIL SYLLABLE LLE", 2, {0x0BB3, 0x0BC6}}, + {"TAMIL SYLLABLE LLEE", 2, {0x0BB3, 0x0BC7}}, + {"TAMIL SYLLABLE LLI", 2, {0x0BB3, 0x0BBF}}, + {"TAMIL SYLLABLE LLII", 2, {0x0BB3, 0x0BC0}}, + {"TAMIL SYLLABLE LLLAA", 2, {0x0BB4, 0x0BBE}}, + {"TAMIL SYLLABLE LLLAI", 2, {0x0BB4, 0x0BC8}}, + {"TAMIL SYLLABLE LLLAU", 2, {0x0BB4, 0x0BCC}}, + {"TAMIL SYLLABLE LLLE", 2, {0x0BB4, 0x0BC6}}, + {"TAMIL SYLLABLE LLLEE", 2, {0x0BB4, 0x0BC7}}, + {"TAMIL SYLLABLE LLLI", 2, {0x0BB4, 0x0BBF}}, + {"TAMIL SYLLABLE LLLII", 
2, {0x0BB4, 0x0BC0}}, + {"TAMIL SYLLABLE LLLO", 2, {0x0BB4, 0x0BCA}}, + {"TAMIL SYLLABLE LLLOO", 2, {0x0BB4, 0x0BCB}}, + {"TAMIL SYLLABLE LLLU", 2, {0x0BB4, 0x0BC1}}, + {"TAMIL SYLLABLE LLLUU", 2, {0x0BB4, 0x0BC2}}, + {"TAMIL SYLLABLE LLO", 2, {0x0BB3, 0x0BCA}}, + {"TAMIL SYLLABLE LLOO", 2, {0x0BB3, 0x0BCB}}, + {"TAMIL SYLLABLE LLU", 2, {0x0BB3, 0x0BC1}}, + {"TAMIL SYLLABLE LLUU", 2, {0x0BB3, 0x0BC2}}, + {"TAMIL SYLLABLE LO", 2, {0x0BB2, 0x0BCA}}, + {"TAMIL SYLLABLE LOO", 2, {0x0BB2, 0x0BCB}}, + {"TAMIL SYLLABLE LU", 2, {0x0BB2, 0x0BC1}}, + {"TAMIL SYLLABLE LUU", 2, {0x0BB2, 0x0BC2}}, + {"TAMIL SYLLABLE MAA", 2, {0x0BAE, 0x0BBE}}, + {"TAMIL SYLLABLE MAI", 2, {0x0BAE, 0x0BC8}}, + {"TAMIL SYLLABLE MAU", 2, {0x0BAE, 0x0BCC}}, + {"TAMIL SYLLABLE ME", 2, {0x0BAE, 0x0BC6}}, + {"TAMIL SYLLABLE MEE", 2, {0x0BAE, 0x0BC7}}, + {"TAMIL SYLLABLE MI", 2, {0x0BAE, 0x0BBF}}, + {"TAMIL SYLLABLE MII", 2, {0x0BAE, 0x0BC0}}, + {"TAMIL SYLLABLE MO", 2, {0x0BAE, 0x0BCA}}, + {"TAMIL SYLLABLE MOO", 2, {0x0BAE, 0x0BCB}}, + {"TAMIL SYLLABLE MU", 2, {0x0BAE, 0x0BC1}}, + {"TAMIL SYLLABLE MUU", 2, {0x0BAE, 0x0BC2}}, + {"TAMIL SYLLABLE NAA", 2, {0x0BA8, 0x0BBE}}, + {"TAMIL SYLLABLE NAI", 2, {0x0BA8, 0x0BC8}}, + {"TAMIL SYLLABLE NAU", 2, {0x0BA8, 0x0BCC}}, + {"TAMIL SYLLABLE NE", 2, {0x0BA8, 0x0BC6}}, + {"TAMIL SYLLABLE NEE", 2, {0x0BA8, 0x0BC7}}, + {"TAMIL SYLLABLE NGAA", 2, {0x0B99, 0x0BBE}}, + {"TAMIL SYLLABLE NGAI", 2, {0x0B99, 0x0BC8}}, + {"TAMIL SYLLABLE NGAU", 2, {0x0B99, 0x0BCC}}, + {"TAMIL SYLLABLE NGE", 2, {0x0B99, 0x0BC6}}, + {"TAMIL SYLLABLE NGEE", 2, {0x0B99, 0x0BC7}}, + {"TAMIL SYLLABLE NGI", 2, {0x0B99, 0x0BBF}}, + {"TAMIL SYLLABLE NGII", 2, {0x0B99, 0x0BC0}}, + {"TAMIL SYLLABLE NGO", 2, {0x0B99, 0x0BCA}}, + {"TAMIL SYLLABLE NGOO", 2, {0x0B99, 0x0BCB}}, + {"TAMIL SYLLABLE NGU", 2, {0x0B99, 0x0BC1}}, + {"TAMIL SYLLABLE NGUU", 2, {0x0B99, 0x0BC2}}, + {"TAMIL SYLLABLE NI", 2, {0x0BA8, 0x0BBF}}, + {"TAMIL SYLLABLE NII", 2, {0x0BA8, 0x0BC0}}, + {"TAMIL SYLLABLE NNAA", 2, {0x0BA3, 
0x0BBE}}, + {"TAMIL SYLLABLE NNAI", 2, {0x0BA3, 0x0BC8}}, + {"TAMIL SYLLABLE NNAU", 2, {0x0BA3, 0x0BCC}}, + {"TAMIL SYLLABLE NNE", 2, {0x0BA3, 0x0BC6}}, + {"TAMIL SYLLABLE NNEE", 2, {0x0BA3, 0x0BC7}}, + {"TAMIL SYLLABLE NNI", 2, {0x0BA3, 0x0BBF}}, + {"TAMIL SYLLABLE NNII", 2, {0x0BA3, 0x0BC0}}, + {"TAMIL SYLLABLE NNNAA", 2, {0x0BA9, 0x0BBE}}, + {"TAMIL SYLLABLE NNNAI", 2, {0x0BA9, 0x0BC8}}, + {"TAMIL SYLLABLE NNNAU", 2, {0x0BA9, 0x0BCC}}, + {"TAMIL SYLLABLE NNNE", 2, {0x0BA9, 0x0BC6}}, + {"TAMIL SYLLABLE NNNEE", 2, {0x0BA9, 0x0BC7}}, + {"TAMIL SYLLABLE NNNI", 2, {0x0BA9, 0x0BBF}}, + {"TAMIL SYLLABLE NNNII", 2, {0x0BA9, 0x0BC0}}, + {"TAMIL SYLLABLE NNNO", 2, {0x0BA9, 0x0BCA}}, + {"TAMIL SYLLABLE NNNOO", 2, {0x0BA9, 0x0BCB}}, + {"TAMIL SYLLABLE NNNU", 2, {0x0BA9, 0x0BC1}}, + {"TAMIL SYLLABLE NNNUU", 2, {0x0BA9, 0x0BC2}}, + {"TAMIL SYLLABLE NNO", 2, {0x0BA3, 0x0BCA}}, + {"TAMIL SYLLABLE NNOO", 2, {0x0BA3, 0x0BCB}}, + {"TAMIL SYLLABLE NNU", 2, {0x0BA3, 0x0BC1}}, + {"TAMIL SYLLABLE NNUU", 2, {0x0BA3, 0x0BC2}}, + {"TAMIL SYLLABLE NO", 2, {0x0BA8, 0x0BCA}}, + {"TAMIL SYLLABLE NOO", 2, {0x0BA8, 0x0BCB}}, + {"TAMIL SYLLABLE NU", 2, {0x0BA8, 0x0BC1}}, + {"TAMIL SYLLABLE NUU", 2, {0x0BA8, 0x0BC2}}, + {"TAMIL SYLLABLE NYAA", 2, {0x0B9E, 0x0BBE}}, + {"TAMIL SYLLABLE NYAI", 2, {0x0B9E, 0x0BC8}}, + {"TAMIL SYLLABLE NYAU", 2, {0x0B9E, 0x0BCC}}, + {"TAMIL SYLLABLE NYE", 2, {0x0B9E, 0x0BC6}}, + {"TAMIL SYLLABLE NYEE", 2, {0x0B9E, 0x0BC7}}, + {"TAMIL SYLLABLE NYI", 2, {0x0B9E, 0x0BBF}}, + {"TAMIL SYLLABLE NYII", 2, {0x0B9E, 0x0BC0}}, + {"TAMIL SYLLABLE NYO", 2, {0x0B9E, 0x0BCA}}, + {"TAMIL SYLLABLE NYOO", 2, {0x0B9E, 0x0BCB}}, + {"TAMIL SYLLABLE NYU", 2, {0x0B9E, 0x0BC1}}, + {"TAMIL SYLLABLE NYUU", 2, {0x0B9E, 0x0BC2}}, + {"TAMIL SYLLABLE PAA", 2, {0x0BAA, 0x0BBE}}, + {"TAMIL SYLLABLE PAI", 2, {0x0BAA, 0x0BC8}}, + {"TAMIL SYLLABLE PAU", 2, {0x0BAA, 0x0BCC}}, + {"TAMIL SYLLABLE PE", 2, {0x0BAA, 0x0BC6}}, + {"TAMIL SYLLABLE PEE", 2, {0x0BAA, 0x0BC7}}, + {"TAMIL SYLLABLE PI", 2, 
{0x0BAA, 0x0BBF}}, + {"TAMIL SYLLABLE PII", 2, {0x0BAA, 0x0BC0}}, + {"TAMIL SYLLABLE PO", 2, {0x0BAA, 0x0BCA}}, + {"TAMIL SYLLABLE POO", 2, {0x0BAA, 0x0BCB}}, + {"TAMIL SYLLABLE PU", 2, {0x0BAA, 0x0BC1}}, + {"TAMIL SYLLABLE PUU", 2, {0x0BAA, 0x0BC2}}, + {"TAMIL SYLLABLE RAA", 2, {0x0BB0, 0x0BBE}}, + {"TAMIL SYLLABLE RAI", 2, {0x0BB0, 0x0BC8}}, + {"TAMIL SYLLABLE RAU", 2, {0x0BB0, 0x0BCC}}, + {"TAMIL SYLLABLE RE", 2, {0x0BB0, 0x0BC6}}, + {"TAMIL SYLLABLE REE", 2, {0x0BB0, 0x0BC7}}, + {"TAMIL SYLLABLE RI", 2, {0x0BB0, 0x0BBF}}, + {"TAMIL SYLLABLE RII", 2, {0x0BB0, 0x0BC0}}, + {"TAMIL SYLLABLE RO", 2, {0x0BB0, 0x0BCA}}, + {"TAMIL SYLLABLE ROO", 2, {0x0BB0, 0x0BCB}}, + {"TAMIL SYLLABLE RRAA", 2, {0x0BB1, 0x0BBE}}, + {"TAMIL SYLLABLE RRAI", 2, {0x0BB1, 0x0BC8}}, + {"TAMIL SYLLABLE RRAU", 2, {0x0BB1, 0x0BCC}}, + {"TAMIL SYLLABLE RRE", 2, {0x0BB1, 0x0BC6}}, + {"TAMIL SYLLABLE RREE", 2, {0x0BB1, 0x0BC7}}, + {"TAMIL SYLLABLE RRI", 2, {0x0BB1, 0x0BBF}}, + {"TAMIL SYLLABLE RRII", 2, {0x0BB1, 0x0BC0}}, + {"TAMIL SYLLABLE RRO", 2, {0x0BB1, 0x0BCA}}, + {"TAMIL SYLLABLE RROO", 2, {0x0BB1, 0x0BCB}}, + {"TAMIL SYLLABLE RRU", 2, {0x0BB1, 0x0BC1}}, + {"TAMIL SYLLABLE RRUU", 2, {0x0BB1, 0x0BC2}}, + {"TAMIL SYLLABLE RU", 2, {0x0BB0, 0x0BC1}}, + {"TAMIL SYLLABLE RUU", 2, {0x0BB0, 0x0BC2}}, + {"TAMIL SYLLABLE SAA", 2, {0x0BB8, 0x0BBE}}, + {"TAMIL SYLLABLE SAI", 2, {0x0BB8, 0x0BC8}}, + {"TAMIL SYLLABLE SAU", 2, {0x0BB8, 0x0BCC}}, + {"TAMIL SYLLABLE SE", 2, {0x0BB8, 0x0BC6}}, + {"TAMIL SYLLABLE SEE", 2, {0x0BB8, 0x0BC7}}, + {"TAMIL SYLLABLE SHAA", 2, {0x0BB6, 0x0BBE}}, + {"TAMIL SYLLABLE SHAI", 2, {0x0BB6, 0x0BC8}}, + {"TAMIL SYLLABLE SHAU", 2, {0x0BB6, 0x0BCC}}, + {"TAMIL SYLLABLE SHE", 2, {0x0BB6, 0x0BC6}}, + {"TAMIL SYLLABLE SHEE", 2, {0x0BB6, 0x0BC7}}, + {"TAMIL SYLLABLE SHI", 2, {0x0BB6, 0x0BBF}}, + {"TAMIL SYLLABLE SHII", 2, {0x0BB6, 0x0BC0}}, + {"TAMIL SYLLABLE SHO", 2, {0x0BB6, 0x0BCA}}, + {"TAMIL SYLLABLE SHOO", 2, {0x0BB6, 0x0BCB}}, + {"TAMIL SYLLABLE SHRII", 4, {0x0BB6, 0x0BCD, 
0x0BB0, 0x0BC0}}, + {"TAMIL SYLLABLE SHU", 2, {0x0BB6, 0x0BC1}}, + {"TAMIL SYLLABLE SHUU", 2, {0x0BB6, 0x0BC2}}, + {"TAMIL SYLLABLE SI", 2, {0x0BB8, 0x0BBF}}, + {"TAMIL SYLLABLE SII", 2, {0x0BB8, 0x0BC0}}, + {"TAMIL SYLLABLE SO", 2, {0x0BB8, 0x0BCA}}, + {"TAMIL SYLLABLE SOO", 2, {0x0BB8, 0x0BCB}}, + {"TAMIL SYLLABLE SSAA", 2, {0x0BB7, 0x0BBE}}, + {"TAMIL SYLLABLE SSAI", 2, {0x0BB7, 0x0BC8}}, + {"TAMIL SYLLABLE SSAU", 2, {0x0BB7, 0x0BCC}}, + {"TAMIL SYLLABLE SSE", 2, {0x0BB7, 0x0BC6}}, + {"TAMIL SYLLABLE SSEE", 2, {0x0BB7, 0x0BC7}}, + {"TAMIL SYLLABLE SSI", 2, {0x0BB7, 0x0BBF}}, + {"TAMIL SYLLABLE SSII", 2, {0x0BB7, 0x0BC0}}, + {"TAMIL SYLLABLE SSO", 2, {0x0BB7, 0x0BCA}}, + {"TAMIL SYLLABLE SSOO", 2, {0x0BB7, 0x0BCB}}, + {"TAMIL SYLLABLE SSU", 2, {0x0BB7, 0x0BC1}}, + {"TAMIL SYLLABLE SSUU", 2, {0x0BB7, 0x0BC2}}, + {"TAMIL SYLLABLE SU", 2, {0x0BB8, 0x0BC1}}, + {"TAMIL SYLLABLE SUU", 2, {0x0BB8, 0x0BC2}}, + {"TAMIL SYLLABLE TAA", 2, {0x0BA4, 0x0BBE}}, + {"TAMIL SYLLABLE TAI", 2, {0x0BA4, 0x0BC8}}, + {"TAMIL SYLLABLE TAU", 2, {0x0BA4, 0x0BCC}}, + {"TAMIL SYLLABLE TE", 2, {0x0BA4, 0x0BC6}}, + {"TAMIL SYLLABLE TEE", 2, {0x0BA4, 0x0BC7}}, + {"TAMIL SYLLABLE TI", 2, {0x0BA4, 0x0BBF}}, + {"TAMIL SYLLABLE TII", 2, {0x0BA4, 0x0BC0}}, + {"TAMIL SYLLABLE TO", 2, {0x0BA4, 0x0BCA}}, + {"TAMIL SYLLABLE TOO", 2, {0x0BA4, 0x0BCB}}, + {"TAMIL SYLLABLE TTAA", 2, {0x0B9F, 0x0BBE}}, + {"TAMIL SYLLABLE TTAI", 2, {0x0B9F, 0x0BC8}}, + {"TAMIL SYLLABLE TTAU", 2, {0x0B9F, 0x0BCC}}, + {"TAMIL SYLLABLE TTE", 2, {0x0B9F, 0x0BC6}}, + {"TAMIL SYLLABLE TTEE", 2, {0x0B9F, 0x0BC7}}, + {"TAMIL SYLLABLE TTI", 2, {0x0B9F, 0x0BBF}}, + {"TAMIL SYLLABLE TTII", 2, {0x0B9F, 0x0BC0}}, + {"TAMIL SYLLABLE TTO", 2, {0x0B9F, 0x0BCA}}, + {"TAMIL SYLLABLE TTOO", 2, {0x0B9F, 0x0BCB}}, + {"TAMIL SYLLABLE TTU", 2, {0x0B9F, 0x0BC1}}, + {"TAMIL SYLLABLE TTUU", 2, {0x0B9F, 0x0BC2}}, + {"TAMIL SYLLABLE TU", 2, {0x0BA4, 0x0BC1}}, + {"TAMIL SYLLABLE TUU", 2, {0x0BA4, 0x0BC2}}, + {"TAMIL SYLLABLE VAA", 2, {0x0BB5, 0x0BBE}}, 
+ {"TAMIL SYLLABLE VAI", 2, {0x0BB5, 0x0BC8}}, + {"TAMIL SYLLABLE VAU", 2, {0x0BB5, 0x0BCC}}, + {"TAMIL SYLLABLE VE", 2, {0x0BB5, 0x0BC6}}, + {"TAMIL SYLLABLE VEE", 2, {0x0BB5, 0x0BC7}}, + {"TAMIL SYLLABLE VI", 2, {0x0BB5, 0x0BBF}}, + {"TAMIL SYLLABLE VII", 2, {0x0BB5, 0x0BC0}}, + {"TAMIL SYLLABLE VO", 2, {0x0BB5, 0x0BCA}}, + {"TAMIL SYLLABLE VOO", 2, {0x0BB5, 0x0BCB}}, + {"TAMIL SYLLABLE VU", 2, {0x0BB5, 0x0BC1}}, + {"TAMIL SYLLABLE VUU", 2, {0x0BB5, 0x0BC2}}, + {"TAMIL SYLLABLE YAA", 2, {0x0BAF, 0x0BBE}}, + {"TAMIL SYLLABLE YAI", 2, {0x0BAF, 0x0BC8}}, + {"TAMIL SYLLABLE YAU", 2, {0x0BAF, 0x0BCC}}, + {"TAMIL SYLLABLE YE", 2, {0x0BAF, 0x0BC6}}, + {"TAMIL SYLLABLE YEE", 2, {0x0BAF, 0x0BC7}}, + {"TAMIL SYLLABLE YI", 2, {0x0BAF, 0x0BBF}}, + {"TAMIL SYLLABLE YII", 2, {0x0BAF, 0x0BC0}}, + {"TAMIL SYLLABLE YO", 2, {0x0BAF, 0x0BCA}}, + {"TAMIL SYLLABLE YOO", 2, {0x0BAF, 0x0BCB}}, + {"TAMIL SYLLABLE YU", 2, {0x0BAF, 0x0BC1}}, + {"TAMIL SYLLABLE YUU", 2, {0x0BAF, 0x0BC2}}, +}; diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -25,7 +25,12 @@ # written by Fredrik Lundh (fredrik@pythonware.com) # -import sys, os, zipfile +import os +import sys +import zipfile + +from textwrap import dedent +from operator import itemgetter SCRIPT = sys.argv[0] VERSION = "3.2" @@ -39,6 +44,8 @@ DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt" DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt" LINE_BREAK = "LineBreak%s.txt" +NAME_ALIASES = "NameAliases%s.txt" +NAMED_SEQUENCES = "NamedSequences%s.txt" old_versions = ["3.2.0"] @@ -692,6 +699,40 @@ print("/* name->code dictionary */", file=fp) codehash.dump(fp, trace) + print(dedent(""" + typedef struct Alias { + char *name; + int namelen; + int codepoint; + } alias; + """), file=fp) + + print('static const int aliases_count = %d;' % len(unicode.aliases), file=fp) + + print('static const alias name_aliases[] = {', 
file=fp) + for name, codepoint in unicode.aliases: + print(' {"%s", %d, 0x%04X},' % (name, len(name), codepoint), file=fp) + print('};', file=fp) + + # the Py_UCS2 seq[4] should use Py_UCS4 if non-BMP chars are added to the + # sequences and have a higher number of elements if the sequences get longer + print(dedent(""" + typedef struct NamedSequence { + char *name; + int seqlen; + Py_UCS2 seq[4]; + } named_sequence; + """), file=fp) + + print('static const int named_sequences_count = %d;' % len(unicode.named_sequences), + file=fp) + + print('static const named_sequence named_sequences[] = {', file=fp) + for name, sequence in unicode.named_sequences: + seq_str = ', '.join('0x%04X' % cp for cp in sequence) + print(' {"%s", %d, {%s}},' % (name, len(sequence), seq_str), file=fp) + print('};', file=fp) + fp.close() @@ -855,6 +896,31 @@ self.table = table self.chars = list(range(0x110000)) # unicode 3.2 + self.aliases = [] + with open_data(NAME_ALIASES, version) as file: + for s in file: + s = s.strip() + if not s or s.startswith('#'): + continue + char, name = s.split(';') + char = int(char, 16) + self.aliases.append((name, char)) + + self.named_sequences = [] + with open_data(NAMED_SEQUENCES, version) as file: + for s in file: + s = s.strip() + if not s or s.startswith('#'): + continue + name, chars = s.split(';') + chars = tuple(int(char, 16) for char in chars.split()) + # check that the structure defined in makeunicodename is OK + assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size" + assert all(c <= 0xFFFF for c in chars), "use Py_UCS4 instead" + self.named_sequences.append((name, chars)) + # sort names to enable binary search + self.named_sequences.sort(key=itemgetter(0)) + self.exclusions = {} with open_data(COMPOSITION_EXCLUSIONS, version) as file: for s in file: