diff --git a/Doc/library/unicodedata.rst b/Doc/library/unicodedata.rst --- a/Doc/library/unicodedata.rst +++ b/Doc/library/unicodedata.rst @@ -29,6 +29,9 @@ Look up character by name. If a character with the given name is found, return the corresponding character. If not found, :exc:`KeyError` is raised. + .. versionchanged:: 3.3 + Support for name aliases [#]_ and named sequences [#]_ has been added. + .. function:: name(chr[, default]) @@ -160,3 +163,9 @@ >>> unicodedata.bidirectional('\u0660') # 'A'rabic, 'N'umber 'AN' + +.. rubric:: Footnotes + +.. [#] http://www.unicode.org/Public/6.0.0/ucd/NameAliases.txt + +.. [#] http://www.unicode.org/Public/6.0.0/ucd/NamedSequences.txt diff --git a/Doc/reference/lexical_analysis.rst b/Doc/reference/lexical_analysis.rst --- a/Doc/reference/lexical_analysis.rst +++ b/Doc/reference/lexical_analysis.rst @@ -492,13 +492,13 @@ +-----------------+---------------------------------+-------+ | Escape Sequence | Meaning | Notes | +=================+=================================+=======+ -| ``\N{name}`` | Character named *name* in the | | +| ``\N{name}`` | Character named *name* in the | \(4) | | | Unicode database | | +-----------------+---------------------------------+-------+ -| ``\uxxxx`` | Character with 16-bit hex value | \(4) | +| ``\uxxxx`` | Character with 16-bit hex value | \(5) | | | *xxxx* | | +-----------------+---------------------------------+-------+ -| ``\Uxxxxxxxx`` | Character with 32-bit hex value | \(5) | +| ``\Uxxxxxxxx`` | Character with 32-bit hex value | \(6) | | | *xxxxxxxx* | | +-----------------+---------------------------------+-------+ @@ -516,10 +516,14 @@ with the given value. (4) + .. versionchanged:: 3.3 + Support for name aliases [#]_ has been added. + +(5) Individual code units which form parts of a surrogate pair can be encoded using this escape sequence. Exactly four hex digits are required. -(5) +(6) Any Unicode character can be encoded this way, but characters outside the Basic Multilingual Plane (BMP) will be encoded using a surrogate pair if Python is compiled to use 16-bit code units (the default). Exactly eight hex digits @@ -706,3 +710,8 @@ occurrence outside string literals and comments is an unconditional error:: $ ? ` + + +.. rubric:: Footnotes + +.. [#] http://www.unicode.org/Public/6.0.0/ucd/NameAliases.txt diff --git a/Include/ucnhash.h b/Include/ucnhash.h --- a/Include/ucnhash.h +++ b/Include/ucnhash.h @@ -19,11 +19,13 @@ success, zero if not. Does not set Python exceptions. If self is NULL, data come from the default version of the database. If it is not NULL, it should be a unicodedata.ucd_X_Y_Z object */ - int (*getname)(PyObject *self, Py_UCS4 code, char* buffer, int buflen); + int (*getname)(PyObject *self, Py_UCS4 code, char* buffer, int buflen, + int with_alias_and_seq); /* Get character code for a given name. Same error handling as for getname. */ - int (*getcode)(PyObject *self, const char* name, int namelen, Py_UCS4* code); + int (*getcode)(PyObject *self, const char* name, int namelen, Py_UCS4* code, + int with_named_seq); } _PyUnicode_Name_CAPI; diff --git a/Lib/test/test_ucn.py b/Lib/test/test_ucn.py --- a/Lib/test/test_ucn.py +++ b/Lib/test/test_ucn.py @@ -8,8 +8,11 @@ """#" import unittest +import unicodedata from test import support +from http.client import HTTPException +from test.test_normalization import check_version class UnicodeNamesTest(unittest.TestCase): @@ -59,8 +62,6 @@ ) def test_ascii_letters(self): - import unicodedata - for char in "".join(map(chr, range(ord("a"), ord("z")))): name = "LATIN SMALL LETTER %s" % char.upper() code = unicodedata.lookup(name) @@ -81,7 +82,6 @@ self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8") self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3") - import unicodedata self.assertRaises(ValueError, unicodedata.name, "\ud7a4") def test_cjk_unified_ideographs(self): @@ -97,14 +97,11 @@ self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D") def test_bmp_characters(self): - import unicodedata - count = 0 for code in range(0x10000): char = chr(code) name = unicodedata.name(char, None) if name is not None: self.assertEqual(unicodedata.lookup(name), char) - count += 1 def test_misc_symbols(self): self.checkletter("PILCROW SIGN", "\u00b6") @@ -112,8 +109,85 @@ self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F") self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41") + def test_aliases(self): + # Check that the aliases defined in the NameAliases.txt file work. + # This should be updated when new aliases are added or the file + # should be downloaded and parsed instead. See #12753. + aliases = [ + ('LATIN CAPITAL LETTER GHA', 0x01A2), + ('LATIN SMALL LETTER GHA', 0x01A3), + ('KANNADA LETTER LLLA', 0x0CDE), + ('LAO LETTER FO FON', 0x0E9D), + ('LAO LETTER FO FAY', 0x0E9F), + ('LAO LETTER RO', 0x0EA3), + ('LAO LETTER LO', 0x0EA5), + ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0), + ('YI SYLLABLE ITERATION MARK', 0xA015), + ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18), + ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5) + ] + for alias, codepoint in aliases: + self.checkletter(alias, chr(codepoint)) + name = unicodedata.name(chr(codepoint)) + self.assertNotEqual(name, alias) + self.assertEqual(unicodedata.lookup(alias), + unicodedata.lookup(name)) + with self.assertRaises(KeyError): + unicodedata.ucd_3_2_0.lookup(alias) + + def test_aliases_names_in_pua_range(self): + # We are storing aliases in the PUA 15, but their names shouldn't leak + for cp in range(0xf0000, 0xf0100): + with self.assertRaises(ValueError) as cm: + unicodedata.name(chr(cp)) + self.assertEqual(str(cm.exception), 'no such name') + + def test_named_sequences_names_in_pua_range(self): + # We are storing named seq in the PUA 15, but their names shouldn't leak + for cp in range(0xf0100, 0xf0fff): + with self.assertRaises(ValueError) as cm: + unicodedata.name(chr(cp)) + self.assertEqual(str(cm.exception), 'no such name') + + def test_named_sequences_sample(self): + # Check a few named sequences. See #12753. + sequences = [ + ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'), + ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'), + ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'), + ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'), + ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'), + ] + for seqname, codepoints in sequences: + self.assertEqual(unicodedata.lookup(seqname), codepoints) + with self.assertRaises(SyntaxError): + self.checkletter(seqname, None) + with self.assertRaises(KeyError): + unicodedata.ucd_3_2_0.lookup(seqname) + + def test_named_sequences_full(self): + # Check all the named sequences + url = ("http://www.unicode.org/Public/%s/ucd/NamedSequences.txt" % + unicodedata.unidata_version) + try: + testdata = support.open_urlresource(url, encoding="utf-8", + check=check_version) + except (IOError, HTTPException): + self.skipTest("Could not retrieve " + url) + self.addCleanup(testdata.close) + for line in testdata: + line = line.strip() + if not line or line.startswith('#'): + continue + seqname, codepoints = line.split(';') + codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split()) + self.assertEqual(unicodedata.lookup(seqname), codepoints) + with self.assertRaises(SyntaxError): + self.checkletter(seqname, None) + with self.assertRaises(KeyError): + unicodedata.ucd_3_2_0.lookup(seqname) + def test_errors(self): - import unicodedata self.assertRaises(TypeError, unicodedata.name) self.assertRaises(TypeError, unicodedata.name, 'xx') self.assertRaises(TypeError, unicodedata.lookup) diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -926,9 +926,18 @@ (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */ } + +#define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end)) +#define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \ + (cp < named_sequences_end)) + static int -_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen) +_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen, + int with_alias_and_seq) { + /* Find the name associated with the given codepoint. + * If with_alias_and_seq is 1, check for names in the Private Use Area 15 + * that we are using for aliases and named sequences. */ int offset; int i; int word; @@ -937,7 +946,14 @@ if (code >= 0x110000) return 0; + // XXX should we just skip all the codepoints in the PUAs here? + if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code))) + return 0; + if (self && UCD_Check(self)) { + // in 3.2.0 there are no aliases and named sequences + if (IS_ALIAS(code) || IS_NAMED_SEQ(code)) + return 0; const change_record *old = get_old_record(self, code); if (old->category_changed == 0) { /* unassigned */ @@ -1022,7 +1038,7 @@ /* check if code corresponds to the given name */ int i; char buffer[NAME_MAXLEN]; - if (!_getucname(self, code, buffer, sizeof(buffer))) + if (!_getucname(self, code, buffer, sizeof(buffer), 1)) return 0; for (i = 0; i < namelen; i++) { if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i]) @@ -1052,8 +1068,28 @@ } static int -_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code) +_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq) { + // check if named sequences are allowed + if (!with_named_seq && IS_NAMED_SEQ(cp)) + return 0; + // if the codepoint is in the PUA range that we use for aliases, + // convert it to obtain the right codepoint + if (IS_ALIAS(cp)) + *code = name_aliases[cp-aliases_start]; + else + *code = cp; + return 1; +} + +static int +_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code, + int with_named_seq) +{ + /* Return the codepoint associated with the given name. + * Named aliases are resolved too (unless self != NULL (i.e. we are using + * 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are + * using for the named sequence, and the caller must then convert it. */ unsigned int h, v; unsigned int mask = code_size-1; unsigned int i, incr; @@ -1109,10 +1145,8 @@ v = code_hash[i]; if (!v) return 0; - if (_cmpname(self, v, name, namelen)) { - *code = v; - return 1; - } + if (_cmpname(self, v, name, namelen)) + return _check_alias_and_seq(v, code, with_named_seq); incr = (h ^ (h >> 3)) & mask; if (!incr) incr = mask; @@ -1121,10 +1155,8 @@ v = code_hash[i]; if (!v) return 0; - if (_cmpname(self, v, name, namelen)) { - *code = v; - return 1; - } + if (_cmpname(self, v, name, namelen)) + return _check_alias_and_seq(v, code, with_named_seq); incr = incr << 1; if (incr > mask) incr = incr ^ code_poly; @@ -1162,7 +1194,7 @@ if (c == (Py_UCS4)-1) return NULL; - if (!_getucname(self, c, name, sizeof(name))) { + if (!_getucname(self, c, name, sizeof(name), 0)) { if (defobj == NULL) { PyErr_SetString(PyExc_ValueError, "no such name"); return NULL; @@ -1190,15 +1222,22 @@ char* name; int namelen; + unsigned int index; if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen)) return NULL; - if (!_getcode(self, name, namelen, &code)) { - PyErr_Format(PyExc_KeyError, "undefined character name '%s'", - name); + if (!_getcode(self, name, namelen, &code, 1)) { + PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name); return NULL; } - + // check if code is in the PUA range that we use for named sequences + // and convert it + if (IS_NAMED_SEQ(code)) { + index = code-named_sequences_start; + return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, + named_sequences[index].seq, + named_sequences[index].seqlen); + } return PyUnicode_FromOrdinal(code); } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -5809,7 +5809,7 @@ message = "unknown Unicode character name"; s++; if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), - &chr)) + &chr, 0)) goto store; } } diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -25,7 +25,12 @@ # written by Fredrik Lundh (fredrik@pythonware.com) # -import sys, os, zipfile +import os +import sys +import zipfile + +from textwrap import dedent +from operator import itemgetter SCRIPT = sys.argv[0] VERSION = "3.2" @@ -39,6 +44,16 @@ DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt" DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt" LINE_BREAK = "LineBreak%s.txt" +NAME_ALIASES = "NameAliases%s.txt" +NAMED_SEQUENCES = "NamedSequences%s.txt" + +# Private Use Areas -- in planes 1, 15, 16 +PUA_1 = range(0xE000, 0xF900) +PUA_15 = range(0xF0000, 0xFFFFE) +PUA_16 = range(0x100000, 0x10FFFE) + +NAME_ALIASES_START = 0xF0000 +NAMED_SEQUENCES_START = 0xF00FF old_versions = ["3.2.0"] @@ -692,6 +707,39 @@ print("/* name->code dictionary */", file=fp) codehash.dump(fp, trace) + print(file=fp) + print('static const unsigned int aliases_start = %#x;' % + NAME_ALIASES_START, file=fp) + print('static const unsigned int aliases_end = %#x;' % + (NAME_ALIASES_START + len(unicode.aliases)), file=fp) + + print('static const unsigned int name_aliases[] = {', file=fp) + for name, codepoint in unicode.aliases: + print(' 0x%04X,' % codepoint, file=fp) + print('};', file=fp) + + # In Unicode 6.0.0, the sequences contain at most 4 BMP chars, + # so we are using Py_UCS2 seq[4]. This needs to be updated if longer + # sequences or sequences with non-BMP chars are added. + # unicodedata_lookup should be adapted too. + print(dedent(""" + typedef struct NamedSequence { + int seqlen; + Py_UCS2 seq[4]; + } named_sequence; + """), file=fp) + + print('static const unsigned int named_sequences_start = %#x;' % + NAMED_SEQUENCES_START, file=fp) + print('static const unsigned int named_sequences_end = %#x;' % + (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp) + + print('static const named_sequence named_sequences[] = {', file=fp) + for name, sequence in unicode.named_sequences: + seq_str = ', '.join('0x%04X' % cp for cp in sequence) + print(' {%d, {%s}},' % (len(sequence), seq_str), file=fp) + print('};', file=fp) + fp.close() @@ -726,7 +774,11 @@ for k in range(len(old.table[i])): if old.table[i][k] != new.table[i][k]: value = old.table[i][k] - if k == 2: + if k == 1 and i in PUA_15: + # the name is not set in the old.table, but in the + # new.table we are using it for aliases and named seq + assert value == '' + elif k == 2: #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k] category_changes[i] = CATEGORY_NAMES.index(value) elif k == 4: @@ -855,6 +907,49 @@ self.table = table self.chars = list(range(0x110000)) # unicode 3.2 + # check for name aliases and named sequences, see #12753 + # aliases and named sequences are not in 3.2.0 + if version != '3.2.0': + self.aliases = [] + # store aliases in the Private Use Area 1, in range U+E000..U+E0FF, + # in order to take advantage of the compression and lookup + # algorithms used for the other characters + pua_index = NAME_ALIASES_START + with open_data(NAME_ALIASES, version) as file: + for s in file: + s = s.strip() + if not s or s.startswith('#'): + continue + char, name = s.split(';') + char = int(char, 16) + self.aliases.append((name, char)) + # also store the name in the PUA 1 + self.table[pua_index][1] = name + pua_index += 1 + assert pua_index - NAME_ALIASES_START == len(self.aliases) + + self.named_sequences = [] + # store named seqences in the PUA 1, in range U+E100..U+E8FF, + # in order to take advantage of the compression and lookup + # algorithms used for the other characters + pua_index = NAMED_SEQUENCES_START + with open_data(NAMED_SEQUENCES, version) as file: + for s in file: + s = s.strip() + if not s or s.startswith('#'): + continue + name, chars = s.split(';') + chars = tuple(int(char, 16) for char in chars.split()) + # check that the structure defined in makeunicodename is OK + assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size" + assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in " + "the NamedSequence struct and in unicodedata_lookup") + self.named_sequences.append((name, chars)) + # also store these in the PUA 1 + self.table[pua_index][1] = name + pua_index += 1 + assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences) + self.exclusions = {} with open_data(COMPOSITION_EXCLUSIONS, version) as file: for s in file: