--- Tools/unicode/makeunicodedata.py.old	2007-06-10 00:22:08.000000000 +0300
+++ Tools/unicode/makeunicodedata.py	2007-06-10 00:55:41.000000000 +0300
@@ -34,6 +34,7 @@
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
+DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
 
 old_versions = ["3.2.0"]
 
@@ -65,7 +66,8 @@
     version = ""
     unicode = UnicodeData(UNICODE_DATA % version,
                           COMPOSITION_EXCLUSIONS % version,
-                          EASTASIAN_WIDTH % version)
+                          EASTASIAN_WIDTH % version,
+                          DERIVEDNORMALIZATION_PROPS % version)
 
     print len(filter(None, unicode.table)), "characters"
 
@@ -86,7 +88,7 @@
 
 def makeunicodedata(unicode, trace):
 
-    dummy = (0, 0, 0, 0, 0)
+    dummy = (0, 0, 0, 0, 0, 0)
     table = [dummy]
     cache = {0: dummy}
     index = [0] * len(unicode.chars)
@@ -106,8 +108,10 @@
             bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
             mirrored = record[9] == "Y"
             eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
+            normalizationquickcheck = record[16]
             item = (
-                category, combining, bidirectional, mirrored, eastasianwidth
+                category, combining, bidirectional, mirrored, eastasianwidth,
+                normalizationquickcheck
                 )
             # add entry to index and item tables
             i = cache.get(item)
@@ -221,7 +225,7 @@
     print >>fp, \
           "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
     for item in table:
-        print >>fp, "    {%d, %d, %d, %d, %d}," % item
+        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
     print >>fp, "};"
     print >>fp
 
@@ -679,7 +683,8 @@
 
 class UnicodeData:
 
-    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
+    def __init__(self, filename, exclusions, eastasianwidth,
+                 derivednormalizationprops=None, expand=1):
         self.changed = []
         file = open(filename)
         table = [None] * 0x110000
@@ -742,6 +747,26 @@
         for i in range(0, 0x110000):
             if table[i] is not None:
                 table[i].append(widths[i])
+        if derivednormalizationprops:
+            quickchecks = [0] * 0x110000 # default is Yes
+            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
+            for s in open(derivednormalizationprops):
+                if '#' in s: s = s[:s.index('#')]
+                s = [i.strip() for i in s.split(';')]
+                if len(s) < 2 or s[1] not in qc_order:
+                    continue
+                quickcheck = 'MN'.index(s[2])+1 # Maybe or No
+                quickcheck_shift = qc_order.index(s[1])*2
+                quickcheck <<= quickcheck_shift
+                if '..' not in s[0]:
+                    s[0] = '%s..%s' % (s[0], s[0])
+                first, last = [int(c, 16) for c in s[0].split('..')]
+                for char in range(first, last+1):
+                    assert not (quickchecks[char] >> quickcheck_shift) & 3
+                    quickchecks[char] |= quickcheck
+            for i in range(0, 0x110000):
+                if table[i] is not None:
+                    table[i].append(quickchecks[i])
 
     def uselatin1(self):
         # restrict character range to ISO Latin 1
--- Modules/unicodedata.c.old	2007-06-10 00:28:27.000000000 +0300
+++ Modules/unicodedata.c	2007-06-10 01:37:38.000000000 +0300
@@ -27,6 +27,7 @@
     const unsigned char mirrored;       /* true if mirrored in bidir mode */
     const unsigned char east_asian_width;       /* index into
                                                    _PyUnicode_EastAsianWidth */
+    const unsigned char normalization_quick_check; /* see is_normalized() */
 } _PyUnicode_DatabaseRecord;
 
 typedef struct change_record {
@@ -714,6 +715,33 @@
     PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
     return result;
 }
+
+/* Return 1 if the input is certainly normalized, 0 if it might not be. */
+static int
+is_normalized(PyObject *self, PyObject *input, int nfc, int k)
+{
+    Py_UNICODE *i, *end;
+    unsigned char prev_combining = 0;
+
+    /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
+       as described in http://unicode.org/reports/tr15/#Annex8. */
+    unsigned char quickcheck_shift = ((nfc ? 2 : 0) + (k ? 1 : 0)) * 2;
+
+    i = PyUnicode_AS_UNICODE(input);
+    end = i + PyUnicode_GET_SIZE(input);
+    while (i < end) {
+        const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
+        unsigned char combining = record->combining;
+        unsigned char quickcheck = record->normalization_quick_check;
+
+        if ((quickcheck >> quickcheck_shift) & 3)
+            return 0; /* this string might need normalization */
+        if (combining && prev_combining > combining)
+            return 0; /* non-canonical sort order, not normalized */
+        prev_combining = combining;
+    }
+    return 1; /* certainly normalized */
+}
 
 PyDoc_STRVAR(unicodedata_normalize__doc__,
 "normalize(form, unistr)\n\
@@ -738,14 +766,34 @@
         return input;
     }
 
-    if (strcmp(form, "NFC") == 0)
+    if (strcmp(form, "NFC") == 0) {
+        if (is_normalized(self, input, 1, 0)) {
+            Py_INCREF(input);
+            return input;
+        }
         return nfc_nfkc(self, input, 0);
-    if (strcmp(form, "NFKC") == 0)
+    }
+    if (strcmp(form, "NFKC") == 0) {
+        if (is_normalized(self, input, 1, 1)) {
+            Py_INCREF(input);
+            return input;
+        }
         return nfc_nfkc(self, input, 1);
-    if (strcmp(form, "NFD") == 0)
+    }
+    if (strcmp(form, "NFD") == 0) {
+        if (is_normalized(self, input, 0, 0)) {
+            Py_INCREF(input);
+            return input;
+        }
         return nfd_nfkd(self, input, 0);
-    if (strcmp(form, "NFKD") == 0)
+    }
+    if (strcmp(form, "NFKD") == 0) {
+        if (is_normalized(self, input, 0, 1)) {
+            Py_INCREF(input);
+            return input;
+        }
         return nfd_nfkd(self, input, 1);
+    }
     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
     return NULL;
 }
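
For reference, one way to exercise the fast path once the patch is applied (a
hypothetical interactive check, not part of the patch itself; the quick check
only applies to strings whose characters all have quickcheck value "Yes" and
are in canonical combining-class order):

    import unicodedata

    s = u"D\u00fcsseldorf"              # already in NFC form
    t = unicodedata.normalize("NFC", s)
    # With the quick check, normalize() returns the input object itself
    # instead of rebuilding an equal string, so this should print True.
    print t is s
    # Strings that are not in the requested form still take the slow path
    # and are normalized as before.
    print unicodedata.normalize("NFD", u"\u00e9") == u"e\u0301"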