diff -r 068365acbe73 -r c35b7913e4f6 Doc/library/unicodedata.rst --- a/Doc/library/unicodedata.rst Wed Mar 25 21:03:47 2015 +0200 +++ b/Doc/library/unicodedata.rst Thu Mar 26 22:06:51 2015 +0000 @@ -131,6 +131,42 @@ a human reader, if one has combining characters and the other doesn't, they may not compare equal. + +.. function:: quick_check(form, chr) + + Return the 'Quick_Check' property value for the normal form *form* for the + Unicode character *chr*. Valid values for form are 'NFC', 'NFKC', 'NFD', and + 'NFKD'. The return value is a string: 'Yes', 'No', or 'Maybe'. Return 'Yes' if + the 'Quick_Check' property is not defined for the character. + + The 'Quick_Check' property is useful when joining (or appending to) strings + that have already been normalized to a particular normalization form. If *s1* + and *s2* are two strings that are both normalized to a given normalization form + *form*, then the concatenation *s1* + *s2* is **not** guaranteed to be + normalized to that normalization form; there might be a non-normalized + subsequence of characters straddling the point where the strings were joined. + We might pass the concatenation *s1* + *s2* to ``normalize()``, but if *s1* or + *s2* is long then this implies some wasted effort since a large amount of text + will be processed a second time for no gain (having already been processed when + *s1* or *s2* was normalized). + + However, it is possible to avoid most of this additional work by re-processing + only a small part of each string: characters at the end of *s1* and characters + at the start of *s2*. Specifically, let *x* be the index of the last character + in *s1* that satisfies both ``quick_check(form, chr) == 'Yes'`` and + ``combining(chr) == 0`` (or zero if no character in *s1* satisfies these + conditions), and let *y* be the index of the first character in *s2* that + satisfies the same conditions (or the length of *s2* if no character in *s2* + satisfies the conditions). 
In this case, the string + ``s1[:x] + normalize(form, s1[x:] + s2[:y]) + s2[y:]`` is guaranteed to be + normalized to *form*. + + For more information, please refer to `section 9 of Unicode Standard Annex #15 + ("Unicode Normalization Forms") + <http://unicode.org/reports/tr15/#Detecting_Normalization_Forms>`_. + + .. versionadded:: 3.5 + In addition, the module exposes the following constant: diff -r 068365acbe73 -r c35b7913e4f6 Doc/whatsnew/3.5.rst --- a/Doc/whatsnew/3.5.rst Wed Mar 25 21:03:47 2015 +0200 +++ b/Doc/whatsnew/3.5.rst Thu Mar 26 22:06:51 2015 +0000 @@ -419,6 +419,12 @@ * The :func:`time.monotonic` function is now always available. (Contributed by Victor Stinner in :issue:`22043`.) +unicodedata +----------- + +* The :func:`unicodedata.quick_check` function was added. (Contributed by Philip Eve in + :issue:`23550`.) + urllib ------ diff -r 068365acbe73 -r c35b7913e4f6 Lib/test/test_unicodedata.py --- a/Lib/test/test_unicodedata.py Wed Mar 25 21:03:47 2015 +0200 +++ b/Lib/test/test_unicodedata.py Thu Mar 26 22:06:51 2015 +0000 @@ -223,6 +223,33 @@ self.assertEqual(eaw('\u2010'), 'A') self.assertEqual(eaw('\U00020000'), 'W') + def test_quick_check(self): + qc = self.db.quick_check + def qc_all(char): + return (qc('NFC', char), qc('NFKC', char), + qc('NFD', char), qc('NFKD', char)) + self.assertEqual(qc_all('s'), ('Yes', 'Yes', 'Yes', 'Yes')) + self.assertEqual(qc_all('7'), ('Yes', 'Yes', 'Yes', 'Yes')) + self.assertEqual(qc_all(' '), ('Yes', 'Yes', 'Yes', 'Yes')) + self.assertEqual(qc_all('\uFFFE'), ('Yes', 'Yes', 'Yes', 'Yes')) + self.assertEqual(qc_all('\u01C7'), ('Yes', 'No', 'Yes', 'No')) + self.assertEqual(qc_all('\u01F5'), ('Yes', 'Yes', 'No', 'No')) + self.assertEqual(qc_all('\u0308'), ('Maybe', 'Maybe', 'Yes', 'Yes')) + self.assertEqual(qc_all('\u0387'), ('No', 'No', 'No', 'No')) + self.assertEqual(qc_all('\u1FDD'), ('Yes', 'No', 'No', 'No')) + self.assertEqual(qc_all('\U0001133E'), + ('Maybe', 'Maybe', 'Yes', 'Yes')) + with self.assertRaises(TypeError): + qc() + with self.assertRaises(TypeError): + qc('NFD') + with 
self.assertRaises(TypeError): + qc('NFD', '') + with self.assertRaises(TypeError): + qc('NFKC', 'xx') + with self.assertRaises(ValueError): + qc('unknown', 's') + class UnicodeMiscTest(UnicodeDatabaseTest): def test_failed_import_during_compiling(self): diff -r 068365acbe73 -r c35b7913e4f6 Modules/unicodedata.c --- a/Modules/unicodedata.c Wed Mar 25 21:03:47 2015 +0200 +++ b/Modules/unicodedata.c Thu Mar 26 22:06:51 2015 +0000 @@ -36,7 +36,8 @@ const unsigned char mirrored; /* true if mirrored in bidir mode */ const unsigned char east_asian_width; /* index into _PyUnicode_EastAsianWidth */ - const unsigned char normalization_quick_check; /* see is_normalized() */ + const unsigned char normalization_quick_check; /* used for quick_check() + and normalize() */ } _PyUnicode_DatabaseRecord; typedef struct change_record { @@ -48,6 +49,8 @@ const double numeric_changed; } change_record; +typedef enum {NFC, NFKC, NFD, NFKD, Invalid_NF} normalization_form; + /* data file generated by Tools/unicode/makeunicodedata.py */ #include "unicodedata_db.h" @@ -65,6 +68,20 @@ return &_PyUnicode_Database_Records[index]; } +static normalization_form str_to_nf(const char *form) +{ + if (strcmp(form, "NFC") == 0) + return NFC; + else if (strcmp(form, "NFKC") == 0) + return NFKC; + else if (strcmp(form, "NFD") == 0) + return NFD; + else if (strcmp(form, "NFKD") == 0) + return NFKD; + else + return Invalid_NF; +} + /* ------------- Previous-version API ------------------------------------- */ typedef struct previous_version { PyObject_HEAD @@ -884,36 +901,122 @@ return input; } - if (strcmp(form, "NFC") == 0) { - if (is_normalized(self, input, 1, 0)) { - Py_INCREF(input); - return input; - } - return nfc_nfkc(self, input, 0); + switch(str_to_nf(form)) { + case NFC: + if (is_normalized(self, input, 1, 0)) { + Py_INCREF(input); + return input; + } + return nfc_nfkc(self, input, 0); + case NFKC: + if (is_normalized(self, input, 1, 1)) { + Py_INCREF(input); + return input; + } + return 
nfc_nfkc(self, input, 1); + case NFD: + if (is_normalized(self, input, 0, 0)) { + Py_INCREF(input); + return input; + } + return nfd_nfkd(self, input, 0); + case NFKD: + if (is_normalized(self, input, 0, 1)) { + Py_INCREF(input); + return input; + } + return nfd_nfkd(self, input, 1); + default: + PyErr_SetString( + PyExc_ValueError, + "invalid normalization form - valid normalization forms are " + "\"NFC\", \"NFKC\", \"NFD\" and \"NFKD\"" + ); + return NULL; } - if (strcmp(form, "NFKC") == 0) { - if (is_normalized(self, input, 1, 1)) { - Py_INCREF(input); - return input; - } - return nfc_nfkc(self, input, 1); +} + +PyDoc_STRVAR(unicodedata_quick_check__doc__, +"quick_check(form, unichr)\n\ +\n\ +Return the 'Quick_Check' property value for the normal form 'form'\n\ +for the Unicode character unichr. Valid values for form are 'NFC',\n\ +'NFKC', 'NFD', and 'NFKD'. The return value is a string: 'Yes',\n\ +'No', or 'Maybe'. Return 'Yes' if the 'Quick_Check' property is not\n\ +defined for the character.\n\ +\n\ +The 'Quick_Check' property is useful when joining (or appending to)\n\ +strings that have already been normalized to a particular\n\ +normalization form. If s1 and s2 are two strings that are both\n\ +normalized to a given normalization form 'form', then the\n\ +concatenation s1 + s2 is *not* guaranteed to be normalized to that\n\ +normalization form; there might be a non-normalized subsequence of\n\ +characters straddling the point where the strings were joined. We\n\ +might pass the concatenation s1 + s2 to normalize(), but if s1 or s2\n\ +is long then this implies some wasted effort since a large amount of\n\ +text will be processed a second time for no gain (having already\n\ +been processed when s1 or s2 was normalized).\n\ +\n\ +However, it is possible to avoid most of this additional work by\n\ +re-processing only a small part of each string: characters at the\n\ +end of s1 and characters at the start of s2. 
Specifically, let x be\n\ +the index of the last character in s1 that satisfies\n\ +\n\ + quick_check(form, chr) == 'Yes' and combining(chr) == 0\n\ +\n\ +(or zero if no character in s1 satisfies these conditions), and let\n\ +y be the index of the first character in s2 that satisfies the same\n\ +condition (or the length of s2 if no character in s2 satisfies the\n\ +condition). In this case, the string\n\ +\n\ + s1[:x] + normalize(form, s1[x:] + s2[:y]) + s2[y:]\n\ +\n\ +is guaranteed to be normalized to 'form'.\n\ +\n\ +For more information, please refer to section 9 of Unicode Standard\n\ +Annex #15 (\"Unicode Normalization Forms\"):\n\ +\n\ + http://unicode.org/reports/tr15/#Detecting_Normalization_Forms"); + +static PyObject * +unicodedata_quick_check(PyObject *self, PyObject *args) +{ + char *form; + PyUnicodeObject *v; + unsigned char quickcheck_value; + Py_UCS4 c; + + if(!PyArg_ParseTuple(args, "sO!:quick_check", + &form, &PyUnicode_Type, &v)) + return NULL; + + c = getuchar(v); + if (c == (Py_UCS4)-1) + return NULL; + + quickcheck_value = _getrecord_ex(c)->normalization_quick_check; + + switch (str_to_nf(form)) { + case NFC: quickcheck_value >>= 4; break; + case NFKC: quickcheck_value >>= 6; break; + case NFD: /*No need to do anything*/ break; + case NFKD: quickcheck_value >>= 2; break; + default: + PyErr_SetString( + PyExc_ValueError, + "invalid normalization form - valid normalization forms are " + "\"NFC\", \"NFKC\", \"NFD\" and \"NFKD\"" + ); + return NULL; } - if (strcmp(form, "NFD") == 0) { - if (is_normalized(self, input, 0, 0)) { - Py_INCREF(input); - return input; - } - return nfd_nfkd(self, input, 0); - } - if (strcmp(form, "NFKD") == 0) { - if (is_normalized(self, input, 0, 1)) { - Py_INCREF(input); - return input; - } - return nfd_nfkd(self, input, 1); - } - PyErr_SetString(PyExc_ValueError, "invalid normalization form"); - return NULL; + + /* The two quickcheck bits shifted all the way to the right + mean 0=Yes, 1=Maybe, 2=No, as described in 
+ http://unicode.org/reports/tr15/#Annex8. 3=Unused is assigned + in _PyUnicode_QuickCheckNames as a precaution. */ + return PyUnicode_FromString( + _PyUnicode_QuickCheckNames[quickcheck_value & 3] + ); } /* -------------------------------------------------------------------- */ @@ -1326,6 +1429,8 @@ {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__}, {"normalize", unicodedata_normalize, METH_VARARGS, unicodedata_normalize__doc__}, + {"quick_check", unicodedata_quick_check, METH_VARARGS, + unicodedata_quick_check__doc__}, {NULL, NULL} /* sentinel */ }; diff -r 068365acbe73 -r c35b7913e4f6 Modules/unicodedata_db.h --- a/Modules/unicodedata_db.h Wed Mar 25 21:03:47 2015 +0200 +++ b/Modules/unicodedata_db.h Thu Mar 26 22:06:51 2015 +0000 @@ -668,6 +668,13 @@ "N", NULL }; +const char *_PyUnicode_QuickCheckNames[] = { + "Yes", + "Maybe", + "No", + "Unused", + NULL +}; static const char *decomp_prefix[] = { "", "", diff -r 068365acbe73 -r c35b7913e4f6 Tools/unicode/makeunicodedata.py --- a/Tools/unicode/makeunicodedata.py Wed Mar 25 21:03:47 2015 +0200 +++ b/Tools/unicode/makeunicodedata.py Thu Mar 26 22:06:51 2015 +0000 @@ -77,6 +77,8 @@ EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ] +QUICKCHECK_NAMES = [ "Yes", "Maybe", "No", "Unused" ] + MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ] # note: should match definitions in Objects/unicodectype.c @@ -305,6 +307,12 @@ print(" NULL", file=fp) print("};", file=fp) + print("const char *_PyUnicode_QuickCheckNames[] = {", file=fp) + for name in QUICKCHECK_NAMES: + print(" \"%s\"," % name, file=fp) + print(" NULL", file=fp) + print("};", file=fp) + print("static const char *decomp_prefix[] = {", file=fp) for name in decomp_prefix: print(" \"%s\"," % name, file=fp)