diff -r 4a1fe339dcf6 Doc/library/unicodedata.rst --- a/Doc/library/unicodedata.rst Sun Mar 01 00:42:54 2015 +0200 +++ b/Doc/library/unicodedata.rst Tue Mar 03 22:03:33 2015 +0000 @@ -131,6 +131,16 @@ a human reader, if one has combining characters and the other doesn't, they may not compare equal. + +.. function:: quick_check(form, chr) + + Return the 'Quick_Check' property value for the normal form *form* for the + Unicode character *chr*. Valid values for form are 'NFC', 'NFKC', 'NFD', and + 'NFKD'. The return value is a string: 'Yes', 'No', or 'Maybe'. Returns 'Yes' + if the 'Quick_Check' property is not defined for the character. + + .. versionadded:: 3.5 + In addition, the module exposes the following constant: diff -r 4a1fe339dcf6 Lib/test/test_unicodedata.py --- a/Lib/test/test_unicodedata.py Sun Mar 01 00:42:54 2015 +0200 +++ b/Lib/test/test_unicodedata.py Tue Mar 03 22:03:33 2015 +0000 @@ -223,6 +223,28 @@ self.assertEqual(eaw('\u2010'), 'A') self.assertEqual(eaw('\U00020000'), 'W') + def test_quick_check(self): + ae = self.assertEqual + qc = self.db.quick_check + def qc_all (char): + return (qc('NFC', char), qc('NFKC', char), + qc('NFD', char), qc('NFKD', char)) + ae(qc_all('s') , ('Yes' , 'Yes' , 'Yes', 'Yes')) + ae(qc_all('7') , ('Yes' , 'Yes' , 'Yes', 'Yes')) + ae(qc_all(' ') , ('Yes' , 'Yes' , 'Yes', 'Yes')) + ae(qc_all('\uFFFE') , ('Yes' , 'Yes' , 'Yes', 'Yes')) + ae(qc_all('\u01C7') , ('Yes' , 'No' , 'Yes', 'No' )) + ae(qc_all('\u01F5') , ('Yes' , 'Yes' , 'No' , 'No' )) + ae(qc_all('\u0308') , ('Maybe', 'Maybe', 'Yes', 'Yes')) + ae(qc_all('\u0387') , ('No' , 'No' , 'No' , 'No' )) + ae(qc_all('\u1FDD') , ('Yes' , 'No' , 'No' , 'No' )) + ae(qc_all('\U0001133E'), ('Maybe', 'Maybe', 'Yes', 'Yes')) + self.assertRaises(TypeError, qc) + self.assertRaises(TypeError, qc, 'NFD') + self.assertRaises(TypeError, qc, 'NFD', '') + self.assertRaises(TypeError, qc, 'NFKC', 'xx') + self.assertRaises(ValueError, qc, 'unknown', 's') + class 
UnicodeMiscTest(UnicodeDatabaseTest): def test_failed_import_during_compiling(self): diff -r 4a1fe339dcf6 Misc/NEWS --- a/Misc/NEWS Sun Mar 01 00:42:54 2015 +0200 +++ b/Misc/NEWS Tue Mar 03 22:03:33 2015 +0000 @@ -80,6 +80,8 @@ argument which, if set to True, will pass messages to handlers taking handler levels into account. +- Issue #23550: Added unicodedata.quick_check(). Patch by Philip Eve. + Build ----- diff -r 4a1fe339dcf6 Modules/unicodedata.c --- a/Modules/unicodedata.c Sun Mar 01 00:42:54 2015 +0200 +++ b/Modules/unicodedata.c Tue Mar 03 22:03:33 2015 +0000 @@ -36,7 +36,8 @@ const unsigned char mirrored; /* true if mirrored in bidir mode */ const unsigned char east_asian_width; /* index into _PyUnicode_EastAsianWidth */ - const unsigned char normalization_quick_check; /* see is_normalized() */ + const unsigned char normalization_quick_check; /* used for quick_check() + and normalize() */ } _PyUnicode_DatabaseRecord; typedef struct change_record { @@ -909,6 +910,53 @@ return NULL; } +PyDoc_STRVAR(unicodedata_quick_check__doc__, +"quick_check(form, unichr)\n\ +\n\ +Return the 'Quick_Check' property value for the normal form 'form'\n\ +for the Unicode character unichr. Valid values for form are 'NFC',\n\ +'NFKC', 'NFD', and 'NFKD'. The return value is a string: 'Yes',\n\ +'No', or 'Maybe'. 
Returns 'Yes' if the 'Quick_Check' property is\n\ +not defined for the character."); + +static PyObject * +unicodedata_quick_check(PyObject *self, PyObject *args) +{ + char *form; + PyUnicodeObject *v; + unsigned char quickcheck_value; + Py_UCS4 c; + + if(!PyArg_ParseTuple(args, "sO!:quick_check", + &form, &PyUnicode_Type, &v)) + return NULL; + + c = getuchar(v); + if (c == (Py_UCS4)-1) + return NULL; + + quickcheck_value = _getrecord_ex(c)->normalization_quick_check; + + if (strcmp(form, "NFC") == 0) + quickcheck_value >>= 4; + else if (strcmp(form, "NFKC") == 0) + quickcheck_value >>= 6; + else if (strcmp(form, "NFKD") == 0) + quickcheck_value >>= 2; + else if (strcmp(form, "NFD") != 0) { + PyErr_SetString(PyExc_ValueError, "invalid normalization form"); + return NULL; + } + + /* The two quickcheck bits shifted all the way to the right + mean 0=Yes, 1=Maybe, 2=No, as described in + http://unicode.org/reports/tr15/#Annex8. 3=Unused is assigned + in _PyUnicode_QuickCheckNames as a precaution. 
*/ + return PyUnicode_FromString( + _PyUnicode_QuickCheckNames[quickcheck_value & 3] + ); +} + /* -------------------------------------------------------------------- */ /* unicode character name tables */ @@ -1319,6 +1367,8 @@ {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__}, {"normalize", unicodedata_normalize, METH_VARARGS, unicodedata_normalize__doc__}, + {"quick_check", unicodedata_quick_check, METH_VARARGS, + unicodedata_quick_check__doc__}, {NULL, NULL} /* sentinel */ }; diff -r 4a1fe339dcf6 Modules/unicodedata_db.h --- a/Modules/unicodedata_db.h Sun Mar 01 00:42:54 2015 +0200 +++ b/Modules/unicodedata_db.h Tue Mar 03 22:03:33 2015 +0000 @@ -668,6 +668,13 @@ "N", NULL }; +const char *_PyUnicode_QuickCheckNames[] = { + "Yes", + "Maybe", + "No", + "Unused", + NULL +}; static const char *decomp_prefix[] = { "", "", diff -r 4a1fe339dcf6 Tools/unicode/makeunicodedata.py --- a/Tools/unicode/makeunicodedata.py Sun Mar 01 00:42:54 2015 +0200 +++ b/Tools/unicode/makeunicodedata.py Tue Mar 03 22:03:33 2015 +0000 @@ -77,6 +77,8 @@ EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ] +QUICKCHECK_NAMES = [ "Yes", "Maybe", "No", "Unused" ] + MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ] # note: should match definitions in Objects/unicodectype.c @@ -305,6 +307,12 @@ print(" NULL", file=fp) print("};", file=fp) + print("const char *_PyUnicode_QuickCheckNames[] = {", file=fp) + for name in QUICKCHECK_NAMES: + print(" \"%s\"," % name, file=fp) + print(" NULL", file=fp) + print("};", file=fp) + print("static const char *decomp_prefix[] = {", file=fp) for name in decomp_prefix: print(" \"%s\"," % name, file=fp)