diff -r 068365acbe73 -r 3e25ce2f60d9 Doc/library/unicodedata.rst
--- a/Doc/library/unicodedata.rst  Wed Mar 25 21:03:47 2015 +0200
+++ b/Doc/library/unicodedata.rst  Sat Mar 28 19:07:16 2015 +0000
@@ -131,6 +131,42 @@
    a human reader, if one has combining characters and the other doesn't, they
    may not compare equal.
 
+
+.. function:: quick_check(form, chr)
+
+   Return the 'Quick_Check' property value for the normal form *form* for the
+   Unicode character *chr*. Valid values for *form* are 'NFC', 'NFKC', 'NFD', and
+   'NFKD'. The return value is a string: 'Yes', 'No', or 'Maybe'. Return 'Yes' if
+   the 'Quick_Check' property is not defined for the character.
+
+   The 'Quick_Check' property is useful when joining (or appending to) strings
+   that have already been normalized to a particular normalization form. If *s1*
+   and *s2* are two strings that are both normalized to a given normalization form
+   *form*, then the concatenation *s1* + *s2* is **not** guaranteed to be
+   normalized to that normalization form; there might be a non-normalized
+   subsequence of characters straddling the point where the strings were joined.
+   We might pass the concatenation *s1* + *s2* to ``normalize()``, but if *s1* or
+   *s2* is long then this implies some wasted effort since a large amount of text
+   will be processed a second time for no gain (having already been processed when
+   *s1* or *s2* was normalized).
+
+   However, it is possible to avoid most of this additional work by re-processing
+   only a small part of each string: characters at the end of *s1* and characters
+   at the start of *s2*. Specifically, let *x* be the index of the last character
+   in *s1* that satisfies both ``quick_check(form, chr) == 'Yes'`` and
+   ``combining(chr) == 0`` (or zero if no character in *s1* satisfies these
+   conditions), and let *y* be the index of the first character in *s2* that
+   satisfies the same conditions (or the length of *s2* if no character in *s2*
+   satisfies the conditions). In this case, the string
+   ``s1[:x] + normalize(form, s1[x:] + s2[:y]) + s2[y:]`` is guaranteed to be
+   normalized to *form*.
+
+   For more information, please refer to `section 9 of Unicode Standard Annex #15
+   ("Unicode Normalization Forms")
+   <http://unicode.org/reports/tr15/#Detecting_Normalization_Forms>`_.
+
+   .. versionadded:: 3.5
+
 
 In addition, the module exposes the following constant:
 
diff -r 068365acbe73 -r 3e25ce2f60d9 Doc/whatsnew/3.5.rst
--- a/Doc/whatsnew/3.5.rst  Wed Mar 25 21:03:47 2015 +0200
+++ b/Doc/whatsnew/3.5.rst  Sat Mar 28 19:07:16 2015 +0000
@@ -419,6 +419,12 @@
 * The :func:`time.monotonic` function is now always available. (Contributed by
   Victor Stinner in :issue:`22043`.)
 
+unicodedata
+-----------
+
+* The :func:`quick_check` function was added. (Contributed by Philip Eve in
+  :issue:`23550`.)
+
 urllib
 ------
 
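The new documentation above describes the fast-concatenation recipe only in prose. As an illustration (not part of the patch): the sketch below assumes an interpreter built with this change, so that unicodedata.quick_check() exists, and the helper name concat_normalized is invented here. It applies the documentation's definitions of x and y directly:

    import unicodedata

    def concat_normalized(form, s1, s2):
        """Join s1 and s2, both already normalized to *form*, re-normalizing
        only the characters around the join point."""
        # x: index of the last character of s1 with Quick_Check == 'Yes' and
        # combining class 0, or 0 if no character of s1 qualifies.
        x = 0
        for i in range(len(s1) - 1, -1, -1):
            if (unicodedata.quick_check(form, s1[i]) == 'Yes'
                    and unicodedata.combining(s1[i]) == 0):
                x = i
                break
        # y: index of the first such character of s2, or len(s2) if none.
        y = len(s2)
        for i, ch in enumerate(s2):
            if (unicodedata.quick_check(form, ch) == 'Yes'
                    and unicodedata.combining(ch) == 0):
                y = i
                break
        # Only the slice spanning the join point is re-normalized.
        return s1[:x] + unicodedata.normalize(form, s1[x:] + s2[:y]) + s2[y:]

For inputs that are already normalized to the given form, the result should agree with unicodedata.normalize(form, s1 + s2) while re-normalizing only a short substring around the join point.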
diff -r 068365acbe73 -r 3e25ce2f60d9 Lib/test/test_unicodedata.py
--- a/Lib/test/test_unicodedata.py  Wed Mar 25 21:03:47 2015 +0200
+++ b/Lib/test/test_unicodedata.py  Sat Mar 28 19:07:16 2015 +0000
@@ -223,6 +223,33 @@
         self.assertEqual(eaw('\u2010'), 'A')
         self.assertEqual(eaw('\U00020000'), 'W')
 
+    def test_quick_check(self):
+        qc = self.db.quick_check
+        def qc_all(char):
+            return (qc('NFC', char), qc('NFKC', char),
+                    qc('NFD', char), qc('NFKD', char))
+        self.assertEqual(qc_all('s'), ('Yes', 'Yes', 'Yes', 'Yes'))
+        self.assertEqual(qc_all('7'), ('Yes', 'Yes', 'Yes', 'Yes'))
+        self.assertEqual(qc_all(' '), ('Yes', 'Yes', 'Yes', 'Yes'))
+        self.assertEqual(qc_all('\uFFFE'), ('Yes', 'Yes', 'Yes', 'Yes'))
+        self.assertEqual(qc_all('\u01C7'), ('Yes', 'No', 'Yes', 'No'))
+        self.assertEqual(qc_all('\u01F5'), ('Yes', 'Yes', 'No', 'No'))
+        self.assertEqual(qc_all('\u0308'), ('Maybe', 'Maybe', 'Yes', 'Yes'))
+        self.assertEqual(qc_all('\u0387'), ('No', 'No', 'No', 'No'))
+        self.assertEqual(qc_all('\u1FDD'), ('Yes', 'No', 'No', 'No'))
+        self.assertEqual(qc_all('\U0001133E'),
+                         ('Maybe', 'Maybe', 'Yes', 'Yes'))
+        with self.assertRaises(TypeError):
+            qc()
+        with self.assertRaises(TypeError):
+            qc('NFD')
+        with self.assertRaises(TypeError):
+            qc('NFD', '')
+        with self.assertRaises(TypeError):
+            qc('NFKC', 'xx')
+        with self.assertRaises(ValueError):
+            qc('unknown', 's')
+
 
 class UnicodeMiscTest(UnicodeDatabaseTest):
 
     def test_failed_import_during_compiling(self):
diff -r 068365acbe73 -r 3e25ce2f60d9 Modules/unicodedata.c
--- a/Modules/unicodedata.c  Wed Mar 25 21:03:47 2015 +0200
+++ b/Modules/unicodedata.c  Sat Mar 28 19:07:16 2015 +0000
@@ -36,7 +36,8 @@
     const unsigned char mirrored;       /* true if mirrored in bidir mode */
     const unsigned char east_asian_width;       /* index into
                                                    _PyUnicode_EastAsianWidth */
-    const unsigned char normalization_quick_check; /* see is_normalized() */
+    const unsigned char normalization_quick_check; /* used for quick_check()
+                                                       and normalize() */
 } _PyUnicode_DatabaseRecord;
 
 typedef struct change_record {
@@ -48,6 +49,8 @@
     const double numeric_changed;
 } change_record;
 
+typedef enum {NFC, NFKC, NFD, NFKD, Invalid_NF} normalization_form;
+
 /* data file generated by Tools/unicode/makeunicodedata.py */
 #include "unicodedata_db.h"
 
@@ -65,6 +68,20 @@
     return &_PyUnicode_Database_Records[index];
 }
 
+static normalization_form str_to_nf(const char *form)
+{
+    if (strcmp(form, "NFC") == 0)
+        return NFC;
+    else if (strcmp(form, "NFKC") == 0)
+        return NFKC;
+    else if (strcmp(form, "NFD") == 0)
+        return NFD;
+    else if (strcmp(form, "NFKD") == 0)
+        return NFKD;
+    else
+        return Invalid_NF;
+}
+
 /* ------------- Previous-version API ------------------------------------- */
 typedef struct previous_version {
     PyObject_HEAD
@@ -205,11 +222,11 @@
 }
 
 PyDoc_STRVAR(unicodedata_digit__doc__,
-"digit(unichr[, default])\n\
-\n\
-Returns the digit value assigned to the Unicode character unichr as\n\
-integer. If no such value is defined, default is returned, or, if\n\
-not given, ValueError is raised.");
+"digit(unichr[, default])\n"
+"\n"
+"Returns the digit value assigned to the Unicode character unichr as\n"
+"integer. If no such value is defined, default is returned, or, if\n"
+"not given, ValueError is raised.");
 
 static PyObject *
 unicodedata_digit(PyObject *self, PyObject *args)
@@ -239,11 +256,11 @@
 }
 
 PyDoc_STRVAR(unicodedata_numeric__doc__,
-"numeric(unichr[, default])\n\
-\n\
-Returns the numeric value assigned to the Unicode character unichr\n\
-as float. If no such value is defined, default is returned, or, if\n\
-not given, ValueError is raised.");
+"numeric(unichr[, default])\n"
+"\n"
+"Returns the numeric value assigned to the Unicode character unichr\n"
+"as float. If no such value is defined, default is returned, or, if\n"
+"not given, ValueError is raised.");
 
 static PyObject *
 unicodedata_numeric(PyObject *self, PyObject *args)
@@ -289,10 +306,10 @@
 }
 
 PyDoc_STRVAR(unicodedata_category__doc__,
-"category(unichr)\n\
-\n\
-Returns the general category assigned to the Unicode character\n\
-unichr as string.");
+"category(unichr)\n"
+"\n"
+"Returns the general category assigned to the Unicode character\n"
+"unichr as string.");
 
 static PyObject *
 unicodedata_category(PyObject *self, PyObject *args)
@@ -317,11 +334,11 @@
 }
 
 PyDoc_STRVAR(unicodedata_bidirectional__doc__,
-"bidirectional(unichr)\n\
-\n\
-Returns the bidirectional class assigned to the Unicode character\n\
-unichr as string. If no such value is defined, an empty string is\n\
-returned.");
+"bidirectional(unichr)\n"
+"\n"
+"Returns the bidirectional class assigned to the Unicode character\n"
+"unichr as string. If no such value is defined, an empty string is\n"
+"returned.");
 
 static PyObject *
 unicodedata_bidirectional(PyObject *self, PyObject *args)
@@ -348,11 +365,11 @@
 }
 
 PyDoc_STRVAR(unicodedata_combining__doc__,
-"combining(unichr)\n\
-\n\
-Returns the canonical combining class assigned to the Unicode\n\
-character unichr as integer. Returns 0 if no combining class is\n\
-defined.");
+"combining(unichr)\n"
+"\n"
+"Returns the canonical combining class assigned to the Unicode\n"
+"character unichr as integer. Returns 0 if no combining class is\n"
+"defined.");
 
 static PyObject *
 unicodedata_combining(PyObject *self, PyObject *args)
@@ -377,11 +394,11 @@
 }
 
 PyDoc_STRVAR(unicodedata_mirrored__doc__,
-"mirrored(unichr)\n\
-\n\
-Returns the mirrored property assigned to the Unicode character\n\
-unichr as integer. Returns 1 if the character has been identified as\n\
-a \"mirrored\" character in bidirectional text, 0 otherwise.");
+"mirrored(unichr)\n"
+"\n"
+"Returns the mirrored property assigned to the Unicode character\n"
+"unichr as integer. Returns 1 if the character has been identified as\n"
+"a \"mirrored\" character in bidirectional text, 0 otherwise.");
 
 static PyObject *
 unicodedata_mirrored(PyObject *self, PyObject *args)
@@ -408,10 +425,10 @@
 }
 
 PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
-"east_asian_width(unichr)\n\
-\n\
-Returns the east asian width assigned to the Unicode character\n\
-unichr as string.");
+"east_asian_width(unichr)\n"
+"\n"
+"Returns the east asian width assigned to the Unicode character\n"
+"unichr as string.");
 
 static PyObject *
 unicodedata_east_asian_width(PyObject *self, PyObject *args)
@@ -436,11 +453,11 @@
 }
 
 PyDoc_STRVAR(unicodedata_decomposition__doc__,
-"decomposition(unichr)\n\
-\n\
-Returns the character decomposition mapping assigned to the Unicode\n\
-character unichr as string. An empty string is returned in case no\n\
-such mapping is defined.");
+"decomposition(unichr)\n"
+"\n"
+"Returns the character decomposition mapping assigned to the Unicode\n"
+"character unichr as string. An empty string is returned in case no\n"
+"such mapping is defined.");
 
 static PyObject *
 unicodedata_decomposition(PyObject *self, PyObject *args)
@@ -859,10 +876,10 @@
 }
 
 PyDoc_STRVAR(unicodedata_normalize__doc__,
-"normalize(form, unistr)\n\
-\n\
-Return the normal form 'form' for the Unicode string unistr. Valid\n\
-values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
+"normalize(form, unistr)\n"
+"\n"
+"Return the normal form 'form' for the Unicode string unistr. Valid\n"
+"values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
 
 static PyObject*
 unicodedata_normalize(PyObject *self, PyObject *args)
@@ -884,36 +901,122 @@
         return input;
     }
 
-    if (strcmp(form, "NFC") == 0) {
-        if (is_normalized(self, input, 1, 0)) {
-            Py_INCREF(input);
-            return input;
-        }
-        return nfc_nfkc(self, input, 0);
+    switch (str_to_nf(form)) {
+    case NFC:
+        if (is_normalized(self, input, 1, 0)) {
+            Py_INCREF(input);
+            return input;
+        }
+        return nfc_nfkc(self, input, 0);
+    case NFKC:
+        if (is_normalized(self, input, 1, 1)) {
+            Py_INCREF(input);
+            return input;
+        }
+        return nfc_nfkc(self, input, 1);
+    case NFD:
+        if (is_normalized(self, input, 0, 0)) {
+            Py_INCREF(input);
+            return input;
+        }
+        return nfd_nfkd(self, input, 0);
+    case NFKD:
+        if (is_normalized(self, input, 0, 1)) {
+            Py_INCREF(input);
+            return input;
+        }
+        return nfd_nfkd(self, input, 1);
+    default:
+        PyErr_SetString(
+            PyExc_ValueError,
+            "invalid normalization form - valid normalization forms are "
+            "\"NFC\", \"NFKC\", \"NFD\" and \"NFKD\""
+            );
+        return NULL;
     }
-    if (strcmp(form, "NFKC") == 0) {
-        if (is_normalized(self, input, 1, 1)) {
-            Py_INCREF(input);
-            return input;
-        }
-        return nfc_nfkc(self, input, 1);
+}
+
+PyDoc_STRVAR(unicodedata_quick_check__doc__,
+"quick_check(form, unichr)\n"
+"\n"
+"Return the 'Quick_Check' property value for the normal form 'form'\n"
+"for the Unicode character unichr. Valid values for form are 'NFC',\n"
+"'NFKC', 'NFD', and 'NFKD'. The return value is a string: 'Yes',\n"
+"'No', or 'Maybe'. Return 'Yes' if the 'Quick_Check' property is not\n"
+"defined for the character.\n"
+"\n"
+"The 'Quick_Check' property is useful when joining (or appending to)\n"
+"strings that have already been normalized to a particular\n"
+"normalization form. If s1 and s2 are two strings that are both\n"
+"normalized to a given normalization form 'form', then the\n"
+"concatenation s1 + s2 is *not* guaranteed to be normalized to that\n"
+"normalization form; there might be a non-normalized subsequence of\n"
+"characters straddling the point where the strings were joined. We\n"
+"might pass the concatenation s1 + s2 to normalize(), but if s1 or s2\n"
+"is long then this implies some wasted effort since a large amount of\n"
+"text will be processed a second time for no gain (having already\n"
+"been processed when s1 or s2 was normalized).\n"
+"\n"
+"However, it is possible to avoid most of this additional work by\n"
+"re-processing only a small part of each string: characters at the\n"
+"end of s1 and characters at the start of s2. Specifically, let x be\n"
+"the index of the last character in s1 that satisfies\n"
+"\n"
+"    quick_check(form, chr) == 'Yes' and combining(chr) == 0\n"
+"\n"
+"(or zero if no character in s1 satisfies this condition), and let y\n"
+"be the index of the first character in s2 that satisfies the same\n"
+"condition (or the length of s2 if no character in s2 satisfies the\n"
+"condition). In this case, the string\n"
+"\n"
+"    s1[:x] + normalize(form, s1[x:] + s2[:y]) + s2[y:]\n"
+"\n"
+"is guaranteed to be normalized to 'form'.\n"
+"\n"
+"For more information, please refer to section 9 of Unicode Standard\n"
+"Annex #15 (\"Unicode Normalization Forms\"):\n"
+"\n"
+"    http://unicode.org/reports/tr15/#Detecting_Normalization_Forms");
+
+static PyObject *
+unicodedata_quick_check(PyObject *self, PyObject *args)
+{
+    char *form;
+    PyUnicodeObject *v;
+    unsigned char quickcheck_value;
+    Py_UCS4 c;
+
+    if(!PyArg_ParseTuple(args, "sO!:quick_check",
+                         &form, &PyUnicode_Type, &v))
+        return NULL;
+
+    c = getuchar(v);
+    if (c == (Py_UCS4)-1)
+        return NULL;
+
+    quickcheck_value = _getrecord_ex(c)->normalization_quick_check;
+
+    switch (str_to_nf(form)) {
+    case NFC: quickcheck_value >>= 4; break;
+    case NFKC: quickcheck_value >>= 6; break;
+    case NFD: /*No need to do anything*/ break;
+    case NFKD: quickcheck_value >>= 2; break;
+    default:
+        PyErr_SetString(
+            PyExc_ValueError,
+            "invalid normalization form - valid normalization forms are "
+            "\"NFC\", \"NFKC\", \"NFD\" and \"NFKD\""
+            );
+        return NULL;
     }
-    if (strcmp(form, "NFD") == 0) {
-        if (is_normalized(self, input, 0, 0)) {
-            Py_INCREF(input);
-            return input;
-        }
-        return nfd_nfkd(self, input, 0);
-    }
-    if (strcmp(form, "NFKD") == 0) {
-        if (is_normalized(self, input, 0, 1)) {
-            Py_INCREF(input);
-            return input;
-        }
-        return nfd_nfkd(self, input, 1);
-    }
-    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
-    return NULL;
+
+    /* The two quickcheck bits shifted all the way to the right
+       mean 0=Yes, 1=Maybe, 2=No, as described in
+       http://unicode.org/reports/tr15/#Annex8. 3=Unused is assigned
+       in _PyUnicode_QuickCheckNames as a precaution. */
+    return PyUnicode_FromString(
+        _PyUnicode_QuickCheckNames[quickcheck_value & 3]
+        );
 }
 
 /* -------------------------------------------------------------------- */
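The shifts in unicodedata_quick_check() above imply the following packing of the normalization_quick_check byte: NFD in bits 0-1, NFKD in bits 2-3, NFC in bits 4-5, NFKC in bits 6-7, with 0=Yes, 1=Maybe, 2=No and 3 reserved as "Unused". A Python sketch of that decoding, for review purposes only; decode_quick_check and the sample byte are illustrative and not part of the patch:

    # Decode a packed normalization_quick_check byte the same way the C
    # switch above does (values per UAX #15: 0=Yes, 1=Maybe, 2=No).
    QC_NAMES = ("Yes", "Maybe", "No", "Unused")    # 3 = "Unused" failsafe
    QC_SHIFTS = {"NFD": 0, "NFKD": 2, "NFC": 4, "NFKC": 6}

    def decode_quick_check(packed, form):
        if form not in QC_SHIFTS:
            raise ValueError("invalid normalization form")
        return QC_NAMES[(packed >> QC_SHIFTS[form]) & 3]

    # A record whose NFC/NFKC bits hold 1 ('Maybe') and whose NFD/NFKD bits
    # hold 0 ('Yes') -- the pattern the new tests expect for U+0308.
    packed = (1 << 4) | (1 << 6)
    assert decode_quick_check(packed, "NFC") == "Maybe"
    assert decode_quick_check(packed, "NFD") == "Yes"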
The data in\n\ -this database is based on the UnicodeData.txt file version\n\ -" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\ -\n\ -The module uses the same names and symbols as defined by the\n\ -UnicodeData File Format " UNIDATA_VERSION "."); +"This module provides access to the Unicode Character Database which\n" +"defines character properties for all Unicode characters. The data in\n" +"this database is based on the UnicodeData.txt file version\n" +UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n" +"\n" +"The module uses the same names and symbols as defined by the\n" +"UnicodeData File Format " UNIDATA_VERSION "."); static struct PyModuleDef unicodedatamodule = { PyModuleDef_HEAD_INIT, diff -r 068365acbe73 -r 3e25ce2f60d9 Modules/unicodedata_db.h --- a/Modules/unicodedata_db.h Wed Mar 25 21:03:47 2015 +0200 +++ b/Modules/unicodedata_db.h Sat Mar 28 19:07:16 2015 +0000 @@ -598,7 +598,7 @@ }; /* string literals */ -const char *_PyUnicode_CategoryNames[] = { +static const char *_PyUnicode_CategoryNames[] = { "Cn", "Lu", "Ll", @@ -632,7 +632,7 @@ "So", NULL }; -const char *_PyUnicode_BidirectionalNames[] = { +static const char *_PyUnicode_BidirectionalNames[] = { "", "L", "LRE", @@ -659,7 +659,7 @@ "PDI", NULL }; -const char *_PyUnicode_EastAsianWidthNames[] = { +static const char *_PyUnicode_EastAsianWidthNames[] = { "F", "H", "W", @@ -668,6 +668,13 @@ "N", NULL }; +static const char *_PyUnicode_QuickCheckNames[] = { + "Yes", + "Maybe", + "No", + "Unused", + NULL +}; static const char *decomp_prefix[] = { "", "", diff -r 068365acbe73 -r 3e25ce2f60d9 Tools/unicode/makeunicodedata.py --- a/Tools/unicode/makeunicodedata.py Wed Mar 25 21:03:47 2015 +0200 +++ b/Tools/unicode/makeunicodedata.py Sat Mar 28 19:07:16 2015 +0000 @@ -77,6 +77,10 @@ EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ] +# "Unused" is here as a failsafe, related to the implementation of +# unicodedata.quick_check(). +QUICKCHECK_NAMES = [ "Yes", "Maybe", "No", "Unused" ] + MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ] # note: should match definitions in Objects/unicodectype.c @@ -287,24 +291,30 @@ # the support code moved into unicodedatabase.c print("/* string literals */", file=fp) - print("const char *_PyUnicode_CategoryNames[] = {", file=fp) + print("static const char *_PyUnicode_CategoryNames[] = {", file=fp) for name in CATEGORY_NAMES: print(" \"%s\"," % name, file=fp) print(" NULL", file=fp) print("};", file=fp) - print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp) + print("static const char *_PyUnicode_BidirectionalNames[] = {", file=fp) for name in BIDIRECTIONAL_NAMES: print(" \"%s\"," % name, file=fp) print(" NULL", file=fp) print("};", file=fp) - print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp) + print("static const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp) for name in EASTASIANWIDTH_NAMES: print(" \"%s\"," % name, file=fp) print(" NULL", file=fp) print("};", file=fp) + print("static const char *_PyUnicode_QuickCheckNames[] = {", file=fp) + for name in QUICKCHECK_NAMES: + print(" \"%s\"," % name, file=fp) + print(" NULL", file=fp) + print("};", file=fp) + print("static const char *decomp_prefix[] = {", file=fp) for name in decomp_prefix: print(" \"%s\"," % name, file=fp)