# HG changeset patch # User MRAB # Date 1375375440 -3600 # Thu Aug 01 17:44:00 2013 +0100 # Node ID 421f0c6acdf4747743f74435ef18bf57a05af8ae # Parent 8b9d64692f8b97b55b1b702a28cb79fb1c509e4b Issue #18614: Enhanced \N{} escapes for Unicode strings diff -r 8b9d64692f8b -r 421f0c6acdf4 Modules/unicodedata.c --- a/Modules/unicodedata.c Mon Jul 22 16:05:05 2013 +0100 +++ b/Modules/unicodedata.c Thu Aug 01 17:44:00 2013 +0100 @@ -1214,6 +1214,56 @@ return PyUnicode_FromString(name); } +/* Gets the value (0-15) of a possible hex digit, upper or lower case. + + Returns -1 if it's not a hex digit. + */ +static int value_of_digit(char c) { + if ('0' <= c && c <= '9') + return c - '0'; + + if ('A' <= c && c <= 'F') + return c - 'A' + 10; + + if ('a' <= c && c <= 'f') + return c - 'a' + 10; + + return -1; +} + +/* Gets the codepoint from the Unicode U+XXXX notation. + + Returns PyUnicode_FromOrdinal(codepoint), or NULL with ValueError set. + */ +static PyObject * codepoint_from_U_notation(char* name, + int namelen) { + Py_UCS4 code; + int index; + + if (namelen < 3) { /* no digits after U+ */ + PyErr_Format(PyExc_ValueError, "invalid codepoint notation '%s'", name); + return NULL; + } + + code = 0; + + for (index = 2; index < namelen; index++) { /* skip the U+ prefix */ + int value = value_of_digit(name[index]); + if (value < 0) { + PyErr_Format(PyExc_ValueError, "invalid codepoint notation '%s'", name); + return NULL; + } + + code = (code << 4) | value; + if (code >= 0x110000) { + PyErr_Format(PyExc_ValueError, "codepoint not in range(0x110000)"); + return NULL; + } + } + + return PyUnicode_FromOrdinal(code); +} + PyDoc_STRVAR(unicodedata_lookup__doc__, "lookup(name)\n\ \n\ @@ -1232,6 +1282,9 @@ if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen)) return NULL; + if (namelen >= 2 && name[0] == 'U' && name[1] == '+') + return codepoint_from_U_notation(name, namelen); + if (!_getcode(self, name, namelen, &code, 1)) { PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name); return NULL; diff -r 8b9d64692f8b -r 421f0c6acdf4 Objects/unicodeobject.c --- a/Objects/unicodeobject.c Mon 
Jul 22 16:05:05 2013 +0100 +++ b/Objects/unicodeobject.c Thu Aug 01 17:44:00 2013 +0100 @@ -5470,6 +5470,53 @@ static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; +/* Gets the value (0-15) of a possible hex digit, upper or lower case. + + Returns -1 if it's not a hex digit. + */ +static int value_of_digit(char c) { + if ('0' <= c && c <= '9') + return c - '0'; + + if ('A' <= c && c <= 'F') + return c - 'A' + 10; + + if ('a' <= c && c <= 'f') + return c - 'a' + 10; + + return -1; +} + +/* Gets the codepoint from the Unicode U+XXXX notation and stores it in *chr. + + Returns TRUE if successful; on FALSE *chr is left untouched. + */ +static int codepoint_from_unicode_notation(const char *name, + int namelen, + Py_UCS4 *chr) { + Py_UCS4 code; + int index; + + if (namelen < 3 || name[0] != 'U' || name[1] != '+') + return FALSE; + + code = 0; + + for (index = 2; index < namelen; index++) { /* skip the U+ prefix */ + int value = value_of_digit(name[index]); + if (value < 0) + return FALSE; + + code = (code << 4) | value; + if (code >= 0x110000) + return FALSE; + } + + *chr = code; + + return TRUE; +} + PyObject * PyUnicode_DecodeUnicodeEscape(const char *s, Py_ssize_t size, @@ -5641,8 +5688,10 @@ message = "unknown Unicode character name"; s++; if (s - start - 1 <= INT_MAX && + (codepoint_from_unicode_notation(start, + (int)(s-start-1), &chr) || ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), - &chr, 0)) + &chr, 0))) goto store; } }