Index: Include/unicodeobject.h
===================================================================
--- Include/unicodeobject.h	(revision 86824)
+++ Include/unicodeobject.h	(working copy)
@@ -355,6 +355,29 @@
         for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\
     } while (0)
 
+/* Surrogate helpers for narrow (UTF-16) builds.  All arguments are
+   parenthesized, but may still be evaluated more than once. */
+#define Py_UNICODE_ISSURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
+#define Py_UNICODE_ISHIGHSURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
+#define Py_UNICODE_ISLOWSURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
+#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
+    ((Py_UCS4)(((((Py_UCS4)(high) - 0xD800) << 10) | \
+               ((Py_UCS4)(low) - 0xDC00)) + 0x10000))
+
+/* Return the code point at ptr as a Py_UCS4 and advance ptr past it.
+   The caller must guarantee ptr < end.  On narrow builds a surrogate
+   pair is joined only when both code units lie before end; a lone
+   surrogate is returned unchanged. */
+#ifdef Py_UNICODE_WIDE
+#define Py_UNICODE_NEXT(ptr, end) ((Py_UCS4)*(ptr)++)
+#else
+#define Py_UNICODE_NEXT(ptr, end)                                      \
+    (((end) - (ptr) >= 2 &&                                            \
+      Py_UNICODE_ISHIGHSURROGATE((ptr)[0]) &&                          \
+      Py_UNICODE_ISLOWSURROGATE((ptr)[1])) ?                           \
+     ((ptr) += 2, Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) :  \
+     (Py_UCS4)*(ptr)++)
+#endif
 /* Check if substring matches at given offset.  The offset must be
    valid, and the substring must not be empty. */
 
@@ -737,7 +760,7 @@
     const char *errors          /* error handling */
     );
 
-/* Encodes a Unicode object and returns the result as Python string
+/* Encodes a Unicode object and returns the result as Python bytes
    object. */
 
 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
Index: Objects/unicodeobject.c
===================================================================
--- Objects/unicodeobject.c	(revision 86824)
+++ Objects/unicodeobject.c	(working copy)
@@ -1192,19 +1192,8 @@
     if (w != NULL) {
         worig = w;
         wend = w + size;
-        while (u != uend && w != wend) {
-            if (0xD800 <= u[0] && u[0] <= 0xDBFF
-                && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
-            {
-                *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
-                u += 2;
-            }
-            else {
-                *w = *u;
-                u++;
-            }
-            w++;
-        }
+        while (u != uend && w != wend)
+            *w++ = Py_UNICODE_NEXT(u, uend);
         if (w != wend)
             *w = L'\0';
         return w - worig;
@@ -3213,6 +3202,7 @@
                       const char *errors,
                       int byteorder)
 {
+    const Py_UNICODE *send = s + size;
     PyObject *v;
     unsigned char *p;
     Py_ssize_t nsize, bytesize;
@@ -3257,7 +3247,7 @@
     if (byteorder == 0)
         STORECHAR(0xFEFF);
     if (size == 0)
-        goto done;
+        return v;
 
     if (byteorder == -1) {
         /* force LE */
@@ -3274,22 +3264,11 @@
         iorder[3] = 0;
     }
 
-    while (size-- > 0) {
-        Py_UCS4 ch = *s++;
-#ifndef Py_UNICODE_WIDE
-        if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
-            Py_UCS4 ch2 = *s;
-            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
-                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
-                s++;
-                size--;
-            }
-        }
-#endif
+    while (s < send) {
+        Py_UCS4 ch;
+        ch = Py_UNICODE_NEXT(s, send);
         STORECHAR(ch);
     }
-
-  done:
     return v;
 #undef STORECHAR
 }
@@ -7654,8 +7633,8 @@
 
     e = p + PyUnicode_GET_SIZE(self);
     cased = 0;
-    for (; p < e; p++) {
-        register const Py_UNICODE ch = *p;
+    while (p < e) {
+        register const Py_UCS4 ch = Py_UNICODE_NEXT(p, e);
 
         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
             return PyBool_FromLong(0);
@@ -7688,8 +7667,8 @@
 
     e = p + PyUnicode_GET_SIZE(self);
     cased = 0;
-    for (; p < e; p++) {
-        register const Py_UNICODE ch = *p;
+    while (p < e) {
+        register const Py_UCS4 ch = Py_UNICODE_NEXT(p, e);
 
         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
             return PyBool_FromLong(0);
@@ -7726,8 +7705,8 @@
     e = p + PyUnicode_GET_SIZE(self);
     cased = 0;
     previous_is_cased = 0;
-    for (; p < e; p++) {
-        register const Py_UNICODE ch = *p;
+    while (p < e) {
+        register const Py_UCS4 ch = Py_UNICODE_NEXT(p, e);
 
         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
             if (previous_is_cased)
@@ -7798,8 +7777,8 @@
         return PyBool_FromLong(0);
 
     e = p + PyUnicode_GET_SIZE(self);
-    for (; p < e; p++) {
-        if (!Py_UNICODE_ISALPHA(*p))
+    while (p < e) {
+        if (!Py_UNICODE_ISALPHA(Py_UNICODE_NEXT(p, e)))
             return PyBool_FromLong(0);
     }
     return PyBool_FromLong(1);
@@ -7827,8 +7806,9 @@
         return PyBool_FromLong(0);
 
     e = p + PyUnicode_GET_SIZE(self);
-    for (; p < e; p++) {
-        if (!Py_UNICODE_ISALNUM(*p))
+    while (p < e) {
+        Py_UCS4 ch = Py_UNICODE_NEXT(p, e);
+        if (!Py_UNICODE_ISALNUM(ch))
             return PyBool_FromLong(0);
     }
     return PyBool_FromLong(1);
@@ -7856,8 +7836,8 @@
         return PyBool_FromLong(0);
 
     e = p + PyUnicode_GET_SIZE(self);
-    for (; p < e; p++) {
-        if (!Py_UNICODE_ISDECIMAL(*p))
+    while (p < e) {
+        if (!Py_UNICODE_ISDECIMAL(Py_UNICODE_NEXT(p, e)))
             return PyBool_FromLong(0);
     }
     return PyBool_FromLong(1);
@@ -7885,8 +7865,8 @@
         return PyBool_FromLong(0);
 
     e = p + PyUnicode_GET_SIZE(self);
-    for (; p < e; p++) {
-        if (!Py_UNICODE_ISDIGIT(*p))
+    while (p < e) {
+        if (!Py_UNICODE_ISDIGIT(Py_UNICODE_NEXT(p, e)))
             return PyBool_FromLong(0);
     }
     return PyBool_FromLong(1);
@@ -7914,8 +7894,8 @@
         return PyBool_FromLong(0);
 
     e = p + PyUnicode_GET_SIZE(self);
-    for (; p < e; p++) {
-        if (!Py_UNICODE_ISNUMERIC(*p))
+    while (p < e) {
+        if (!Py_UNICODE_ISNUMERIC(Py_UNICODE_NEXT(p, e)))
             return PyBool_FromLong(0);
     }
     return PyBool_FromLong(1);
@@ -7926,11 +7906,13 @@
 {
     register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
     register const Py_UNICODE *e;
+    Py_UCS4 ch;
 
     /* Special case for empty strings */
     if (PyUnicode_GET_SIZE(self) == 0)
         return 0;
 
+    e = p + PyUnicode_GET_SIZE(self);
     /* PEP 3131 says that the first character must be in
        XID_Start and subsequent characters in XID_Continue,
        and for the ASCII range, the 2.x rules apply (i.e
@@ -7939,14 +7921,15 @@
        definition of XID_Start and XID_Continue, it is sufficient
        to check just for these, except that _ must be allowed
        as starting an identifier.  */
-    if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
+    ch = Py_UNICODE_NEXT(p, e);
+    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */)
         return 0;
-
-    e = p + PyUnicode_GET_SIZE(self);
-    for (p++; p < e; p++) {
-        if (!_PyUnicode_IsXidContinue(*p))
+    /* Fetch, then test: every remaining code point, including the last. */
+    while (p < e) {
+        ch = Py_UNICODE_NEXT(p, e);
+        if (!_PyUnicode_IsXidContinue(ch))
             return 0;
     }
     return 1;
 }
 
@@ -7980,8 +7963,8 @@
     }
 
     e = p + PyUnicode_GET_SIZE(self);
-    for (; p < e; p++) {
-        if (!Py_UNICODE_ISPRINTABLE(*p)) {
+    while (p < e) {
+        if (!Py_UNICODE_ISPRINTABLE(Py_UNICODE_NEXT(p, e))) {
             Py_RETURN_FALSE;
         }
     }
Index: Lib/test/test_unicode.py
===================================================================
--- Lib/test/test_unicode.py	(revision 86824)
+++ Lib/test/test_unicode.py	(working copy)
@@ -344,11 +344,17 @@
     def test_islower(self):
         string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
         self.checkequalnofix(False, '\u1FFc', 'islower')
+        nonbmp = ('\N{MATHEMATICAL BOLD SMALL A}bc'
+                  '\N{MATHEMATICAL BOLD SMALL D}ef')
+        self.checkequalnofix(True, nonbmp, 'islower')
 
     def test_isupper(self):
         string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
         if not sys.platform.startswith('java'):
             self.checkequalnofix(False, '\u1FFc', 'isupper')
+        nonbmp = ('\N{MATHEMATICAL BOLD CAPITAL A}BC'
+                  '\N{MATHEMATICAL BOLD CAPITAL D}EF')
+        self.checkequalnofix(True, nonbmp, 'isupper')
 
     def test_istitle(self):
         string_tests.MixinStrUnicodeUserStringTest.test_title(self)
@@ -364,6 +370,9 @@
     def test_isalpha(self):
         string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
         self.checkequalnofix(True, '\u1FFc', 'isalpha')
+        nonbmp = ('\N{OLD ITALIC LETTER A}'
+                  '\N{MATHEMATICAL BOLD CAPITAL A}')
+        self.checkequalnofix(True, nonbmp, 'isalpha')
 
     def test_isdecimal(self):
         self.checkequalnofix(False, '', 'isdecimal')
@@ -382,6 +391,8 @@
         self.checkequalnofix(True, '\u2460', 'isdigit')
         self.checkequalnofix(False, '\xbc', 'isdigit')
         self.checkequalnofix(True, '\u0660', 'isdigit')
+        test = '\N{FULLWIDTH DIGIT ONE}23\N{FULLWIDTH DIGIT FOUR}'
+        self.checkequalnofix(True, test, 'isdigit')
 
     def test_isnumeric(self):
         self.checkequalnofix(False, '', 'isnumeric')
@@ -392,6 +403,9 @@
         self.checkequalnofix(True, '\u0660', 'isnumeric')
         self.checkequalnofix(True, '0123456789', 'isnumeric')
         self.checkequalnofix(False, '0123456789a', 'isnumeric')
+        nonbmp = ('\N{COUNTING ROD UNIT DIGIT ONE}23'
+                  '\N{COUNTING ROD UNIT DIGIT FOUR}')
+        self.checkequalnofix(True, nonbmp, 'isnumeric')
 
         self.assertRaises(TypeError, "abc".isnumeric, 42)
 
@@ -403,6 +417,11 @@
         self.assertTrue("bc".isidentifier())
         self.assertTrue("b_".isidentifier())
         self.assertTrue("µ".isidentifier())
+        nonbmp = ('\N{OLD ITALIC LETTER A}'
+                  '\N{MATHEMATICAL BOLD CAPITAL A}')
+        self.assertTrue(nonbmp.isidentifier())
+        # The last character must be validated too, not only the first.
+        self.assertFalse("b[".isidentifier())
 
         self.assertFalse(" ".isidentifier())
         self.assertFalse("[".isidentifier())
@@ -420,6 +439,9 @@
         self.assertFalse("\u0378".isprintable())
         # single surrogate character
         self.assertFalse("\ud800".isprintable())
+        nonbmp = ('\N{OLD ITALIC LETTER A}'
+                  '\N{MATHEMATICAL BOLD CAPITAL A}')
+        self.assertTrue(nonbmp.isprintable())
 
     def test_contains(self):
         # Testing Unicode contains method