Index: Python/bltinmodule.c =================================================================== --- Python/bltinmodule.c (revision 87556) +++ Python/bltinmodule.c (working copy) @@ -1397,24 +1397,13 @@ } } else if (PyUnicode_Check(obj)) { - size = PyUnicode_GET_SIZE(obj); - if (size == 1) { - ord = (long)*PyUnicode_AS_UNICODE(obj); + const Py_UNICODE *begin = PyUnicode_AS_UNICODE(obj); + const Py_UNICODE *end; + size = PyUnicode_GET_SIZE(obj); + end = begin + size; + ord = _Py_UNICODE_NEXT(begin, end); + if (begin == end) return PyLong_FromLong(ord); - } -#ifndef Py_UNICODE_WIDE - if (size == 2) { - /* Decode a valid surrogate pair */ - int c0 = PyUnicode_AS_UNICODE(obj)[0]; - int c1 = PyUnicode_AS_UNICODE(obj)[1]; - if (0xD800 <= c0 && c0 <= 0xDBFF && - 0xDC00 <= c1 && c1 <= 0xDFFF) { - ord = ((((c0 & 0x03FF) << 10) | (c1 & 0x03FF)) + - 0x00010000); - return PyLong_FromLong(ord); - } - } -#endif } else if (PyByteArray_Check(obj)) { /* XXX Hopefully this is temporary */ Index: Include/unicodeobject.h =================================================================== --- Include/unicodeobject.h (revision 87556) +++ Include/unicodeobject.h (working copy) @@ -358,6 +358,48 @@ for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\ } while (0) +#define _Py_UNICODE_ISSURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF) +#define _Py_UNICODE_ISHIGHSURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF) +#define _Py_UNICODE_ISLOWSURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF) +/* Join two surrogate characters and return a single Py_UCS4 value. */ +#define _Py_UNICODE_JOIN_SURROGATES(high, low) \ + ((Py_UCS4)(((((Py_UCS4)(high) & 0x03FF) << 10) | \ + ((Py_UCS4)(low) & 0x03FF)) + 0x10000)) + +/* The following macros can be used in both narrow and wide unicode builds to + * access characters in Py_UNICODE buffer. _Py_UNICODE_NEXT(ptr, end) returns + * the character at the position pointed by ptr and _Py_UNICODE_PUT_NEXT(ptr, + * ch) stores the character ch at this position. 
The character returned by + * _Py_UNICODE_NEXT() and expected by _Py_UNICODE_PUT_NEXT() has type Py_UCS4 + * regardless of the choice of the build. On the other hand, ptr and end + * arguments have type Py_UNICODE* which is different on narrow and wide + * unicode builds. Both macros advance ptr to the next character. The ptr and + * end arguments should be side-effect free and ptr must be an lvalue. The end + * argument should point one past the last item of the buffer. On narrow + * builds it is used to detect a lone surrogate at the end of the buffer, + * which is returned unchanged without reading the item at end. + */ +#ifdef Py_UNICODE_WIDE +#define _Py_UNICODE_NEXT(ptr, end) (*(ptr)++) +#define _Py_UNICODE_PUT_NEXT(ptr, ch) (*(ptr)++ = (ch)) +#else +#define _Py_UNICODE_NEXT(ptr, end) \ + (((ptr) + 1 < (end) && _Py_UNICODE_ISHIGHSURROGATE(*(ptr))) ? \ + (_Py_UNICODE_ISLOWSURROGATE((ptr)[1]) ? \ + ((ptr) += 2, _Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \ + (Py_UCS4)*(ptr)++) : \ + (Py_UCS4)*(ptr)++) +#define _Py_UNICODE_PUT_NEXT(ptr, ch) \ + do { \ + if ((ch) > 0xFFFF) { \ + Py_UCS4 code_ = (ch) - 0x10000; \ + *(ptr)++ = (Py_UNICODE)(0xD800 | (code_ >> 10)); \ + *(ptr)++ = (Py_UNICODE)(0xDC00 | (code_ & 0x3FF)); \ + } \ + else \ + *(ptr)++ = (Py_UNICODE)(ch); \ + } while (0) +#endif /* Check if substring matches at given offset. The offset must be valid, and the substring must not be empty. */ @@ -765,7 +807,7 @@ const char *errors /* error handling */ ); -/* Encodes a Unicode object and returns the result as Python string +/* Encodes a Unicode object and returns the result as Python bytes object. 
*/ PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( Index: Objects/unicodeobject.c =================================================================== --- Objects/unicodeobject.c (revision 87556) +++ Objects/unicodeobject.c (working copy) @@ -1189,19 +1189,8 @@ if (w != NULL) { worig = w; wend = w + size; - while (u != uend && w != wend) { - if (0xD800 <= u[0] && u[0] <= 0xDBFF - && 0xDC00 <= u[1] && u[1] <= 0xDFFF) - { - *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000; - u += 2; - } - else { - *w = *u; - u++; - } - w++; - } + while (u != uend && w != wend) + *w++ = _Py_UNICODE_NEXT(u, uend); if (w != wend) *w = L'\0'; return w - worig; @@ -2225,10 +2214,10 @@ base64buffer &= (1 << base64bits) - 1; /* clear high bits */ if (surrogate) { /* expecting a second surrogate */ - if (outCh >= 0xDC00 && outCh <= 0xDFFF) { + if (_Py_UNICODE_ISLOWSURROGATE(outCh)) { #ifdef Py_UNICODE_WIDE - *p++ = (((surrogate & 0x3FF)<<10) - | (outCh & 0x3FF)) + 0x10000; + *p++ = _Py_UNICODE_JOIN_SURROGATES(surrogate, + outCh); #else *p++ = surrogate; *p++ = outCh; @@ -2682,21 +2671,7 @@ ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); - -#ifdef Py_UNICODE_WIDE - *p++ = (Py_UNICODE)ch; -#else - /* compute and append the two surrogates: */ - - /* translate from 10000..10FFFF to 0..FFFF */ - ch -= 0x10000; - - /* high surrogate = top 10 bits added to D800 */ - *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); - - /* low surrogate = bottom 10 bits added to DC00 */ - *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); -#endif + _Py_UNICODE_PUT_NEXT(p, ch); break; } s += n; @@ -3210,6 +3185,7 @@ const char *errors, int byteorder) { + const Py_UNICODE *send = s + size; PyObject *v; unsigned char *p; Py_ssize_t nsize, bytesize; @@ -3254,7 +3230,7 @@ if (byteorder == 0) STORECHAR(0xFEFF); if (size == 0) - goto done; + return v; if (byteorder == -1) { /* force LE */ @@ -3271,22 +3247,11 @@ iorder[3] = 0; } - while (size-- 
> 0) { - Py_UCS4 ch = *s++; -#ifndef Py_UNICODE_WIDE - if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { - Py_UCS4 ch2 = *s; - if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { - ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; - s++; - size--; - } - } -#endif + while (s < send) { + Py_UCS4 ch; + ch = _Py_UNICODE_NEXT(s, send); STORECHAR(ch); } - - done: return v; #undef STORECHAR } Index: Lib/test/test_builtin.py =================================================================== --- Lib/test/test_builtin.py (revision 87556) +++ Lib/test/test_builtin.py (working copy) @@ -903,6 +903,9 @@ self.assertEqual(ord("\U0010FFFE"), 0x0010FFFE) self.assertEqual(ord("\U0010FFFF"), 0x0010FFFF) + self.assertRaises(TypeError, ord, 'ab') + self.assertRaises(TypeError, ord, '\U0000FFFFx') + def test_pow(self): self.assertEqual(pow(0,0), 1) self.assertEqual(pow(0,1), 0)