diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -78,6 +78,31 @@ OF OR IN CONNECTION WITH THE USE OR PERF # define BYTEORDER_IS_LITTLE_ENDIAN #endif +/* test if a character is in U+10000-U+10FFFF, + outside the BMP range (U+0000-U+FFFF) */ +#define IS_NONBMP(ch) (ch >= 0x10000) + +/* test if a character is a low or high surrogate, in U+D800-U+DFFF */ +#define IS_SURROGATE(ch) (((ch) & 0xFFFFF800UL) == 0xD800) + +/* test if a character is a high surrogate, in U+D800-U+DBFF */ +#define IS_HIGH_SURROGATE(ch) (((ch) & 0xFFFFFC00UL) == 0xD800) + +/* test if a character is a low surrogate, in U+DC00-U+DFFF */ +#define IS_LOW_SURROGATE(ch) (((ch) & 0xFFFFFC00UL) == 0xDC00) + +/* high surrogate = top 10 bits added to D800, + ordinal has to be in [0x00000; 0xFFFFF]: use ordinal -= 0x10000 */ +#define HIGH_SURROGATE(ordinal) (0xD800 | ((ordinal) >> 10)) + +/* low surrogate = bottom 10 bits added to DC00. + ordinal has to be in [0x00000; 0xFFFFF]: use ordinal -= 0x10000 */ +#define LOW_SURROGATE(ordinal) (0xDC00 | ((ordinal) & 0x3FF)) + +/* combine the two surrogates to form a UCS4 value */ +#define COMBINE_SURROGATES(ch1, ch2) \ + (((((Py_UCS4)(ch1) & 0x3FF) << 10) | ((Py_UCS4)(ch2) & 0x3FF)) + 0x10000) + /* --- Globals ------------------------------------------------------------ The globals are initialized by the _PyUnicode_Init() API and should @@ -626,8 +651,8 @@ PyUnicode_FromWideChar(register const wc if (*w > 0xFFFF) { wchar_t ordinal = *w++; ordinal -= 0x10000; - *u++ = 0xD800 | (ordinal >> 10); - *u++ = 0xDC00 | (ordinal & 0x3FF); + *u++ = HIGH_SURROGATE(ordinal); + *u++ = LOW_SURROGATE(ordinal); } else *u++ = *w++; @@ -1037,8 +1062,8 @@ PyUnicode_FromFormatV(const char *format #ifndef Py_UNICODE_WIDE if (ordinal > 0xffff) { ordinal -= 0x10000; - *s++ = 0xD800 | (ordinal >> 10); - *s++ = 0xDC00 | (ordinal & 0x3FF); + *s++ = HIGH_SURROGATE(ordinal); + *s++ = LOW_SURROGATE(ordinal); } else #endif 
*s++ = ordinal; @@ -1240,10 +1265,9 @@ unicode_aswidechar(PyUnicodeObject *unic worig = w; wend = w + size; while (u != uend && w != wend) { - if (0xD800 <= u[0] && u[0] <= 0xDBFF - && 0xDC00 <= u[1] && u[1] <= 0xDFFF) + if (IS_HIGH_SURROGATE(u[0]) && IS_LOW_SURROGATE(u[1])) { - *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000; + *w = COMBINE_SURROGATES(u[0], u[1]); u += 2; } else { @@ -1259,8 +1283,7 @@ unicode_aswidechar(PyUnicodeObject *unic else { nchar = 1; /* nul character at the end */ while (u != uend) { - if (0xD800 <= u[0] && u[0] <= 0xDBFF - && 0xDC00 <= u[1] && u[1] <= 0xDFFF) + if (IS_HIGH_SURROGATE(u[0]) && IS_LOW_SURROGATE(u[1])) u += 2; else u++; @@ -1283,8 +1306,8 @@ unicode_aswidechar(PyUnicodeObject *unic ordinal = *u; if (ordinal > 0xffff) { ordinal -= 0x10000; - *w++ = 0xD800 | (ordinal >> 10); - *w++ = 0xDC00 | (ordinal & 0x3FF); + *w++ = HIGH_SURROGATE(ordinal); + *w++ = LOW_SURROGATE(ordinal); } else *w++ = ordinal; @@ -1367,8 +1390,8 @@ PyUnicode_FromOrdinal(int ordinal) #ifndef Py_UNICODE_WIDE if (ordinal > 0xffff) { ordinal -= 0x10000; - s[0] = 0xD800 | (ordinal >> 10); - s[1] = 0xDC00 | (ordinal & 0x3FF); + s[0] = HIGH_SURROGATE(ordinal); + s[1] = LOW_SURROGATE(ordinal); return PyUnicode_FromUnicode(s, 2); } #endif @@ -2303,10 +2326,9 @@ PyUnicode_DecodeUTF7Stateful(const char base64buffer &= (1 << base64bits) - 1; /* clear high bits */ if (surrogate) { /* expecting a second surrogate */ - if (outCh >= 0xDC00 && outCh <= 0xDFFF) { + if (IS_LOW_SURROGATE(outCh)) { #ifdef Py_UNICODE_WIDE - *p++ = (((surrogate & 0x3FF)<<10) - | (outCh & 0x3FF)) + 0x10000; + *p++ = COMBINE_SURROGATES(surrogate, outCh); #else *p++ = surrogate; *p++ = outCh; @@ -2319,11 +2341,11 @@ PyUnicode_DecodeUTF7Stateful(const char goto utf7Error; } } - else if (outCh >= 0xD800 && outCh <= 0xDBFF) { + else if (IS_HIGH_SURROGATE(outCh)) { /* first surrogate */ surrogate = outCh; } - else if (outCh >= 0xDC00 && outCh <= 0xDFFF) { + else if (IS_LOW_SURROGATE(outCh)) 
{ errmsg = "unexpected second surrogate"; goto utf7Error; } @@ -2509,16 +2531,17 @@ PyUnicode_EncodeUTF7(const Py_UNICODE *s continue; encode_char: #ifdef Py_UNICODE_WIDE - if (ch >= 0x10000) { + if (IS_NONBMP(ch)) { + base64bits += 16; /* code first surrogate */ - base64bits += 16; - base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); + ch -= 0x10000; + base64buffer = (base64buffer << 16) | HIGH_SURROGATE(ch); + /* prepare second surrogate */ + ch = LOW_SURROGATE(ch); while (base64bits >= 6) { *out++ = TO_BASE64(base64buffer >> (base64bits-6)); base64bits -= 6; } - /* prepare second surrogate */ - ch = 0xDC00 | ((ch-0x10000) & 0x3FF); } #endif base64bits += 16; @@ -2773,10 +2796,10 @@ PyUnicode_DecodeUTF8Stateful(const char ch -= 0x10000; /* high surrogate = top 10 bits added to D800 */ - *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); + *p++ = (Py_UNICODE)HIGH_SURROGATE(ch); /* low surrogate = bottom 10 bits added to DC00 */ - *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); + *p++ = (Py_UNICODE)LOW_SURROGATE(ch); #endif break; } @@ -2908,10 +2931,10 @@ _Py_DecodeUTF8_surrogateescape(const cha ch -= 0x10000; /* high surrogate = top 10 bits added to D800 */ - *p++ = (wchar_t)(0xD800 + (ch >> 10)); + *p++ = (wchar_t)HIGH_SURROGATE(ch); /* low surrogate = bottom 10 bits added to DC00 */ - *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF)); + *p++ = (wchar_t)LOW_SURROGATE(ch); #endif break; } @@ -2983,13 +3006,13 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s /* Encode Latin-1 */ *p++ = (char)(0xc0 | (ch >> 6)); *p++ = (char)(0x80 | (ch & 0x3f)); - } else if (0xD800 <= ch && ch <= 0xDFFF) { + } else if (IS_SURROGATE(ch)) { #ifndef Py_UNICODE_WIDE /* Special case: check for high and low surrogate */ - if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) { + if (ch <= 0xDBFF && i != size && IS_LOW_SURROGATE(s[i])) { Py_UCS4 ch2 = s[i]; /* Combine the two surrogates to form a UCS4 value */ - ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; + ch = 
COMBINE_SURROGATES(ch, ch2); i++; /* Encode UCS4 Unicode ordinals */ @@ -3061,7 +3084,7 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s #ifndef Py_UNICODE_WIDE } #endif - } else if (ch < 0x10000) { + } else if (!IS_NONBMP(ch)) { *p++ = (char)(0xe0 | (ch >> 12)); *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); *p++ = (char)(0x80 | (ch & 0x3f)); @@ -3248,10 +3271,11 @@ PyUnicode_DecodeUTF32Stateful(const char goto utf32Error; } #ifndef Py_UNICODE_WIDE - if (ch >= 0x10000) + if (IS_NONBMP(ch)) { - *p++ = 0xD800 | ((ch-0x10000) >> 10); - *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); + ch -= 0x10000; + *p++ = HIGH_SURROGATE(ch); + *p++ = LOW_SURROGATE(ch); } else #endif @@ -3323,8 +3347,7 @@ PyUnicode_EncodeUTF32(const Py_UNICODE * so we need less space. */ #ifndef Py_UNICODE_WIDE for (i = pairs = 0; i < size-1; i++) - if (0xD800 <= s[i] && s[i] <= 0xDBFF && - 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) + if (IS_HIGH_SURROGATE(s[i]) && IS_LOW_SURROGATE(s[i+1])) pairs++; #endif nsize = (size - pairs + (byteorder == 0)); @@ -3359,10 +3382,10 @@ PyUnicode_EncodeUTF32(const Py_UNICODE * while (size-- > 0) { Py_UCS4 ch = *s++; #ifndef Py_UNICODE_WIDE - if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { + if (IS_HIGH_SURROGATE(ch) && size > 0) { Py_UCS4 ch2 = *s; - if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { - ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; + if (IS_LOW_SURROGATE(ch2)) { + ch = COMBINE_SURROGATES(ch, ch2); s++; size--; } @@ -3582,7 +3605,7 @@ PyUnicode_DecodeUTF16Stateful(const char q += 2; - if (ch < 0xD800 || ch > 0xDFFF) { + if (!IS_SURROGATE(ch)) { *p++ = ch; continue; } @@ -3594,15 +3617,15 @@ PyUnicode_DecodeUTF16Stateful(const char endinpos = ((const char *)e) + 1 - starts; goto utf16Error; } - if (0xD800 <= ch && ch <= 0xDBFF) { + if (IS_HIGH_SURROGATE(ch)) { Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; q += 2; - if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { + if (IS_LOW_SURROGATE(ch2)) { #ifndef Py_UNICODE_WIDE *p++ = ch; *p++ = ch2; #else - *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 
0x10000; + *p++ = COMBINE_SURROGATES(ch, ch2); #endif continue; } @@ -3716,7 +3739,7 @@ PyUnicode_EncodeUTF16(const Py_UNICODE * #ifdef Py_UNICODE_WIDE for (i = pairs = 0; i < size; i++) - if (s[i] >= 0x10000) + if (IS_NONBMP(s[i])) pairs++; #endif /* 2 * (size + pairs + (byteorder == 0)) */ @@ -3750,16 +3773,20 @@ PyUnicode_EncodeUTF16(const Py_UNICODE * while (size-- > 0) { Py_UNICODE ch = *s++; - Py_UNICODE ch2 = 0; #ifdef Py_UNICODE_WIDE - if (ch >= 0x10000) { - ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); - ch = 0xD800 | ((ch-0x10000) >> 10); - } + Py_UNICODE ch2; + if (IS_NONBMP(ch)) { + Py_UCS4 ordinal = ch - 0x10000; + ch = HIGH_SURROGATE(ordinal); + STORECHAR(ch); + ch2 = LOW_SURROGATE(ordinal); + STORECHAR(ch2); + } + else + STORECHAR(ch); +#else + STORECHAR(ch); #endif - STORECHAR(ch); - if (ch2) - STORECHAR(ch2); } done: @@ -4101,7 +4128,7 @@ PyUnicode_EncodeUnicodeEscape(const Py_U #ifdef Py_UNICODE_WIDE /* Map 21-bit characters to '\U00xxxxxx' */ - else if (ch >= 0x10000) { + else if (IS_NONBMP(ch)) { *p++ = '\\'; *p++ = 'U'; *p++ = hexdigits[(ch >> 28) & 0x0000000F]; @@ -4122,8 +4149,8 @@ PyUnicode_EncodeUnicodeEscape(const Py_U ch2 = *s++; size--; - if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { - ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; + if (IS_LOW_SURROGATE(ch2)) { + ucs = COMBINE_SURROGATES(ch, ch2); *p++ = '\\'; *p++ = 'U'; *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; @@ -4346,7 +4373,7 @@ PyUnicode_EncodeRawUnicodeEscape(const P Py_UNICODE ch = *s++; #ifdef Py_UNICODE_WIDE /* Map 32-bit characters to '\Uxxxxxxxx' */ - if (ch >= 0x10000) { + if (IS_NONBMP(ch)) { *p++ = '\\'; *p++ = 'U'; *p++ = hexdigits[(ch >> 28) & 0xf]; @@ -4367,8 +4394,8 @@ PyUnicode_EncodeRawUnicodeEscape(const P ch2 = *s++; size--; - if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { - ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; + if (IS_LOW_SURROGATE(ch2)) { + ucs = COMBINE_SURROGATES(ch, ch2); *p++ = '\\'; *p++ = 'U'; *p++ = hexdigits[(ucs >> 28) & 0xf]; @@ 
-8051,10 +8078,10 @@ decode_ucs4(const Py_UNICODE *s, Py_ssiz assert(*i < size); ch = s[(*i)++]; #ifndef Py_UNICODE_WIDE - if ((ch & 0xfffffc00) == 0xd800 && - *i < size - && (s[*i] & 0xFFFFFC00) == 0xDC00) - ch = ((Py_UCS4)ch << 10UL) + (Py_UCS4)(s[(*i)++]) - 0x35fdc00; + if (IS_HIGH_SURROGATE(ch) && *i < size && IS_LOW_SURROGATE(s[*i])) { + ch = COMBINE_SURROGATES(ch, s[*i]); + (*i)++; + } #endif return ch; } @@ -8536,10 +8563,8 @@ unicode_repr(PyObject *unicode) /* Get code point from surrogate pair */ if (size > 0) { ch2 = *s; - if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00 - && ch2 <= 0xDFFF) { - ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) - + 0x00010000; + if (IS_HIGH_SURROGATE(ch) && IS_LOW_SURROGATE(ch2)) { + ucs = COMBINE_SURROGATES(ch, ch2); s++; size--; } @@ -8557,7 +8582,7 @@ unicode_repr(PyObject *unicode) *p++ = hexdigits[ch & 0x000F]; } /* Map 21-bit characters to '\U00xxxxxx' */ - else if (ucs >= 0x10000) { + else if (IS_NONBMP(ucs)) { *p++ = '\\'; *p++ = 'U'; *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; @@ -8583,7 +8608,7 @@ unicode_repr(PyObject *unicode) else { *p++ = ch; #ifndef Py_UNICODE_WIDE - if (ucs >= 0x10000) + if (IS_NONBMP(ucs)) *p++ = ch2; #endif } @@ -9462,8 +9487,7 @@ formatchar(Py_UNICODE *buf, /* Decode a valid surrogate pair */ int c0 = PyUnicode_AS_UNICODE(v)[0]; int c1 = PyUnicode_AS_UNICODE(v)[1]; - if (0xD800 <= c0 && c0 <= 0xDBFF && - 0xDC00 <= c1 && c1 <= 0xDFFF) { + if (IS_HIGH_SURROGATE(c0) && IS_LOW_SURROGATE(c1)) { buf[0] = c0; buf[1] = c1; buf[2] = '\0'; @@ -9489,8 +9513,8 @@ formatchar(Py_UNICODE *buf, #ifndef Py_UNICODE_WIDE if (x > 0xffff) { x -= 0x10000; - buf[0] = (Py_UNICODE)(0xD800 | (x >> 10)); - buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF)); + buf[0] = (Py_UNICODE)HIGH_SURROGATE(x); + buf[1] = (Py_UNICODE)LOW_SURROGATE(x); return 2; } #endif