Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(6)

Unified Diff: Objects/unicodeobject.c

Issue 10542: Py_UNICODE_NEXT and other macros for surrogates
Patch Set: Created 8 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -78,6 +78,31 @@ OF OR IN CONNECTION WITH THE USE OR PERF
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
+/* test if a character is in U+10000-U+10FFFF,
sasha 2011/08/19 02:21:01 When writing ranges in comments, please use notati
+ outside the BMP range (U+0000-U+FFFF) */
+#define IS_NONBMP(ch) (ch >= 0x10000)
sasha 2011/08/19 02:21:01 You can avoid using negation in the macro name (an
+
+/* test if a character is a low or high surrogate, in U+D800-U+DFFF */
+#define IS_SURROGATE(ch) (((ch) & 0xFFFFF800UL) == 0xD800)
+
+/* test if a character is a high surrogate, in U+D800-U+DBFF */
+#define IS_HIGH_SURROGATE(ch) (((ch) & 0xFFFFFC00UL) == 0xD800)
+
+/* test if a character is a low surrogate, in U+DC00-U+DFFF */
+#define IS_LOW_SURROGATE(ch) (((ch) & 0xFFFFFC00UL) == 0xDC00)
+
+/* high surrogate = top 10 bits added to D800,
sasha 2011/08/19 02:21:01 This is dangerously close to restating clear code
+ ordinal has to be in [0x00000; 0xFFFFF]: subtract 0x10000 from the code point first */
sasha 2011/08/19 02:21:01 Please be consistent in how you spell ranges.
+#define HIGH_SURROGATE(ordinal) (0xD800 | ((ordinal) >> 10))
+
+/* low surrogate = bottom 10 bits added to DC00.
+ ordinal has to be in [0x00000; 0xFFFFF]: subtract 0x10000 from the code point first */
+#define LOW_SURROGATE(ordinal) (0xDC00 | ((ordinal) & 0x3FF))
+
+/* combine the two surrogates to form a UCS4 value */
+#define COMBINE_SURROGATES(ch1, ch2) \
+ (((((Py_UCS4)(ch1) & 0x3FF) << 10) | ((Py_UCS4)(ch2) & 0x3FF)) + 0x10000)
+
/* --- Globals ------------------------------------------------------------
The globals are initialized by the _PyUnicode_Init() API and should
@@ -626,8 +651,8 @@ PyUnicode_FromWideChar(register const wc
if (*w > 0xFFFF) {
wchar_t ordinal = *w++;
ordinal -= 0x10000;
- *u++ = 0xD800 | (ordinal >> 10);
- *u++ = 0xDC00 | (ordinal & 0x3FF);
+ *u++ = HIGH_SURROGATE(ordinal);
+ *u++ = LOW_SURROGATE(ordinal);
}
else
*u++ = *w++;
@@ -1037,8 +1062,8 @@ PyUnicode_FromFormatV(const char *format
#ifndef Py_UNICODE_WIDE
if (ordinal > 0xffff) {
ordinal -= 0x10000;
- *s++ = 0xD800 | (ordinal >> 10);
- *s++ = 0xDC00 | (ordinal & 0x3FF);
+ *s++ = HIGH_SURROGATE(ordinal);
+ *s++ = LOW_SURROGATE(ordinal);
} else
#endif
*s++ = ordinal;
@@ -1240,10 +1265,9 @@ unicode_aswidechar(PyUnicodeObject *unic
worig = w;
wend = w + size;
while (u != uend && w != wend) {
- if (0xD800 <= u[0] && u[0] <= 0xDBFF
- && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
+ if (IS_HIGH_SURROGATE(u[0]) && IS_LOW_SURROGATE(u[1]))
{
- *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
+ *w = COMBINE_SURROGATES(u[0], u[1]);
u += 2;
}
else {
@@ -1259,8 +1283,7 @@ unicode_aswidechar(PyUnicodeObject *unic
else {
nchar = 1; /* nul character at the end */
while (u != uend) {
- if (0xD800 <= u[0] && u[0] <= 0xDBFF
- && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
+ if (IS_HIGH_SURROGATE(u[0]) && IS_LOW_SURROGATE(u[1]))
u += 2;
else
u++;
@@ -1283,8 +1306,8 @@ unicode_aswidechar(PyUnicodeObject *unic
ordinal = *u;
if (ordinal > 0xffff) {
ordinal -= 0x10000;
- *w++ = 0xD800 | (ordinal >> 10);
- *w++ = 0xDC00 | (ordinal & 0x3FF);
+ *w++ = HIGH_SURROGATE(ordinal);
+ *w++ = LOW_SURROGATE(ordinal);
}
else
*w++ = ordinal;
@@ -1367,8 +1390,8 @@ PyUnicode_FromOrdinal(int ordinal)
#ifndef Py_UNICODE_WIDE
if (ordinal > 0xffff) {
ordinal -= 0x10000;
- s[0] = 0xD800 | (ordinal >> 10);
- s[1] = 0xDC00 | (ordinal & 0x3FF);
+ s[0] = HIGH_SURROGATE(ordinal);
+ s[1] = LOW_SURROGATE(ordinal);
return PyUnicode_FromUnicode(s, 2);
}
#endif
@@ -2303,10 +2326,9 @@ PyUnicode_DecodeUTF7Stateful(const char
base64buffer &= (1 << base64bits) - 1; /* clear high bits */
if (surrogate) {
/* expecting a second surrogate */
- if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
+ if (IS_LOW_SURROGATE(outCh)) {
#ifdef Py_UNICODE_WIDE
- *p++ = (((surrogate & 0x3FF)<<10)
- | (outCh & 0x3FF)) + 0x10000;
+ *p++ = COMBINE_SURROGATES(surrogate, outCh);
#else
*p++ = surrogate;
*p++ = outCh;
@@ -2319,11 +2341,11 @@ PyUnicode_DecodeUTF7Stateful(const char
goto utf7Error;
}
}
- else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
+ else if (IS_HIGH_SURROGATE(outCh)) {
/* first surrogate */
surrogate = outCh;
}
- else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
+ else if (IS_LOW_SURROGATE(outCh)) {
errmsg = "unexpected second surrogate";
goto utf7Error;
}
@@ -2509,16 +2531,17 @@ PyUnicode_EncodeUTF7(const Py_UNICODE *s
continue;
encode_char:
#ifdef Py_UNICODE_WIDE
- if (ch >= 0x10000) {
+ if (IS_NONBMP(ch)) {
+ base64bits += 16;
/* code first surrogate */
- base64bits += 16;
- base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
+ ch -= 0x10000;
+ base64buffer = (base64buffer << 16) | HIGH_SURROGATE(ch);
+ /* prepare second surrogate */
+ ch = LOW_SURROGATE(ch);
while (base64bits >= 6) {
*out++ = TO_BASE64(base64buffer >> (base64bits-6));
base64bits -= 6;
}
- /* prepare second surrogate */
- ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
}
#endif
base64bits += 16;
@@ -2773,10 +2796,10 @@ PyUnicode_DecodeUTF8Stateful(const char
ch -= 0x10000;
/* high surrogate = top 10 bits added to D800 */
- *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
+ *p++ = (Py_UNICODE)HIGH_SURROGATE(ch);
/* low surrogate = bottom 10 bits added to DC00 */
- *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
+ *p++ = (Py_UNICODE)LOW_SURROGATE(ch);
#endif
break;
}
@@ -2908,10 +2931,10 @@ _Py_DecodeUTF8_surrogateescape(const cha
ch -= 0x10000;
/* high surrogate = top 10 bits added to D800 */
- *p++ = (wchar_t)(0xD800 + (ch >> 10));
+ *p++ = (wchar_t)HIGH_SURROGATE(ch);
/* low surrogate = bottom 10 bits added to DC00 */
- *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
+ *p++ = (wchar_t)LOW_SURROGATE(ch);
#endif
break;
}
@@ -2983,13 +3006,13 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s
/* Encode Latin-1 */
*p++ = (char)(0xc0 | (ch >> 6));
*p++ = (char)(0x80 | (ch & 0x3f));
- } else if (0xD800 <= ch && ch <= 0xDFFF) {
+ } else if (IS_SURROGATE(ch)) {
#ifndef Py_UNICODE_WIDE
/* Special case: check for high and low surrogate */
- if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
+ if (ch <= 0xDBFF && i != size && IS_LOW_SURROGATE(s[i])) {
Py_UCS4 ch2 = s[i];
/* Combine the two surrogates to form a UCS4 value */
- ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
+ ch = COMBINE_SURROGATES(ch, ch2);
i++;
/* Encode UCS4 Unicode ordinals */
@@ -3061,7 +3084,7 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s
#ifndef Py_UNICODE_WIDE
}
#endif
- } else if (ch < 0x10000) {
+ } else if (!IS_NONBMP(ch)) {
*p++ = (char)(0xe0 | (ch >> 12));
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*p++ = (char)(0x80 | (ch & 0x3f));
@@ -3248,10 +3271,11 @@ PyUnicode_DecodeUTF32Stateful(const char
goto utf32Error;
}
#ifndef Py_UNICODE_WIDE
- if (ch >= 0x10000)
+ if (IS_NONBMP(ch))
{
- *p++ = 0xD800 | ((ch-0x10000) >> 10);
- *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
+ ch -= 0x10000;
+ *p++ = HIGH_SURROGATE(ch);
+ *p++ = LOW_SURROGATE(ch);
}
else
#endif
@@ -3323,8 +3347,7 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *
so we need less space. */
#ifndef Py_UNICODE_WIDE
for (i = pairs = 0; i < size-1; i++)
- if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
- 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
+ if (IS_HIGH_SURROGATE(s[i]) && IS_LOW_SURROGATE(s[i+1]))
pairs++;
#endif
nsize = (size - pairs + (byteorder == 0));
@@ -3359,10 +3382,10 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *
while (size-- > 0) {
Py_UCS4 ch = *s++;
#ifndef Py_UNICODE_WIDE
- if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
+ if (IS_HIGH_SURROGATE(ch) && size > 0) {
Py_UCS4 ch2 = *s;
- if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
- ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
+ if (IS_LOW_SURROGATE(ch2)) {
+ ch = COMBINE_SURROGATES(ch, ch2);
s++;
size--;
}
@@ -3582,7 +3605,7 @@ PyUnicode_DecodeUTF16Stateful(const char
q += 2;
- if (ch < 0xD800 || ch > 0xDFFF) {
+ if (!IS_SURROGATE(ch)) {
*p++ = ch;
continue;
}
@@ -3594,15 +3617,15 @@ PyUnicode_DecodeUTF16Stateful(const char
endinpos = ((const char *)e) + 1 - starts;
goto utf16Error;
}
- if (0xD800 <= ch && ch <= 0xDBFF) {
+ if (IS_HIGH_SURROGATE(ch)) {
Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
q += 2;
- if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
+ if (IS_LOW_SURROGATE(ch2)) {
#ifndef Py_UNICODE_WIDE
*p++ = ch;
*p++ = ch2;
#else
- *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
+ *p++ = COMBINE_SURROGATES(ch, ch2);
#endif
continue;
}
@@ -3716,7 +3739,7 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *
#ifdef Py_UNICODE_WIDE
for (i = pairs = 0; i < size; i++)
- if (s[i] >= 0x10000)
+ if (IS_NONBMP(s[i]))
pairs++;
#endif
/* 2 * (size + pairs + (byteorder == 0)) */
@@ -3750,16 +3773,20 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *
while (size-- > 0) {
Py_UNICODE ch = *s++;
- Py_UNICODE ch2 = 0;
#ifdef Py_UNICODE_WIDE
- if (ch >= 0x10000) {
- ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
- ch = 0xD800 | ((ch-0x10000) >> 10);
- }
+ Py_UNICODE ch2;
+ if (IS_NONBMP(ch)) {
+ Py_UCS4 ordinal = ch - 0x10000;
+ ch = HIGH_SURROGATE(ordinal);
+ STORECHAR(ch);
+ ch2 = LOW_SURROGATE(ordinal);
+ STORECHAR(ch2);
+ }
+ else
+ STORECHAR(ch);
+#else
+ STORECHAR(ch);
#endif
- STORECHAR(ch);
- if (ch2)
- STORECHAR(ch2);
}
done:
@@ -4101,7 +4128,7 @@ PyUnicode_EncodeUnicodeEscape(const Py_U
#ifdef Py_UNICODE_WIDE
/* Map 21-bit characters to '\U00xxxxxx' */
- else if (ch >= 0x10000) {
+ else if (IS_NONBMP(ch)) {
*p++ = '\\';
*p++ = 'U';
*p++ = hexdigits[(ch >> 28) & 0x0000000F];
@@ -4122,8 +4149,8 @@ PyUnicode_EncodeUnicodeEscape(const Py_U
ch2 = *s++;
size--;
- if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
- ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
+ if (IS_LOW_SURROGATE(ch2)) {
+ ucs = COMBINE_SURROGATES(ch, ch2);
*p++ = '\\';
*p++ = 'U';
*p++ = hexdigits[(ucs >> 28) & 0x0000000F];
@@ -4346,7 +4373,7 @@ PyUnicode_EncodeRawUnicodeEscape(const P
Py_UNICODE ch = *s++;
#ifdef Py_UNICODE_WIDE
/* Map 32-bit characters to '\Uxxxxxxxx' */
- if (ch >= 0x10000) {
+ if (IS_NONBMP(ch)) {
*p++ = '\\';
*p++ = 'U';
*p++ = hexdigits[(ch >> 28) & 0xf];
@@ -4367,8 +4394,8 @@ PyUnicode_EncodeRawUnicodeEscape(const P
ch2 = *s++;
size--;
- if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
- ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
+ if (IS_LOW_SURROGATE(ch2)) {
+ ucs = COMBINE_SURROGATES(ch, ch2);
*p++ = '\\';
*p++ = 'U';
*p++ = hexdigits[(ucs >> 28) & 0xf];
@@ -8051,10 +8078,10 @@ decode_ucs4(const Py_UNICODE *s, Py_ssiz
assert(*i < size);
ch = s[(*i)++];
#ifndef Py_UNICODE_WIDE
- if ((ch & 0xfffffc00) == 0xd800 &&
- *i < size
- && (s[*i] & 0xFFFFFC00) == 0xDC00)
- ch = ((Py_UCS4)ch << 10UL) + (Py_UCS4)(s[(*i)++]) - 0x35fdc00;
+ if (IS_HIGH_SURROGATE(ch) && *i < size && IS_LOW_SURROGATE(s[*i])) {
+ ch = COMBINE_SURROGATES(ch, s[*i]);
+ (*i)++;
+ }
#endif
return ch;
}
@@ -8536,10 +8563,8 @@ unicode_repr(PyObject *unicode)
/* Get code point from surrogate pair */
if (size > 0) {
ch2 = *s;
- if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
- && ch2 <= 0xDFFF) {
- ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
- + 0x00010000;
+ if (IS_HIGH_SURROGATE(ch) && IS_LOW_SURROGATE(ch2)) {
+ ucs = COMBINE_SURROGATES(ch, ch2);
s++;
size--;
}
@@ -8557,7 +8582,7 @@ unicode_repr(PyObject *unicode)
*p++ = hexdigits[ch & 0x000F];
}
/* Map 21-bit characters to '\U00xxxxxx' */
- else if (ucs >= 0x10000) {
+ else if (IS_NONBMP(ucs)) {
*p++ = '\\';
*p++ = 'U';
*p++ = hexdigits[(ucs >> 28) & 0x0000000F];
@@ -8583,7 +8608,7 @@ unicode_repr(PyObject *unicode)
else {
*p++ = ch;
#ifndef Py_UNICODE_WIDE
- if (ucs >= 0x10000)
+ if (IS_NONBMP(ucs))
*p++ = ch2;
#endif
}
@@ -9462,8 +9487,7 @@ formatchar(Py_UNICODE *buf,
/* Decode a valid surrogate pair */
int c0 = PyUnicode_AS_UNICODE(v)[0];
int c1 = PyUnicode_AS_UNICODE(v)[1];
- if (0xD800 <= c0 && c0 <= 0xDBFF &&
- 0xDC00 <= c1 && c1 <= 0xDFFF) {
+ if (IS_HIGH_SURROGATE(c0) && IS_LOW_SURROGATE(c1)) {
buf[0] = c0;
buf[1] = c1;
buf[2] = '\0';
@@ -9489,8 +9513,8 @@ formatchar(Py_UNICODE *buf,
#ifndef Py_UNICODE_WIDE
if (x > 0xffff) {
x -= 0x10000;
- buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
- buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
+ buf[0] = (Py_UNICODE)HIGH_SURROGATE(x);
+ buf[1] = (Py_UNICODE)LOW_SURROGATE(x);
return 2;
}
#endif
« no previous file with comments | « no previous file | no next file » | no next file with comments »

RSS Feeds Recent Issues | This issue
This is Rietveld 894c83f36cb7+