diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -956,6 +956,16 @@
     Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
     );
 
+/* --- Internal Escape Encoding ------------------------------------------ */
+
+PyAPI_FUNC(PyObject *) _PyUnicode_EncodeCustomUnicodeEscape(
+    const Py_UNICODE *s,        /* Unicode char buffer to encode */
+    Py_ssize_t size,            /* Number of Py_UNICODE chars to encode */
+    int enclose_in_quotes,      /* Whether to add u{r}'' to return value */
+    int raw,                    /* Whether to encode to a raw literal */
+    const char* extra_raw_escape /* Additional chars to escape */
+    );
+
 /* --- Unicode Internal Codec ---------------------------------------------
 
 Only for internal use in _codecsmodule.c */
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -23,6 +23,8 @@
 - Issue #7615: The Unicode escape encoders now check to make sure that
   the provided size is nonnegative.
 
+- Issue #7615: Eliminated duplicate Unicode escape code.
+
 - Issue #2335: Backport set literals syntax from Python 3.x.
 
 Library
@@ -34,6 +36,17 @@
   necessary to ensure consistency in the decoded value now that the
   raw_unicode_escape encoder escapes backslashes.
 
+- Issue #7615: Removed cPickle's modified_EncodeRawUnicodeEscape()
+  function; cPickle now uses _PyUnicode_EncodeCustomUnicodeEscape()
+  instead.
+
+C-API
+-----
+
+- Issue #7615: Add new _PyUnicode_EncodeCustomUnicodeEscape() function
+  (renamed from unicodeescape_string()) to expose internal Unicode
+  escape encoding to eliminate duplicate code in cPickle.
+
 What's New in Python 2.7 alpha 2?
 =================================
 
diff --git a/Modules/cPickle.c b/Modules/cPickle.c
--- a/Modules/cPickle.c
+++ b/Modules/cPickle.c
@@ -1281,100 +1281,6 @@
 
 
 #ifdef Py_USING_UNICODE
-/* A copy of PyUnicode_EncodeRawUnicodeEscape() that also translates
-   backslash and newline characters to \uXXXX escapes. */
-static PyObject *
-modified_EncodeRawUnicodeEscape(const Py_UNICODE *s, Py_ssize_t size)
-{
-    PyObject *repr;
-    char *p;
-    char *q;
-
-    static const char *hexdigit = "0123456789abcdef";
-#ifdef Py_UNICODE_WIDE
-    const Py_ssize_t expandsize = 10;
-#else
-    const Py_ssize_t expandsize = 6;
-#endif
-
-    /* make sure size is nonnegative */
-    if (size < 0) {
-        PyErr_BadInternalCall();
-        return NULL;
-    }
-
-    if (size > PY_SSIZE_T_MAX / expandsize)
-        return PyErr_NoMemory();
-
-    repr = PyString_FromStringAndSize(NULL, expandsize * size);
-    if (repr == NULL)
-        return NULL;
-    if (size == 0)
-        return repr;
-
-    p = q = PyString_AS_STRING(repr);
-    while (size-- > 0) {
-        Py_UNICODE ch = *s++;
-#ifdef Py_UNICODE_WIDE
-        /* Map 32-bit characters to '\Uxxxxxxxx' */
-        if (ch >= 0x10000) {
-            *p++ = '\\';
-            *p++ = 'U';
-            *p++ = hexdigit[(ch >> 28) & 0xf];
-            *p++ = hexdigit[(ch >> 24) & 0xf];
-            *p++ = hexdigit[(ch >> 20) & 0xf];
-            *p++ = hexdigit[(ch >> 16) & 0xf];
-            *p++ = hexdigit[(ch >> 12) & 0xf];
-            *p++ = hexdigit[(ch >> 8) & 0xf];
-            *p++ = hexdigit[(ch >> 4) & 0xf];
-            *p++ = hexdigit[ch & 15];
-        }
-        else
-#else
-        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
-        if (ch >= 0xD800 && ch < 0xDC00 && size) {
-            Py_UNICODE ch2;
-            Py_UCS4 ucs;
-
-            ch2 = *s++;
-            size--;
-            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
-                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
-                *p++ = '\\';
-                *p++ = 'U';
-                *p++ = hexdigit[(ucs >> 28) & 0xf];
-                *p++ = hexdigit[(ucs >> 24) & 0xf];
-                *p++ = hexdigit[(ucs >> 20) & 0xf];
-                *p++ = hexdigit[(ucs >> 16) & 0xf];
-                *p++ = hexdigit[(ucs >> 12) & 0xf];
-                *p++ = hexdigit[(ucs >> 8) & 0xf];
-                *p++ = hexdigit[(ucs >> 4) & 0xf];
-                *p++ = hexdigit[ucs & 0xf];
-                continue;
-            }
-            /* Fall through: isolated surrogates are copied as-is */
-            s--;
-            size++;
-        }
-#endif
-        /* Map 16-bit characters to '\uxxxx' */
-        if (ch >= 256 || ch == '\\' || ch == '\n') {
-            *p++ = '\\';
-            *p++ = 'u';
-            *p++ = hexdigit[(ch >> 12) & 0xf];
-            *p++ = hexdigit[(ch >> 8) & 0xf];
-            *p++ = hexdigit[(ch >> 4) & 0xf];
-            *p++ = hexdigit[ch & 15];
-        }
-        /* Copy everything else as-is */
-        else
-            *p++ = (char) ch;
-    }
-    *p = '\0';
-    _PyString_Resize(&repr, p - q);
-    return repr;
-}
-
 static int
 save_unicode(Picklerobject *self, PyObject *args, int doput)
 {
@@ -1388,8 +1294,9 @@
         char *repr_str;
         static char string = UNICODE;
 
-        repr = modified_EncodeRawUnicodeEscape(
-            PyUnicode_AS_UNICODE(args), PyUnicode_GET_SIZE(args));
+        repr = _PyUnicode_EncodeCustomUnicodeEscape(
+            PyUnicode_AS_UNICODE(args), PyUnicode_GET_SIZE(args),
+            0, 1, "\n");
         if (!repr)
             return -1;
 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2974,37 +2974,54 @@
     return NULL;
 }
 
-/* Return a Unicode-Escape string version of the Unicode object.
+/* Return a Unicode-Escape or Raw-Unicode-Escape string version of the
+ * Unicode object.
  *
- * If enclose_in_quotes is true, the string is enclosed in u"" or u''
- * quotes as appropriate. Otherwise, single and double quotes are
- * always escaped.
+ * The size parameter must be nonnegative.
+ *
+ * If enclose_in_quotes is true, the returned string is enclosed in
+ * quotes as appropriate (u"", u'', ur"", or ur''). Otherwise, single
+ * and double quotes are always escaped.
+ *
+ * If raw is true, the returned string is suitable for use as a raw
+ * Unicode literal. Otherwise, the returned string is suitable for
+ * use as a normal Unicode literal.
+ *
+ * extra_raw_escape is a nul-terminated string of characters that
+ * should be escaped in addition to what is already escaped. If NULL
+ * or the empty string, no extra characters will be escaped. If
+ * non-NULL and raw is false, the behavior is undefined.
+ *
+ * Returns a new string object. On failure (negative size, a worst-case
+ * output length that would overflow, or allocation failure) returns
+ * NULL with an exception set.
  */
-static
-PyObject *unicodeescape_string(const Py_UNICODE *s,
-                               Py_ssize_t size,
-                               int enclose_in_quotes)
-{
+PyObject *
+_PyUnicode_EncodeCustomUnicodeEscape(const Py_UNICODE *s,
+                                     Py_ssize_t size,
+                                     int enclose_in_quotes,
+                                     int raw,
+                                     const char* extra_raw_escape)
+{
+    /* string object to return */
     PyObject *repr;
+
+    /* pointer to repr's internal buffer, will be incremented as the
+     * contents are written */
     char *p;
 
     /* non-zero if quotes should be escaped */
     int escape_single_quotes = 1;
     int escape_double_quotes = 1;
 
-    static const char *hexdigit = "0123456789abcdef";
-#ifdef Py_UNICODE_WIDE
-    const Py_ssize_t expandsize = 10;
-#else
-    const Py_ssize_t expandsize = 6;
-#endif
-
-    /* XXX(nnorwitz): rather than over-allocating, it would be
+    static const char* const hexdigit = "0123456789abcdef";
+
+    /* Initial allocation is based on the longest-possible unichr
+       escape.
+
+       XXX(nnorwitz): rather than over-allocating, it would be
        better to choose a different scheme. Perhaps scan the
        first N-chars of the string and allocate based on that size.
-    */
-    /* Initial allocation is based on the longest-possible unichr
-       escape.
 
        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
        unichr, so in this case it's the longest unichr escape. In
@@ -3016,6 +3033,23 @@
        so in the narrow (UTF-16) build case it's the longest unichr
        escape.
     */
+#ifdef Py_UNICODE_WIDE
+    static const Py_ssize_t expandsize = 10;
+#else
+    static const Py_ssize_t expandsize = 6;
+#endif
+
+    /* raw unicode quotes add 4 characters: ur''
+     * normal unicode quotes add 3 characters: u''
+     */
+    const Py_ssize_t enclosingQuotesSize = enclose_in_quotes ?
+        (raw ? 4 : 3) : 0;
+
+    /* length of the new string object: (size * expandsize) for the
+     * worst-case escaped unicode literal, plus enclosingQuotesSize;
+     * assigned below, once size has been range-checked
+     */
+    Py_ssize_t maxSize;
 
     /* make sure size is nonnegative */
     if (size < 0) {
@@ -3023,21 +3057,27 @@
         return NULL;
     }
 
-    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
+    /* check for overflow before doing the multiplication */
+    if (size > (PY_SSIZE_T_MAX - enclosingQuotesSize) / expandsize)
         return PyErr_NoMemory();
+    maxSize = (size * expandsize) + enclosingQuotesSize;
 
-    repr = PyString_FromStringAndSize(NULL,
-                                      2
-                                      + expandsize*size
-                                      + 1);
+    repr = PyString_FromStringAndSize(NULL, maxSize);
     if (repr == NULL)
         return NULL;
 
+    /* Empty string objects are shared, so we can't call
+     * _PyString_Resize() on them. Thus, we should return now.
+     */
+    if (maxSize == 0)
+        return repr;
+
     p = PyString_AS_STRING(repr);
 
     if (enclose_in_quotes) {
         escape_double_quotes = 0;
         *p++ = 'u';
+        if (raw) *p++ = 'r';
         if (findchar(s, size, '\'') && !findchar(s, size, '"')) {
             *p++ = '"';
             escape_single_quotes = 0;
@@ -3046,98 +3086,132 @@
         }
     }
     while (size-- > 0) {
-        Py_UNICODE ch = *s++;
-
-        /* Escape quotes and backslashes */
-        if ((escape_single_quotes && ch == '\'') ||
-            (escape_double_quotes && ch == '"') ||
-            ch == '\\') {
-            *p++ = '\\';
-            *p++ = (char) ch;
-            continue;
-        }
-
-#ifdef Py_UNICODE_WIDE
-        /* Map 21-bit characters to '\U00xxxxxx' */
-        else if (ch >= 0x10000) {
-            *p++ = '\\';
-            *p++ = 'U';
-            *p++ = hexdigit[(ch >> 28) & 0x0000000F];
-            *p++ = hexdigit[(ch >> 24) & 0x0000000F];
-            *p++ = hexdigit[(ch >> 20) & 0x0000000F];
-            *p++ = hexdigit[(ch >> 16) & 0x0000000F];
-            *p++ = hexdigit[(ch >> 12) & 0x0000000F];
-            *p++ = hexdigit[(ch >> 8) & 0x0000000F];
-            *p++ = hexdigit[(ch >> 4) & 0x0000000F];
-            *p++ = hexdigit[ch & 0x0000000F];
-            continue;
-        }
-#else
-        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
-        else if (ch >= 0xD800 && ch < 0xDC00 && size) {
+        Py_UCS4 ch = *s++;
+
+#ifndef Py_UNICODE_WIDE
+        /* decode UTF-16 surrogate pairs to UCS-4 */
+        if (ch >= 0xD800 && ch < 0xDC00 && size) {
             Py_UNICODE ch2;
-            Py_UCS4 ucs;
 
             ch2 = *s++;
             size--;
             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
-                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
-                *p++ = '\\';
-                *p++ = 'U';
-                *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
-                *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
-                *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
-                *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
-                *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
-                *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
-                *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
-                *p++ = hexdigit[ucs & 0x0000000F];
-                continue;
-            }
-            /* Fall through: isolated surrogates are copied as-is */
-            s--;
-            size++;
+                ch = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
+            } else {
+                /* back up; this is an isolated surrogate, not a
+                 * surrogate pair. isolated surrogates are copied
+                 * as-is
+                 */
+                s--;
+                size++;
+            }
         }
 #endif
 
+        /* Map 21-bit characters to '\U00xxxxxx' */
+        if (ch >= 0x10000) {
+            *p++ = '\\';
+            *p++ = 'U';
+            *p++ = hexdigit[(ch >> 28) & 0xf];
+            *p++ = hexdigit[(ch >> 24) & 0xf];
+            *p++ = hexdigit[(ch >> 20) & 0xf];
+            *p++ = hexdigit[(ch >> 16) & 0xf];
+            *p++ = hexdigit[(ch >> 12) & 0xf];
+            *p++ = hexdigit[(ch >> 8) & 0xf];
+            *p++ = hexdigit[(ch >> 4) & 0xf];
+            *p++ = hexdigit[ch & 0xf];
+            continue;
+        }
+
         /* Map 16-bit characters to '\uxxxx' */
         if (ch >= 256) {
             *p++ = '\\';
             *p++ = 'u';
-            *p++ = hexdigit[(ch >> 12) & 0x000F];
-            *p++ = hexdigit[(ch >> 8) & 0x000F];
-            *p++ = hexdigit[(ch >> 4) & 0x000F];
-            *p++ = hexdigit[ch & 0x000F];
-        }
-
-        /* Map special whitespace to '\t', \n', '\r' */
-        else if (ch == '\t') {
-            *p++ = '\\';
-            *p++ = 't';
-        }
-        else if (ch == '\n') {
-            *p++ = '\\';
-            *p++ = 'n';
-        }
-        else if (ch == '\r') {
-            *p++ = '\\';
-            *p++ = 'r';
-        }
-
-        /* Map non-printable US ASCII to '\xhh' */
-        else if (ch < ' ' || ch >= 0x7F) {
-            *p++ = '\\';
-            *p++ = 'x';
-            *p++ = hexdigit[(ch >> 4) & 0x000F];
-            *p++ = hexdigit[ch & 0x000F];
+            *p++ = hexdigit[(ch >> 12) & 0xf];
+            *p++ = hexdigit[(ch >> 8) & 0xf];
+            *p++ = hexdigit[(ch >> 4) & 0xf];
+            *p++ = hexdigit[ch & 0xf];
+            continue;
+        }
+
+        if (raw) {
+            const char *tmp;
+
+            /* Escape quotes and backslashes. Unicode escape
+             * sequences are used because just adding a backslash
+             * changes the value of the raw Unicode literal (the
+             * backslash cancels the special behavior of the next
+             * character, but the backslash itself is not removed).
+             */
+            if ((escape_single_quotes && ch == '\'') ||
+                (escape_double_quotes && ch == '"') ||
+                ch == '\\') {
+                *p++ = '\\';
+                *p++ = 'u';
+                *p++ = hexdigit[(ch >> 12) & 0xf];
+                *p++ = hexdigit[(ch >> 8) & 0xf];
+                *p++ = hexdigit[(ch >> 4) & 0xf];
+                *p++ = hexdigit[ch & 0xf];
+                continue;
+            }
+
+            for (tmp = extra_raw_escape; tmp && *tmp; ++tmp) {
+                if (ch == (Py_UCS4)(unsigned char)*tmp) {
+                    *p++ = '\\';
+                    *p++ = 'u';
+                    *p++ = hexdigit[(ch >> 12) & 0xf];
+                    *p++ = hexdigit[(ch >> 8) & 0xf];
+                    *p++ = hexdigit[(ch >> 4) & 0xf];
+                    *p++ = hexdigit[ch & 0xf];
+                    goto next_ch;
+                }
+            }
+
+        } else {
+
+            /* Escape quotes and backslashes */
+            if ((escape_single_quotes && ch == '\'') ||
+                (escape_double_quotes && ch == '"') ||
+                ch == '\\') {
+                *p++ = '\\';
+                *p++ = (char) ch;
+                continue;
+            }
+
+            /* Map special whitespace to '\t', '\n', '\r' */
+            if (ch == '\t') {
+                *p++ = '\\';
+                *p++ = 't';
+                continue;
+            }
+            if (ch == '\n') {
+                *p++ = '\\';
+                *p++ = 'n';
+                continue;
+            }
+            if (ch == '\r') {
+                *p++ = '\\';
+                *p++ = 'r';
+                continue;
+            }
+
+            /* Map non-printable US ASCII to '\xhh' */
+            if (ch < ' ' || ch >= 0x7F) {
+                *p++ = '\\';
+                *p++ = 'x';
+                *p++ = hexdigit[(ch >> 4) & 0xf];
+                *p++ = hexdigit[ch & 0xf];
+                continue;
+            }
         }
 
         /* Copy everything else as-is */
-        else
-            *p++ = (char) ch;
+        *p++ = (char) ch;
+
+      next_ch:;
     }
     if (enclose_in_quotes)
-        *p++ = PyString_AS_STRING(repr)[1];
+        *p++ = PyString_AS_STRING(repr)[raw ? 2 : 1];
 
     *p = '\0';
     _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
@@ -3147,7 +3221,7 @@
 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
                                         Py_ssize_t size)
 {
-    return unicodeescape_string(s, size, 0);
+    return _PyUnicode_EncodeCustomUnicodeEscape(s, size, 0, 0, NULL);
 }
 
 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
@@ -3282,93 +3356,7 @@
 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
                                            Py_ssize_t size)
 {
-    PyObject *repr;
-    char *p;
-    char *q;
-
-    static const char *hexdigit = "0123456789abcdef";
-#ifdef Py_UNICODE_WIDE
-    const Py_ssize_t expandsize = 10;
-#else
-    const Py_ssize_t expandsize = 6;
-#endif
-
-    /* make sure size is nonnegative */
-    if (size < 0) {
-        PyErr_BadInternalCall();
-        return NULL;
-    }
-
-    if (size > PY_SSIZE_T_MAX / expandsize)
-        return PyErr_NoMemory();
-
-    repr = PyString_FromStringAndSize(NULL, expandsize * size);
-    if (repr == NULL)
-        return NULL;
-    if (size == 0)
-        return repr;
-
-    p = q = PyString_AS_STRING(repr);
-    while (size-- > 0) {
-        Py_UNICODE ch = *s++;
-#ifdef Py_UNICODE_WIDE
-        /* Map 32-bit characters to '\Uxxxxxxxx' */
-        if (ch >= 0x10000) {
-            *p++ = '\\';
-            *p++ = 'U';
-            *p++ = hexdigit[(ch >> 28) & 0xf];
-            *p++ = hexdigit[(ch >> 24) & 0xf];
-            *p++ = hexdigit[(ch >> 20) & 0xf];
-            *p++ = hexdigit[(ch >> 16) & 0xf];
-            *p++ = hexdigit[(ch >> 12) & 0xf];
-            *p++ = hexdigit[(ch >> 8) & 0xf];
-            *p++ = hexdigit[(ch >> 4) & 0xf];
-            *p++ = hexdigit[ch & 15];
-        }
-        else
-#else
-        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
-        if (ch >= 0xD800 && ch < 0xDC00 && size) {
-            Py_UNICODE ch2;
-            Py_UCS4 ucs;
-
-            ch2 = *s++;
-            size--;
-            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
-                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
-                *p++ = '\\';
-                *p++ = 'U';
-                *p++ = hexdigit[(ucs >> 28) & 0xf];
-                *p++ = hexdigit[(ucs >> 24) & 0xf];
-                *p++ = hexdigit[(ucs >> 20) & 0xf];
-                *p++ = hexdigit[(ucs >> 16) & 0xf];
-                *p++ = hexdigit[(ucs >> 12) & 0xf];
-                *p++ = hexdigit[(ucs >> 8) & 0xf];
-                *p++ = hexdigit[(ucs >> 4) & 0xf];
-                *p++ = hexdigit[ucs & 0xf];
-                continue;
-            }
-            /* Fall through: isolated surrogates are copied as-is */
-            s--;
-            size++;
-        }
-#endif
-        /* Map 16-bit characters, backslashes, and quotes to '\uxxxx' */
-        if (ch >= 256 || ch == '\\' || ch == '\'' || ch == '"') {
-            *p++ = '\\';
-            *p++ = 'u';
-            *p++ = hexdigit[(ch >> 12) & 0xf];
-            *p++ = hexdigit[(ch >> 8) & 0xf];
-            *p++ = hexdigit[(ch >> 4) & 0xf];
-            *p++ = hexdigit[ch & 15];
-        }
-        /* Copy everything else as-is */
-        else
-            *p++ = (char) ch;
-    }
-    *p = '\0';
-    _PyString_Resize(&repr, p - q);
-    return repr;
+    return _PyUnicode_EncodeCustomUnicodeEscape(s, size, 0, 1, NULL);
 }
 
 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
@@ -7509,9 +7497,9 @@
 static
 PyObject *unicode_repr(PyObject *unicode)
 {
-    return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
-                                PyUnicode_GET_SIZE(unicode),
-                                1);
+    return _PyUnicode_EncodeCustomUnicodeEscape(
+        PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
+        1, 0, NULL);
 }
 
 PyDoc_STRVAR(rfind__doc__,