Fold raw-unicode-escape encoding into unicodeescape_string().

unicodeescape_string() gains a `raw` flag, and
PyUnicode_EncodeRawUnicodeEscape() now delegates to it instead of carrying
its own copy of the escaping loop.  Behaviour visible in this patch:

  * when the result is not enclosed in quotes, single quotes are escaped
    (as \' in normal mode, as \u0027 in raw mode);
  * in raw mode a backslash is written as \u005c;
  * the worst-case buffer size is range-checked *before* it is computed,
    so the multiplication can no longer overflow Py_ssize_t.

NOTE(review): line breaks and indentation were restored from a
whitespace-collapsed copy of this patch; if context lines fail to match,
apply with `patch -l`.

Index: Objects/unicodeobject.c
===================================================================
--- Objects/unicodeobject.c (revision 77297)
+++ Objects/unicodeobject.c (working copy)
@@ -2959,13 +2959,6 @@
     return NULL;
 }
 
-/* Return a Unicode-Escape string version of the Unicode object.
-
-   If quotes is true, the string is enclosed in u"" or u'' quotes as
-   appropriate.
-
-*/
-
 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
                                              Py_ssize_t size,
                                              Py_UNICODE ch)
@@ -2981,28 +2974,43 @@
     return NULL;
 }
 
-static
-PyObject *unicodeescape_string(const Py_UNICODE *s,
-                               Py_ssize_t size,
-                               int quotes)
+/* Return a Unicode-Escape or Raw-Unicode-Escape string version of the
+ * Unicode object.
+ *
+ * If enclose_in_quotes is true, the returned string is enclosed in
+ * quotes as appropriate (u"", u'', ur"", or ur''). Otherwise, single
+ * quotes are always escaped.
+ *
+ * If raw is true, the returned string is suitable for use as a raw
+ * unicode literal. Otherwise, the returned string is suitable for
+ * use as a normal unicode literal.
+ */
+Py_LOCAL(PyObject *)
+unicodeescape_string(const Py_UNICODE *s,
+                     Py_ssize_t size,
+                     int enclose_in_quotes,
+                     int raw)
 {
+
+    /* string object to return */
     PyObject *repr;
+
+    /* pointer to repr's internal buffer, will be incremented as the
+     * contents are written */
     char *p;
 
-    static const char *hexdigit = "0123456789abcdef";
-#ifdef Py_UNICODE_WIDE
-    const Py_ssize_t expandsize = 10;
-#else
-    const Py_ssize_t expandsize = 6;
-#endif
+    static const char* const hexdigit = "0123456789abcdef";
 
-    /* XXX(nnorwitz): rather than over-allocating, it would be
-       better to choose a different scheme. Perhaps scan the
-       first N-chars of the string and allocate based on that size.
-    */
+    /* non-zero if single quotes should be escaped */
+    int escape_single_quotes = 1;
+
     /* Initial allocation is based on the longest-possible unichr
        escape.
 
+       XXX(nnorwitz): rather than over-allocating, it would be
+       better to choose a different scheme. Perhaps scan the
+       first N-chars of the string and allocate based on that size.
+
        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
        unichr, so in this case it's the longest unichr escape. In
        narrow (UTF-16) builds this is five chars per source unichr
@@ -3013,116 +3021,152 @@
        so in the narrow (UTF-16) build case it's the longest unichr
        escape.
     */
+#ifdef Py_UNICODE_WIDE
+    static const Py_ssize_t expandsize = 10;
+#else
+    static const Py_ssize_t expandsize = 6;
+#endif
 
-    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
+    /* raw unicode quotes add 4 characters: ur''
+     * normal unicode quotes add 3 characters: u''
+     */
+    const Py_ssize_t enclosingQuotesSize = enclose_in_quotes ?
+        (raw ? 4 : 3) : 0;
+
+    /* length of the new string object: (size * expandsize) for the
+     * worst-case escaped unicode literal, plus enclosingQuotesSize.
+     * Assigned only after the overflow check below has passed.
+     */
+    Py_ssize_t maxSize;
+
+    if (size > (PY_SSIZE_T_MAX - enclosingQuotesSize) / expandsize)
         return PyErr_NoMemory();
+    maxSize = (size * expandsize) + enclosingQuotesSize;
 
-    repr = PyString_FromStringAndSize(NULL,
-                                      2
-                                      + expandsize*size
-                                      + 1);
+    repr = PyString_FromStringAndSize(NULL, maxSize);
     if (repr == NULL)
         return NULL;
 
     p = PyString_AS_STRING(repr);
 
-    if (quotes) {
+    if (enclose_in_quotes) {
         *p++ = 'u';
-        *p++ = (findchar(s, size, '\'') &&
-                !findchar(s, size, '"')) ? '"' : '\'';
+        if (raw) *p++ = 'r';
+        if (findchar(s, size, '\'') && !findchar(s, size, '"')) {
+            *p++ = '"';
+            escape_single_quotes = 0;
+        } else {
+            *p++ = '\'';
+        }
     }
     while (size-- > 0) {
-        Py_UNICODE ch = *s++;
+        Py_UCS4 ch = *s++;
 
-        /* Escape quotes and backslashes */
-        if ((quotes &&
-             ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
-            *p++ = '\\';
-            *p++ = (char) ch;
-            continue;
-        }
-
-#ifdef Py_UNICODE_WIDE
-        /* Map 21-bit characters to '\U00xxxxxx' */
-        else if (ch >= 0x10000) {
-            *p++ = '\\';
-            *p++ = 'U';
-            *p++ = hexdigit[(ch >> 28) & 0x0000000F];
-            *p++ = hexdigit[(ch >> 24) & 0x0000000F];
-            *p++ = hexdigit[(ch >> 20) & 0x0000000F];
-            *p++ = hexdigit[(ch >> 16) & 0x0000000F];
-            *p++ = hexdigit[(ch >> 12) & 0x0000000F];
-            *p++ = hexdigit[(ch >> 8) & 0x0000000F];
-            *p++ = hexdigit[(ch >> 4) & 0x0000000F];
-            *p++ = hexdigit[ch & 0x0000000F];
-            continue;
-        }
-#else
-        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
-        else if (ch >= 0xD800 && ch < 0xDC00) {
+#ifndef Py_UNICODE_WIDE
+        /* decode UTF-16 surrogate pairs to UCS-4 */
+        if (ch >= 0xD800 && ch < 0xDC00 && size) {
             Py_UNICODE ch2;
-            Py_UCS4 ucs;
 
             ch2 = *s++;
             size--;
             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
-                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
-                *p++ = '\\';
-                *p++ = 'U';
-                *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
-                *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
-                *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
-                *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
-                *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
-                *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
-                *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
-                *p++ = hexdigit[ucs & 0x0000000F];
-                continue;
+                ch = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
+            } else {
+                /* back up; this is an isolated surrogate, not a
+                 * surrogate pair. isolated surrogates are copied
+                 * as-is
+                 */
+                s--;
+                size++;
             }
-            /* Fall through: isolated surrogates are copied as-is */
-            s--;
-            size++;
         }
 #endif
 
+        /* Map 21-bit characters to '\U00xxxxxx' */
+        if (ch >= 0x10000) {
+            *p++ = '\\';
+            *p++ = 'U';
+            *p++ = hexdigit[(ch >> 28) & 0xf];
+            *p++ = hexdigit[(ch >> 24) & 0xf];
+            *p++ = hexdigit[(ch >> 20) & 0xf];
+            *p++ = hexdigit[(ch >> 16) & 0xf];
+            *p++ = hexdigit[(ch >> 12) & 0xf];
+            *p++ = hexdigit[(ch >> 8) & 0xf];
+            *p++ = hexdigit[(ch >> 4) & 0xf];
+            *p++ = hexdigit[ch & 0xf];
+            continue;
+        }
+
         /* Map 16-bit characters to '\uxxxx' */
         if (ch >= 256) {
             *p++ = '\\';
             *p++ = 'u';
-            *p++ = hexdigit[(ch >> 12) & 0x000F];
-            *p++ = hexdigit[(ch >> 8) & 0x000F];
-            *p++ = hexdigit[(ch >> 4) & 0x000F];
-            *p++ = hexdigit[ch & 0x000F];
+            *p++ = hexdigit[(ch >> 12) & 0xf];
+            *p++ = hexdigit[(ch >> 8) & 0xf];
+            *p++ = hexdigit[(ch >> 4) & 0xf];
+            *p++ = hexdigit[ch & 0xf];
+            continue;
         }
 
-        /* Map special whitespace to '\t', \n', '\r' */
-        else if (ch == '\t') {
-            *p++ = '\\';
-            *p++ = 't';
-        }
-        else if (ch == '\n') {
-            *p++ = '\\';
-            *p++ = 'n';
-        }
-        else if (ch == '\r') {
-            *p++ = '\\';
-            *p++ = 'r';
-        }
+        if (raw) {
 
-        /* Map non-printable US ASCII to '\xhh' */
-        else if (ch < ' ' || ch >= 0x7F) {
-            *p++ = '\\';
-            *p++ = 'x';
-            *p++ = hexdigit[(ch >> 4) & 0x000F];
-            *p++ = hexdigit[ch & 0x000F];
+            /* escape quotes and backslashes. unicode escape
+             * sequences are used because just adding a backslash
+             * changes the value of the raw unicode literal (the
+             * backslash cancels the special behavior of the next
+             * character, but the backslash itself is not removed).
+             */
+            if ((escape_single_quotes && ch == '\'') || ch == '\\') {
+                *p++ = '\\';
+                *p++ = 'u';
+                *p++ = hexdigit[(ch >> 12) & 0xf];
+                *p++ = hexdigit[(ch >> 8) & 0xf];
+                *p++ = hexdigit[(ch >> 4) & 0xf];
+                *p++ = hexdigit[ch & 0xf];
+                continue;
+            }
+
+        } else {
+
+            /* Escape quotes and backslashes */
+            if ((escape_single_quotes && ch == '\'') || ch == '\\') {
+                *p++ = '\\';
+                *p++ = (char) ch;
+                continue;
+            }
+
+            /* Map special whitespace to '\t', '\n', '\r' */
+            if (ch == '\t') {
+                *p++ = '\\';
+                *p++ = 't';
+                continue;
+            }
+            if (ch == '\n') {
+                *p++ = '\\';
+                *p++ = 'n';
+                continue;
+            }
+            if (ch == '\r') {
+                *p++ = '\\';
+                *p++ = 'r';
+                continue;
+            }
+
+            /* Map non-printable US ASCII to '\xhh' */
+            if (ch < ' ' || ch >= 0x7F) {
+                *p++ = '\\';
+                *p++ = 'x';
+                *p++ = hexdigit[(ch >> 4) & 0xf];
+                *p++ = hexdigit[ch & 0xf];
+                continue;
+            }
         }
 
         /* Copy everything else as-is */
-        else
-            *p++ = (char) ch;
+        *p++ = (char) ch;
     }
-    if (quotes)
-        *p++ = PyString_AS_STRING(repr)[1];
+    if (enclose_in_quotes)
+        *p++ = PyString_AS_STRING(repr)[raw ? 2 : 1];
 
     *p = '\0';
     _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
@@ -3132,7 +3176,7 @@
 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
                                         Py_ssize_t size)
 {
-    return unicodeescape_string(s, size, 0);
+    return unicodeescape_string(s, size, 0, 0);
 }
 
 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
@@ -3267,87 +3311,7 @@
 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
                                            Py_ssize_t size)
 {
-    PyObject *repr;
-    char *p;
-    char *q;
-
-    static const char *hexdigit = "0123456789abcdef";
-#ifdef Py_UNICODE_WIDE
-    const Py_ssize_t expandsize = 10;
-#else
-    const Py_ssize_t expandsize = 6;
-#endif
-
-    if (size > PY_SSIZE_T_MAX / expandsize)
-        return PyErr_NoMemory();
-
-    repr = PyString_FromStringAndSize(NULL, expandsize * size);
-    if (repr == NULL)
-        return NULL;
-    if (size == 0)
-        return repr;
-
-    p = q = PyString_AS_STRING(repr);
-    while (size-- > 0) {
-        Py_UNICODE ch = *s++;
-#ifdef Py_UNICODE_WIDE
-        /* Map 32-bit characters to '\Uxxxxxxxx' */
-        if (ch >= 0x10000) {
-            *p++ = '\\';
-            *p++ = 'U';
-            *p++ = hexdigit[(ch >> 28) & 0xf];
-            *p++ = hexdigit[(ch >> 24) & 0xf];
-            *p++ = hexdigit[(ch >> 20) & 0xf];
-            *p++ = hexdigit[(ch >> 16) & 0xf];
-            *p++ = hexdigit[(ch >> 12) & 0xf];
-            *p++ = hexdigit[(ch >> 8) & 0xf];
-            *p++ = hexdigit[(ch >> 4) & 0xf];
-            *p++ = hexdigit[ch & 15];
-        }
-        else
-#else
-        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
-        if (ch >= 0xD800 && ch < 0xDC00) {
-            Py_UNICODE ch2;
-            Py_UCS4 ucs;
-
-            ch2 = *s++;
-            size--;
-            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
-                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
-                *p++ = '\\';
-                *p++ = 'U';
-                *p++ = hexdigit[(ucs >> 28) & 0xf];
-                *p++ = hexdigit[(ucs >> 24) & 0xf];
-                *p++ = hexdigit[(ucs >> 20) & 0xf];
-                *p++ = hexdigit[(ucs >> 16) & 0xf];
-                *p++ = hexdigit[(ucs >> 12) & 0xf];
-                *p++ = hexdigit[(ucs >> 8) & 0xf];
-                *p++ = hexdigit[(ucs >> 4) & 0xf];
-                *p++ = hexdigit[ucs & 0xf];
-                continue;
-            }
-            /* Fall through: isolated surrogates are copied as-is */
-            s--;
-            size++;
-        }
-#endif
-        /* Map 16-bit characters to '\uxxxx' */
-        if (ch >= 256) {
-            *p++ = '\\';
-            *p++ = 'u';
-            *p++ = hexdigit[(ch >> 12) & 0xf];
-            *p++ = hexdigit[(ch >> 8) & 0xf];
-            *p++ = hexdigit[(ch >> 4) & 0xf];
-            *p++ = hexdigit[ch & 15];
-        }
-        /* Copy everything else as-is */
-        else
-            *p++ = (char) ch;
-    }
-    *p = '\0';
-    _PyString_Resize(&repr, p - q);
-    return repr;
+    return unicodeescape_string(s, size, 0, 1);
 }
 
 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
@@ -7490,7 +7454,7 @@
 {
     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
                                 PyUnicode_GET_SIZE(unicode),
-                                1);
+                                1, 0);
 }
 
 PyDoc_STRVAR(rfind__doc__,