diff -r 59a189a15933 Objects/unicodeobject.c --- a/Objects/unicodeobject.c Thu Oct 25 17:23:54 2012 -0700 +++ b/Objects/unicodeobject.c Sat Oct 27 01:47:23 2012 +0300 @@ -5286,61 +5286,6 @@ /* --- Unicode Escape Codec ----------------------------------------------- */ -/* Helper function for PyUnicode_DecodeUnicodeEscape, determines - if all the escapes in the string make it still a valid ASCII string. - Returns -1 if any escapes were found which cause the string to - pop out of ASCII range. Otherwise returns the length of the - required buffer to hold the string. - */ -static Py_ssize_t -length_of_escaped_ascii_string(const char *s, Py_ssize_t size) -{ - const unsigned char *p = (const unsigned char *)s; - const unsigned char *end = p + size; - Py_ssize_t length = 0; - - if (size < 0) - return -1; - - for (; p < end; ++p) { - if (*p > 127) { - /* Non-ASCII */ - return -1; - } - else if (*p != '\\') { - /* Normal character */ - ++length; - } - else { - /* Backslash-escape, check next char */ - ++p; - /* Escape sequence reaches till end of string or - non-ASCII follow-up. */ - if (p >= end || *p > 127) - return -1; - switch (*p) { - case '\n': - /* backslash + \n result in zero characters */ - break; - case '\\': case '\'': case '\"': - case 'b': case 'f': case 't': - case 'n': case 'r': case 'v': case 'a': - ++length; - break; - case '0': case '1': case '2': case '3': - case '4': case '5': case '6': case '7': - case 'x': case 'u': case 'U': case 'N': - /* these do not guarantee ASCII characters */ - return -1; - default: - /* count the backslash + the other character */ - length += 2; - } - } - } - return length; -} - static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; PyObject * @@ -5349,253 +5294,202 @@ const char *errors) { const char *starts = s; - Py_ssize_t startinpos; - Py_ssize_t endinpos; - int j; PyObject *v; const char *end; - char* message; - Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ PyObject *errorHandler = NULL; PyObject *exc = NULL; - Py_ssize_t len; - Py_ssize_t i; - - len = length_of_escaped_ascii_string(s, size); - - /* After length_of_escaped_ascii_string() there are two alternatives, - either the string is pure ASCII with named escapes like \n, etc. - and we determined it's exact size (common case) - or it contains \x, \u, ... escape sequences. then we create a - legacy wchar string and resize it at the end of this function. */ - if (len >= 0) { - v = PyUnicode_New(len, 127); - if (!v) - goto onError; - assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND); - } - else { - /* Escaped strings will always be longer than the resulting - Unicode string, so we start with size here and then reduce the - length after conversion to the true value. - (but if the error callback returns a long replacement string - we'll have to allocate more space) */ - v = PyUnicode_New(size, 127); - if (!v) - goto onError; - len = size; - } + Py_ssize_t outpos; + enum PyUnicode_Kind kind = PyUnicode_1BYTE_KIND; + Py_UCS4 maxchar = 127; + void *data; + + /* Escaped strings will always be longer than the resulting + Unicode string, so we start with size here and then reduce the + length after conversion to the true value. + (but if the error callback returns a long replacement string + we'll have to allocate more space) */ + v = PyUnicode_New(size, 127); + if (!v) + goto onError; if (size == 0) return v; - i = 0; + outpos = 0; end = s + size; + assert(PyUnicode_KIND(v) == kind); + assert(PyUnicode_MAX_CHAR_VALUE(v) == maxchar); + data = PyUnicode_DATA(v); + while (s < end) { - unsigned char c; - Py_UCS4 x; - int digits; - - /* The only case in which i == ascii_length is a backslash - followed by a newline. */ - assert(i <= len); + unsigned char c = (unsigned char) *s++; + Py_UCS4 ch; + int count; + Py_ssize_t startinpos; + Py_ssize_t endinpos; + const char* message; /* Non-escape characters are interpreted as Unicode ordinals */ - if (*s != '\\') { - if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0) - goto onError; + if (c != '\\') { + ch = c; + writechar: + assert(outpos < PyUnicode_GET_LENGTH(v)); + if (ch > maxchar) { + if (unicode_widen(&v, outpos, ch) < 0) + goto onError; + kind = PyUnicode_KIND(v); + maxchar = PyUnicode_MAX_CHAR_VALUE(v); + data = PyUnicode_DATA(v); + } + PyUnicode_WRITE(kind, data, outpos++, ch); continue; } - startinpos = s-starts; + startinpos = s - starts - 1; /* \ - Escapes */ - s++; - c = *s++; - if (s > end) - c = '\0'; /* Invalid after \ */ - - /* The only case in which i == ascii_length is a backslash - followed by a newline. */ - assert(i < len || (i == len && c == '\n')); + if (s >= end) { + message = "\\ at end of string"; + goto error; + } + c = (unsigned char) *s++; + + /* The only case in which outpos == ascii_length is a backslash + followed by a newline. */ + assert(outpos < PyUnicode_GET_LENGTH(v) || + (outpos == PyUnicode_GET_LENGTH(v) && c == '\n')); switch (c) { /* \x escapes */ -#define WRITECHAR(ch) \ - do { \ - if (unicode_putchar(&v, &i, ch) < 0) \ - goto onError; \ - }while(0) - - case '\n': break; - case '\\': WRITECHAR('\\'); break; - case '\'': WRITECHAR('\''); break; - case '\"': WRITECHAR('\"'); break; - case 'b': WRITECHAR('\b'); break; + + case '\n': continue; + case '\\': PyUnicode_WRITE(kind, data, outpos++, '\\'); continue; + case '\'': PyUnicode_WRITE(kind, data, outpos++, '\''); continue; + case '\"': PyUnicode_WRITE(kind, data, outpos++, '\"'); continue; + case 'b': PyUnicode_WRITE(kind, data, outpos++, '\b'); continue; /* FF */ - case 'f': WRITECHAR('\014'); break; - case 't': WRITECHAR('\t'); break; - case 'n': WRITECHAR('\n'); break; - case 'r': WRITECHAR('\r'); break; + case 'f': PyUnicode_WRITE(kind, data, outpos++, '\014'); continue; + case 't': PyUnicode_WRITE(kind, data, outpos++, '\t'); continue; + case 'n': PyUnicode_WRITE(kind, data, outpos++, '\n'); continue; + case 'r': PyUnicode_WRITE(kind, data, outpos++, '\r'); continue; /* VT */ - case 'v': WRITECHAR('\013'); break; + case 'v': PyUnicode_WRITE(kind, data, outpos++, '\013'); continue; /* BEL, not classic C */ - case 'a': WRITECHAR('\007'); break; + case 'a': PyUnicode_WRITE(kind, data, outpos++, '\007'); continue; /* \OOO (octal) escapes */ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': - x = s[-1] - '0'; + ch = c - '0'; if (s < end && '0' <= *s && *s <= '7') { - x = (x<<3) + *s++ - '0'; + ch = (ch<<3) + *s++ - '0'; if (s < end && '0' <= *s && *s <= '7') - x = (x<<3) + *s++ - '0'; - } - WRITECHAR(x); - break; + ch = (ch<<3) + *s++ - '0'; + } + goto writechar; /* hex escapes */ /* \xXX */ case 'x': - digits = 2; + count = 2; message = "truncated \\xXX escape"; goto hexescape; /* \uXXXX */ case 'u': - digits = 4; + count = 4; message = "truncated \\uXXXX escape"; goto hexescape; /* \UXXXXXXXX */ case 'U': - digits = 8; + count = 8; message = "truncated \\UXXXXXXXX escape"; - hexescape: - chr = 0; - if (s+digits>end) { - endinpos = size; - if (unicode_decode_call_errorhandler( - errors, &errorHandler, - "unicodeescape", "end of string in escape sequence", - &starts, &end, &startinpos, &endinpos, &exc, &s, - &v, &i)) - goto onError; - goto nextByte; - } - for (j = 0; j < digits; ++j) { - c = (unsigned char) s[j]; - if (!Py_ISXDIGIT(c)) { - endinpos = (s+j+1)-starts; - if (unicode_decode_call_errorhandler( - errors, &errorHandler, - "unicodeescape", message, - &starts, &end, &startinpos, &endinpos, &exc, &s, - &v, &i)) - goto onError; - len = PyUnicode_GET_LENGTH(v); - goto nextByte; - } - chr = (chr<<4) & ~0xF; + hexescape: + for (ch = 0; count--; ++s) { + if (s >= end) + goto error; + c = (unsigned char)*s; + if (!Py_ISXDIGIT(c)) + goto error; + ch <<= 4; if (c >= '0' && c <= '9') - chr += c - '0'; + ch += c - '0'; else if (c >= 'a' && c <= 'f') - chr += 10 + c - 'a'; + ch += c - ('a' - 10); else - chr += 10 + c - 'A'; - } - s += j; - if (chr == 0xffffffff && PyErr_Occurred()) - /* _decoding_error will have already written into the - target buffer. */ - break; - store: - /* when we get here, chr is a 32-bit unicode character */ - if (chr <= MAX_UNICODE) { - WRITECHAR(chr); - } else { - endinpos = s-starts; - if (unicode_decode_call_errorhandler( - errors, &errorHandler, - "unicodeescape", "illegal Unicode character", - &starts, &end, &startinpos, &endinpos, &exc, &s, - &v, &i)) - goto onError; - } - break; + ch += c - ('A' - 10); + } + store: + /* when we get here, ch is a 32-bit unicode character */ + if (ch <= MAX_UNICODE) + goto writechar; + message = "illegal Unicode character"; + goto error; /* \N{name} */ case 'N': - message = "malformed \\N character escape"; if (ucnhash_CAPI == NULL) { /* load the unicode data module */ ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( PyUnicodeData_CAPSULE_NAME, 1); - if (ucnhash_CAPI == NULL) - goto ucnhashError; - } + if (ucnhash_CAPI == NULL) { + PyErr_SetString( + PyExc_UnicodeError, + "\\N escapes not supported (can't load unicodedata module)" + ); + goto onError; + } + } + message = "malformed \\N character escape"; if (*s == '{') { - const char *start = s+1; + const char *start = ++s; + size_t namelen; /* look for the closing brace */ - while (*s != '}' && s < end) - s++; - if (s > start && s < end && *s == '}') { + do { + if (s >= end) + goto error; + } while (*s++ != '}'); + namelen = s - start - 1; + if (!namelen || namelen >= INT_MAX) { /* found a name. look it up in the unicode database */ + ch = 0xffffffff; /* in case 'getcode' messes up */ + if (ucnhash_CAPI->getcode(NULL, start, (int)namelen, + &ch, 0)) { + goto store; + } message = "unknown Unicode character name"; - s++; - if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), - &chr, 0)) - goto store; } } - endinpos = s-starts; - if (unicode_decode_call_errorhandler( - errors, &errorHandler, - "unicodeescape", message, - &starts, &end, &startinpos, &endinpos, &exc, &s, - &v, &i)) - goto onError; - break; + goto error; default: - if (s > end) { - message = "\\ at end of string"; - s--; - endinpos = s-starts; - if (unicode_decode_call_errorhandler( - errors, &errorHandler, - "unicodeescape", message, - &starts, &end, &startinpos, &endinpos, &exc, &s, - &v, &i)) - goto onError; - } - else { - WRITECHAR('\\'); - WRITECHAR(s[-1]); - } - break; - } - nextByte: - ; - } -#undef WRITECHAR - - if (unicode_resize(&v, i) < 0) + assert(outpos < PyUnicode_GET_LENGTH(v)); + PyUnicode_WRITE(kind, data, outpos++, '\\'); + ch = c; + goto writechar; + } + + error: + endinpos = s-starts; + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "unicodeescape", message, + &starts, &end, &startinpos, &endinpos, &exc, &s, + &v, &outpos)) + goto onError; + kind = PyUnicode_KIND(v); + maxchar = PyUnicode_MAX_CHAR_VALUE(v); + data = PyUnicode_DATA(v); + continue; + } + + if (unicode_resize(&v, outpos) < 0) goto onError; Py_XDECREF(errorHandler); Py_XDECREF(exc); return unicode_result(v); - ucnhashError: - PyErr_SetString( - PyExc_UnicodeError, - "\\N escapes not supported (can't load unicodedata module)" - ); - Py_XDECREF(v); - Py_XDECREF(errorHandler); - Py_XDECREF(exc); - return NULL; - onError: Py_XDECREF(v); Py_XDECREF(errorHandler); @@ -5616,9 +5510,9 @@ Py_ssize_t i, len; PyObject *repr; char *p; - int kind; + enum PyUnicode_Kind kind; void *data; - Py_ssize_t expandsize = 0; + Py_ssize_t expandsize; /* Initial allocation is based on the longest-possible character escape. @@ -5637,11 +5531,9 @@ len = PyUnicode_GET_LENGTH(unicode); kind = PyUnicode_KIND(unicode); data = PyUnicode_DATA(unicode); - switch (kind) { - case PyUnicode_1BYTE_KIND: expandsize = 4; break; - case PyUnicode_2BYTE_KIND: expandsize = 6; break; - case PyUnicode_4BYTE_KIND: expandsize = 10; break; - } + /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 + bytes, and 1 byte characters 4. */ + expandsize = kind * 2 + 2; if (len == 0) return PyBytes_FromStringAndSize(NULL, 0); @@ -5661,15 +5553,53 @@ for (i = 0; i < len; i++) { Py_UCS4 ch = PyUnicode_READ(kind, data, i); - /* Escape backslashes */ - if (ch == '\\') { + if (ch < 0x100) { + /* Copy printable US ASCII as-is */ + if (ch >= ' ' && ch < 0x7F) { + if (ch != '\\') { + *p++ = (char) ch; + continue; + } + /* Escape backslashes */ + else { + *p++ = '\\'; + *p++ = (char) ch; + } + } + + /* Map special whitespace to '\t', \n', '\r' */ + else if (ch == '\t') { + *p++ = '\\'; + *p++ = 't'; + } + else if (ch == '\n') { + *p++ = '\\'; + *p++ = 'n'; + } + else if (ch == '\r') { + *p++ = '\\'; + *p++ = 'r'; + } + + /* Map non-printable US ASCII and 8-bit characters to '\xhh' */ + else { + *p++ = '\\'; + *p++ = 'x'; + *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; + *p++ = Py_hexdigits[ch & 0x000F]; + } + } + /* Map 16-bit characters to '\uxxxx' */ + else if (ch < 0x10000) { *p++ = '\\'; - *p++ = (char) ch; - continue; - } - + *p++ = 'u'; + *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; + *p++ = Py_hexdigits[(ch >> 8) & 0x000F]; + *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; + *p++ = Py_hexdigits[ch & 0x000F]; + } /* Map 21-bit characters to '\U00xxxxxx' */ - else if (ch >= 0x10000) { + else { assert(ch <= MAX_UNICODE); *p++ = '\\'; *p++ = 'U'; @@ -5681,44 +5611,7 @@ *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F]; *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F]; *p++ = Py_hexdigits[ch & 0x0000000F]; - continue; - } - - /* Map 16-bit characters to '\uxxxx' */ - if (ch >= 256) { - *p++ = '\\'; - *p++ = 'u'; - *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; - *p++ = Py_hexdigits[(ch >> 8) & 0x000F]; - *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; - *p++ = Py_hexdigits[ch & 0x000F]; - } - - /* Map special whitespace to '\t', \n', '\r' */ - else if (ch == '\t') { - *p++ = '\\'; - *p++ = 't'; - } - else if (ch == '\n') { - *p++ = '\\'; - *p++ = 'n'; - } - else if (ch == '\r') { - *p++ = '\\'; - *p++ = 'r'; - } - - /* Map non-printable US ASCII to '\xhh' */ - else if (ch < ' ' || ch >= 0x7F) { - *p++ = '\\'; - *p++ = 'x'; - *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; - *p++ = Py_hexdigits[ch & 0x000F]; - } - - /* Copy everything else as-is */ - else - *p++ = (char) ch; + } } assert(p - PyBytes_AS_STRING(repr) > 0); @@ -5748,14 +5641,14 @@ const char *errors) { const char *starts = s; - Py_ssize_t startinpos; - Py_ssize_t endinpos; Py_ssize_t outpos; PyObject *v; const char *end; - const char *bs; PyObject *errorHandler = NULL; PyObject *exc = NULL; + enum PyUnicode_Kind kind = PyUnicode_1BYTE_KIND; + Py_UCS4 maxchar = 127; + void *data; /* Escaped strings will always be longer than the resulting Unicode string, so we start with size here and then reduce the @@ -5768,73 +5661,82 @@ return v; outpos = 0; end = s + size; + assert(PyUnicode_KIND(v) == kind); + assert(PyUnicode_MAX_CHAR_VALUE(v) == maxchar); + data = PyUnicode_DATA(v); + while (s < end) { - unsigned char c; - Py_UCS4 x; - int i; + unsigned char c = *s++; + Py_UCS4 ch; int count; + Py_ssize_t startinpos; + Py_ssize_t endinpos; + const char *message; /* Non-escape characters are interpreted as Unicode ordinals */ - if (*s != '\\') { - if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0) - goto onError; + if (c != '\\' || s >= end) { + ch = c; + + writechar: + assert(outpos < PyUnicode_GET_LENGTH(v)); + if (ch > maxchar) { + if (unicode_widen(&v, outpos, ch) < 0) + goto onError; + kind = PyUnicode_KIND(v); + maxchar = PyUnicode_MAX_CHAR_VALUE(v); + data = PyUnicode_DATA(v); + } + PyUnicode_WRITE(kind, data, outpos++, ch); continue; } - startinpos = s-starts; - - /* \u-escapes are only interpreted iff the number of leading - backslashes if odd */ - bs = s; - for (;s < end;) { - if (*s != '\\') - break; - if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0) - goto onError; - } - if (((s - bs) & 1) == 0 || - s >= end || - (*s != 'u' && *s != 'U')) { - continue; - } - outpos--; - count = *s=='u' ? 4 : 8; - s++; + + c = *s++; + if (c == 'u') { + count = 4; + message = "truncated \\uXXXX escape"; + } + else if (c == 'U') { + count = 8; + message = "truncated \\UXXXXXXXX escape"; + } + else { + assert(outpos < PyUnicode_GET_LENGTH(v)); + PyUnicode_WRITE(kind, data, outpos++, '\\'); + ch = c; + goto writechar; + } + startinpos = s - starts - 2; /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ - for (x = 0, i = 0; i < count; ++i, ++s) { + for (ch = 0; count--; ++s) { + if (s >= end) + goto error; c = (unsigned char)*s; - if (!Py_ISXDIGIT(c)) { - endinpos = s-starts; - if (unicode_decode_call_errorhandler( - errors, &errorHandler, - "rawunicodeescape", "truncated \\uXXXX", - &starts, &end, &startinpos, &endinpos, &exc, &s, - &v, &outpos)) - goto onError; - goto nextByte; - } - x = (x<<4) & ~0xF; + if (!Py_ISXDIGIT(c)) + goto error; + ch <<= 4; if (c >= '0' && c <= '9') - x += c - '0'; + ch += c - '0'; else if (c >= 'a' && c <= 'f') - x += 10 + c - 'a'; + ch += c - ('a' - 10); else - x += 10 + c - 'A'; - } - if (x <= MAX_UNICODE) { - if (unicode_putchar(&v, &outpos, x) < 0) - goto onError; - } else { - endinpos = s-starts; - if (unicode_decode_call_errorhandler( - errors, &errorHandler, - "rawunicodeescape", "\\Uxxxxxxxx out of range", - &starts, &end, &startinpos, &endinpos, &exc, &s, - &v, &outpos)) - goto onError; - } - nextByte: - ; + ch += c - ('A' - 10); + } + if (ch <= MAX_UNICODE) + goto writechar; + message = "\\Uxxxxxxxx out of range"; + + error: + endinpos = s-starts; + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "rawunicodeescape", message, + &starts, &end, &startinpos, &endinpos, &exc, &s, + &v, &outpos)) + goto onError; + kind = PyUnicode_KIND(v); + maxchar = PyUnicode_MAX_CHAR_VALUE(v); + data = PyUnicode_DATA(v); } if (unicode_resize(&v, outpos) < 0) goto onError; @@ -5870,6 +5772,10 @@ kind = PyUnicode_KIND(unicode); data = PyUnicode_DATA(unicode); len = PyUnicode_GET_LENGTH(unicode); + if (kind == PyUnicode_1BYTE_KIND) { + repr = PyBytes_FromStringAndSize(data, len); + return repr; + } /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 bytes, and 1 byte characters 4. */ expandsize = kind * 2 + 2;