diff --git a/Makefile.pre.in b/Makefile.pre.in
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -645,6 +645,7 @@ BYTESTR_DEPS = \
 
 UNICODE_DEPS = $(BYTESTR_DEPS) \
 		$(srcdir)/Objects/stringlib/asciilib.h \
+		$(srcdir)/Objects/stringlib/codecs.h \
 		$(srcdir)/Objects/stringlib/ucs1lib.h \
 		$(srcdir)/Objects/stringlib/ucs2lib.h \
 		$(srcdir)/Objects/stringlib/ucs4lib.h \
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h
new file mode 100644
--- /dev/null
+++ b/Objects/stringlib/codecs.h
@@ -0,0 +1,156 @@
+/* stringlib: codec implementations */
+
+#if STRINGLIB_IS_UNICODE
+
+/* Mask to check or force alignment of a pointer to C 'long' boundaries */
+#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
+
+/* Mask to quickly check whether a C 'long' contains a
+   non-ASCII, UTF8-encoded char. */
+#if (SIZEOF_LONG == 8)
+# define ASCII_CHAR_MASK 0x8080808080808080L
+#elif (SIZEOF_LONG == 4)
+# define ASCII_CHAR_MASK 0x80808080L
+#else
+# error C 'long' size should be either 4 or 8!
+#endif
+
+Py_LOCAL_INLINE(int)
+STRINGLIB(utf8_try_decode)(const char *start, const char *end,
+                           STRINGLIB_CHAR *dest,
+                           const char **src_pos, Py_ssize_t *dest_index)
+{
+    int ret;
+    Py_ssize_t n;
+    const char *s = start;
+    const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
+    STRINGLIB_CHAR *p = dest;
+
+    while (s < end) {
+        Py_UCS4 ch = (unsigned char)*s;
+
+        if (ch < 0x80) {
+            /* Fast path for runs of ASCII characters. Given that common UTF-8
+               input will consist of an overwhelming majority of ASCII
+               characters, we try to optimize for this case by checking
+               as many characters as a C 'long' can contain.
+               First, check if we can do an aligned read, as most CPUs have
+               a penalty for unaligned reads.
+            */
+            if (!((size_t) s & LONG_PTR_MASK)) {
+                /* Help register allocation */
+                register const char *_s = s;
+                register STRINGLIB_CHAR *_p = p;
+                while (_s < aligned_end) {
+                    /* Read a whole long at a time (either 4 or 8 bytes),
+                       and do a fast unrolled copy if it only contains ASCII
+                       characters. */
+                    unsigned long value = *(unsigned long *) _s;
+                    if (value & ASCII_CHAR_MASK)
+                        break;
+                    _p[0] = _s[0];
+                    _p[1] = _s[1];
+                    _p[2] = _s[2];
+                    _p[3] = _s[3];
+#if (SIZEOF_LONG == 8)
+                    _p[4] = _s[4];
+                    _p[5] = _s[5];
+                    _p[6] = _s[6];
+                    _p[7] = _s[7];
+#endif
+                    _s += SIZEOF_LONG;
+                    _p += SIZEOF_LONG;
+                }
+                s = _s;
+                p = _p;
+                if (s == end)
+                    break;
+                ch = (unsigned char)*s;
+            }
+        }
+
+        if (ch < 0x80) {
+            s++;
+            *p++ = ch;
+            continue;
+        }
+
+        n = utf8_code_length[ch];
+
+        if (s + n > end) {
+            /* unexpected end of data: the caller will decide whether
+               it's an error or not */
+            goto _error;
+        }
+
+        switch (n) {
+        case 0:
+            /* invalid start byte */
+            goto _error;
+        case 1:
+            /* internal error */
+            goto _error;
+        case 2:
+            if ((s[1] & 0xc0) != 0x80)
+                /* invalid continuation byte */
+                goto _error;
+            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
+            assert ((ch > 0x007F) && (ch <= 0x07FF));
+            s += 2;
+            *p++ = ch;
+            break;
+
+        case 3:
+            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
+               will result in surrogates in range d800-dfff. Surrogates are
+               not valid UTF-8 so they are rejected.
+               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
+               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
+            if ((s[1] & 0xc0) != 0x80 ||
+                (s[2] & 0xc0) != 0x80 ||
+                ((unsigned char)s[0] == 0xE0 &&
+                 (unsigned char)s[1] < 0xA0) ||
+                ((unsigned char)s[0] == 0xED &&
+                 (unsigned char)s[1] > 0x9F)) {
+                /* invalid continuation byte */
+                goto _error;
+            }
+            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
+            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
+            s += 3;
+            *p++ = ch;
+            break;
+
+        case 4:
+            if ((s[1] & 0xc0) != 0x80 ||
+                (s[2] & 0xc0) != 0x80 ||
+                (s[3] & 0xc0) != 0x80 ||
+                ((unsigned char)s[0] == 0xF0 &&
+                 (unsigned char)s[1] < 0x90) ||
+                ((unsigned char)s[0] == 0xF4 &&
+                 (unsigned char)s[1] > 0x8F)) {
+                /* invalid continuation byte */
+                goto _error;
+            }
+            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
+                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
+            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
+            s += 4;
+            *p++ = ch;
+            break;
+        }
+    }
+    ret = 0;
+    goto _ok;
+_error:
+    ret = -1;
+_ok:
+    *src_pos = s;
+    *dest_index = p - dest;
+    return ret;
+}
+
+#undef LONG_PTR_MASK
+#undef ASCII_CHAR_MASK
+
+#endif /* STRINGLIB_IS_UNICODE */
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -506,6 +506,7 @@ make_bloom_mask(int kind, void* ptr, Py_
 #include "stringlib/fastsearch.h"
 #include "stringlib/count.h"
 #include "stringlib/find.h"
+#include "stringlib/undef.h"
 
 /* --- Unicode Object ----------------------------------------------------- */
 
@@ -4177,6 +4178,18 @@ PyUnicode_DecodeUTF8(const char *s,
     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
 }
 
+#include "stringlib/ucs1lib.h"
+#include "stringlib/codecs.h"
+#include "stringlib/undef.h"
+
+#include "stringlib/ucs2lib.h"
+#include "stringlib/codecs.h"
+#include "stringlib/undef.h"
+
+#include "stringlib/ucs4lib.h"
+#include "stringlib/codecs.h"
+#include "stringlib/undef.h"
+
 /* Mask to check or force alignment of a pointer to C 'long' boundaries */
 #define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
 
@@ -4190,33 +4203,41 @@ PyUnicode_DecodeUTF8(const char *s,
 # error C 'long' size should be either 4 or 8!
 #endif
 
-/* Scans a UTF-8 string and returns the maximum character to be expected,
-   the size of the decoded unicode string and if any major errors were
-   encountered.
-
-   This function does check basic UTF-8 sanity, it does however NOT CHECK
-   if the string contains surrogates, and if all continuation bytes are
-   within the correct ranges, these checks are performed in
+/* Scans a UTF-8 string and returns the maximum character to be expected
+   and the size of the decoded unicode string.
+
+   This function doesn't check for errors; these checks are performed in
    PyUnicode_DecodeUTF8Stateful.
-
-   If it sets has_errors to 1, it means the value of unicode_size and max_char
-   will be bogus and you should not rely on useful information in them.
    */
 static Py_UCS4
-utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
-                                  Py_ssize_t *unicode_size, Py_ssize_t* consumed,
-                                  int *has_errors)
-{
-    Py_ssize_t n;
+utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
+                                  Py_ssize_t *unicode_size)
+{
     Py_ssize_t char_count = 0;
-    Py_UCS4 max_char = 127, new_max;
-    Py_UCS4 upper_bound;
     const unsigned char *p = (const unsigned char *)s;
     const unsigned char *end = p + string_size;
     const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
-    int err = 0;
-
-    for (; p < end && !err; ++p, ++char_count) {
+
+    assert(unicode_size != NULL);
+
+    /* By having a cascade of independent loops which fall back onto each
+       other, we minimize the amount of work done in the average loop
+       iteration, and we also maximize the CPU's ability to predict
+       branches correctly (because a given condition will always have the
+       same boolean outcome except perhaps in the last iteration of the
+       corresponding loop).
+       In the general case this brings us rather close to decoding
+       performance pre-PEP 393, despite the two-pass decoding.
+
+       Note that the pure ASCII loop is not duplicated once a non-ASCII
+       character has been encountered. It is actually a pessimization (by
+       a significant factor) to use this loop on text with many non-ASCII
+       characters, and it is important to avoid bad performance on valid
+       utf-8 data (invalid utf-8 being a different can of worms).
+    */
+
+    /* ASCII */
+    for (; p < end; ++p) {
         /* Only check value if it's not a ASCII char... */
         if (*p < 0x80) {
             /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
@@ -4236,76 +4257,60 @@ utf8_max_char_size_and_has_errors(const
                 break;
             }
         }
-        if (*p >= 0x80) {
-            n = utf8_code_length[*p];
-            new_max = max_char;
-            switch (n) {
-            /* invalid start byte */
-            case 0:
-                err = 1;
-                break;
-            case 2:
-                /* Code points between 0x00FF and 0x07FF inclusive.
-                   Approximate the upper bound of the code point,
-                   if this flips over 255 we can be sure it will be more
-                   than 255 and the string will need 2 bytes per code coint,
-                   if it stays under or equal to 255, we can be sure 1 byte
-                   is enough.
-                   ((*p & 0b00011111) << 6) | 0b00111111 */
-                upper_bound = ((*p & 0x1F) << 6) | 0x3F;
-                if (max_char < upper_bound)
-                    new_max = upper_bound;
-                /* Ensure we track at least that we left ASCII space. */
-                if (new_max < 128)
-                    new_max = 128;
-                break;
-            case 3:
-                /* Between 0x0FFF and 0xFFFF inclusive, so values are
-                   always > 255 and <= 65535 and will always need 2 bytes. */
-                if (max_char < 65535)
-                    new_max = 65535;
-                break;
-            case 4:
-                /* Code point will be above 0xFFFF for sure in this case.
-                   */
-                new_max = 65537;
-                break;
-            /* Internal error, this should be caught by the first if */
-            case 1:
-            default:
-                assert(0 && "Impossible case in utf8_max_char_and_size");
-                err = 1;
-            }
-            /* Instead of number of overall bytes for this code point,
-               n contains the number of following bytes: */
-            --n;
-            /* Check if the follow up chars are all valid continuation bytes */
-            if (n >= 1) {
-                const unsigned char *cont;
-                if ((p + n) >= end) {
-                    if (consumed == 0)
-                        /* incomplete data, non-incremental decoding */
-                        err = 1;
-                    break;
-                }
-                for (cont = p + 1; cont <= (p + n); ++cont) {
-                    if ((*cont & 0xc0) != 0x80) {
-                        err = 1;
-                        break;
-                    }
-                }
-                p += n;
-            }
-            else
-                err = 1;
-            max_char = new_max;
-        }
-    }
-
-    if (unicode_size)
-        *unicode_size = char_count;
-    if (has_errors)
-        *has_errors = err;
-    return max_char;
+        if (*p < 0x80)
+            ++char_count;
+        else
+            goto _ucs1loop;
+    }
+    *unicode_size = char_count;
+    return 127;
+
+    /* We don't repeat the pure ASCII loop here (see above). */
+_ucs1loop:
+    for (; p < end; ++p) {
+        if (*p < 0xc4)
+            char_count += ((*p & 0xc0) != 0x80);
+        else
+            goto _ucs2loop;
+    }
+    *unicode_size = char_count;
+    return 255;
+
+_ucs2loop:
+    for (; p < end; ++p) {
+        if (*p < 0xf0)
+            char_count += ((*p & 0xc0) != 0x80);
+        else
+            goto _ucs4loop;
+    }
+    *unicode_size = char_count;
+    return 65535;
+
+_ucs4loop:
+    for (; p < end; ++p) {
+        char_count += ((*p & 0xc0) != 0x80);
+    }
+    *unicode_size = char_count;
+    return 65537;
+}
+
+/* Called when we encountered some error that wasn't detected in the original
+   scan, e.g. an encoded surrogate character. The original maxchar computation
+   may have been incorrect, so redo it. */
+static int
+refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
+{
+    PyObject *tmp;
+    Py_ssize_t k, maxchar;
+    for (k = 0, maxchar = 0; k < n; k++)
+        maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
+    tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
+    if (tmp == NULL)
+        return -1;
+    PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
+    Py_DECREF(*unicode);
+    *unicode = tmp;
+    return 0;
+}
 
 /* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
@@ -4348,35 +4353,58 @@ PyUnicode_DecodeUTF8Stateful(const char
     Py_ssize_t i;
     int kind;
     void *data;
-    int has_errors;
+    int has_errors = 0;
 
     if (size == 0) {
         if (consumed)
             *consumed = 0;
        return (PyObject *)PyUnicode_New(0, 0);
     }
-    maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
-                                                consumed, &has_errors);
-    if (has_errors)
-        /* maxchar and size computation might be incorrect;
-           code below widens and resizes as necessary. */
-        unicode = PyUnicode_New(size, 127);
-    else
-        unicode = PyUnicode_New(unicode_size, maxchar);
+    maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
+/*     if (maxchar > 127)
+        printf("%zd utf-8 bytes -> max char %d, nb chars %zd (has errors %d)\n", size, maxchar, unicode_size, has_errors);*/
+    /* In case of errors, maxchar and size computation might be incorrect;
+       code below refits and resizes as necessary. */
+    unicode = PyUnicode_New(unicode_size, maxchar);
     if (!unicode)
         return NULL;
     /* When the string is ASCII only, just use memcpy and return.
       unicode_size may be != size if there is an incomplete UTF-8
      sequence at the end of the ASCII block.
      */
-    if (!has_errors && maxchar < 128 && size == unicode_size) {
+    if (maxchar < 128 && size == unicode_size) {
         Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
         return unicode;
     }
     kind = PyUnicode_KIND(unicode);
     data = PyUnicode_DATA(unicode);
+
+    /* Unpack UTF-8 encoded data */
     i = 0;
     e = s + size;
+    switch (kind) {
+    case PyUnicode_1BYTE_KIND:
+        has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
+        break;
+    case PyUnicode_2BYTE_KIND:
+        has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
+        break;
+    case PyUnicode_4BYTE_KIND:
+        has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
+        break;
+    }
+    if (!has_errors) {
+        /* Ensure the unicode size calculation was correct */
+        assert(i == unicode_size);
+        assert(s == e);
+        if (consumed)
+            *consumed = s-starts;
+        return unicode;
+    }
+    /* Fall through to the generic decoding loop for the rest of
+       the string */
+    if (refit_partial_string(&unicode, kind, data, i) < 0)
+        goto onError;
+
     aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
 
     while (s < e) {
@@ -4528,19 +4556,8 @@ PyUnicode_DecodeUTF8Stateful(const char
 
   utf8Error:
     if (!has_errors) {
-        PyObject *tmp;
-        Py_ssize_t k;
-        /* We encountered some error that wasn't detected in the original scan,
-           e.g. an encoded surrogate character. The original maxchar computation may
-           have been incorrect, so redo it now. */
-        for (k = 0, maxchar = 0; k < i; k++)
-            maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
-        tmp = PyUnicode_New(PyUnicode_GET_LENGTH(unicode), maxchar);
-        if (tmp == NULL)
+        if (refit_partial_string(&unicode, kind, data, i) < 0)
             goto onError;
-        PyUnicode_CopyCharacters(tmp, 0, unicode, 0, i);
-        Py_DECREF(unicode);
-        unicode = tmp;
         has_errors = 1;
     }
     if (unicode_decode_call_errorhandler(
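
Two self-contained sketches follow for readers who want to experiment with the
techniques used in this patch outside of CPython.

The first illustrates the word-at-a-time ASCII check from utf8_try_decode:
AND-ing a whole machine word against a mask with 0x80 in every byte detects a
non-ASCII byte with a single branch, so pure-ASCII runs cost one test per 4 or
8 input bytes. This is a minimal sketch under stated assumptions, not the
patch's code: the names make_ascii_mask and ascii_run_length are invented for
illustration, memcpy stands in for the patch's aligned pointer cast, sizeof(long)
replaces the SIZEOF_LONG configure macro, and the unrolled copy into the
destination string is omitted (only the run length is computed).

#include <stdio.h>
#include <string.h>

#define PTR_MASK ((size_t)(sizeof(long) - 1))

/* Build 0x8080...80 for whatever width 'long' has on this platform
   (the patch hard-codes this as ASCII_CHAR_MASK via SIZEOF_LONG). */
static unsigned long
make_ascii_mask(void)
{
    unsigned long mask = 0;
    size_t i;
    for (i = 0; i < sizeof(long); i++)
        mask = (mask << 8) | 0x80;
    return mask;
}

/* Return the length of the leading pure-ASCII run of buf[0..len). */
static size_t
ascii_run_length(const unsigned char *buf, size_t len)
{
    const unsigned long mask = make_ascii_mask();
    size_t i = 0;

    /* Byte-wise until the pointer is aligned on a 'long' boundary. */
    while (i < len && ((size_t)(buf + i) & PTR_MASK) != 0) {
        if (buf[i] >= 0x80)
            return i;
        i++;
    }
    /* Word-wise over the aligned middle: one test per sizeof(long) bytes. */
    while (i + sizeof(long) <= len) {
        unsigned long word;
        memcpy(&word, buf + i, sizeof(long));
        if (word & mask)
            break;      /* some byte in this word has its high bit set */
        i += sizeof(long);
    }
    /* Byte-wise tail, also locating the offending byte inside the word. */
    while (i < len && buf[i] < 0x80)
        i++;
    return i;
}

int main(void)
{
    const char *s = "hello, world \xc3\xa9tude";
    printf("leading ASCII bytes: %zu\n",
           ascii_run_length((const unsigned char *)s, strlen(s)));
    return 0;
}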
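
The second sketch mirrors the cascading scan of
utf8_max_char_size_and_char_count. Counting characters works because exactly
one byte per UTF-8 sequence is not a 10xxxxxx continuation byte, and the lead
byte bounds the decoded code point: lead bytes below 0xC4 can only encode code
points <= 0xFF, lead bytes below 0xF0 stay within the BMP, and anything higher
needs a 4-byte sequence. As in the patch, no error checking is done here; the
decoding pass is expected to re-validate the input. The names scan_utf8 and
maxclass are hypothetical, and 65537 is simply a sentinel above 0xFFFF, since
PyUnicode_New only distinguishes the ranges <=127, <=255, <=65535 and above.

#include <stdio.h>
#include <string.h>

/* Return 127, 255, 65535 or 65537 depending on the widest character class
   seen, and store the number of characters in *count. Every byte that is
   NOT a continuation byte (10xxxxxx) starts a new character. */
static unsigned long
scan_utf8(const unsigned char *p, size_t n, size_t *count)
{
    const unsigned char *end = p + n;
    size_t chars = 0;

    for (; p < end; ++p) {          /* pure ASCII prefix */
        if (*p < 0x80)
            ++chars;
        else
            goto ucs1;
    }
    *count = chars;
    return 127;
ucs1:
    for (; p < end; ++p) {          /* code points <= 0xFF */
        if (*p < 0xc4)
            chars += ((*p & 0xc0) != 0x80);
        else
            goto ucs2;
    }
    *count = chars;
    return 255;
ucs2:
    for (; p < end; ++p) {          /* code points <= 0xFFFF */
        if (*p < 0xf0)
            chars += ((*p & 0xc0) != 0x80);
        else
            goto ucs4;
    }
    *count = chars;
    return 65535;
ucs4:
    for (; p < end; ++p)            /* non-BMP code points present */
        chars += ((*p & 0xc0) != 0x80);
    *count = chars;
    return 65537;
}

int main(void)
{
    /* "abc" + U+00E9 + U+20AC: 5 characters, widest one in the BMP. */
    const char *s = "abc\xc3\xa9\xe2\x82\xac";
    size_t chars;
    unsigned long maxclass =
        scan_utf8((const unsigned char *)s, strlen(s), &chars);
    printf("max char class %lu, %zu characters\n", maxclass, chars);
    return 0;
}

Note how the cascade never re-executes an earlier loop: once the scan has
fallen through to a wider class it stays there, which is exactly the branch
prediction argument made in the patch's comment.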