Index: Parser/parsetok.c
===================================================================
--- Parser/parsetok.c (revision 86409)
+++ Parser/parsetok.c (working copy)
@@ -118,6 +118,53 @@
 #endif
 #endif
 
+/* Count the code points in the longest structurally valid UTF-8 prefix of
+ * the len bytes starting at s.
+ *
+ * Scanning stops at the first byte that cannot start a sequence (a stray
+ * continuation byte, or 0xC0, 0xC1, 0xF5..0xFF, which never occur in valid
+ * UTF-8), at a multi-byte sequence whose trailing bytes are not of the form
+ * 10xxxxxx, or at a sequence cut short by the end of the buffer.  The number
+ * of complete sequences seen before that point is returned.
+ *
+ * Second-byte range restrictions (overlong E0/F0 forms, surrogates, F4 forms
+ * above U+10FFFF) are not checked. */
+
+static Py_ssize_t
+count_utf8_chars(const unsigned char *s, Py_ssize_t len)
+{
+    Py_ssize_t i = 0, count = 0;
+
+    while (i < len) {
+        unsigned char b = s[i++];
+        Py_ssize_t expected;    /* continuation bytes still to check */
+
+        if (b < 0x80) {         /* 0xxxxxxx */
+            count++;
+            continue;
+        }
+        if (b < 0xC2)           /* continuation byte, or overlong lead */
+            return count;
+        if (b < 0xE0)           /* 110xxxxx */
+            expected = 1;
+        else if (b < 0xF0)      /* 1110xxxx */
+            expected = 2;
+        else if (b < 0xF5)      /* 11110xxx */
+            expected = 3;
+        else                    /* lead byte beyond U+10FFFF */
+            return count;
+        if (expected > len - i) /* sequence cut short by end of buffer */
+            return count;
+        for (; expected; expected--, i++) {
+            if ((s[i] & 0xC0) != 0x80)  /* not 10xxxxxx */
+                return count;
+        }
+        /* Count a valid code point. */
+        count++;
+    }
+    return count;
+}
+
 /* Parse input coming from the given tokenizer structure.
    Return error code. */
 
@@ -231,7 +278,8 @@
         if (tok->buf != NULL) {
             size_t len;
             assert(tok->cur - tok->buf < INT_MAX);
-            err_ret->offset = (int)(tok->cur - tok->buf);
+            err_ret->offset = (int)count_utf8_chars(
+                (const unsigned char *)tok->buf, tok->cur - tok->buf);
             len = tok->inp - tok->buf;
             err_ret->text = (char *) PyObject_MALLOC(len + 1);
             if (err_ret->text != NULL) {