diff -r 7b41f5e83732 Parser/tokenizer.c --- a/Parser/tokenizer.c Thu Nov 07 13:33:36 2013 +0100 +++ b/Parser/tokenizer.c Thu Nov 07 13:37:00 2013 +0100 @@ -143,6 +143,7 @@ tok_new(void) tok->decoding_readline = NULL; tok->decoding_buffer = NULL; #endif + tok->input_is_utf8 = 0; return tok; } @@ -195,32 +196,39 @@ error_ret(struct tok_state *tok) /* XXX static char * get_normal_name(char *s) /* for utf-8 and latin-1 */ { - char buf[13]; - int i; - for (i = 0; i < 12; i++) { - int c = s[i]; - if (c == '\0') - break; - else if (c == '_') - buf[i] = '-'; - else - buf[i] = tolower(c); - } - buf[i] = '\0'; - if (strcmp(buf, "utf-8") == 0 || - strncmp(buf, "utf-8-", 6) == 0) + char lower[13]; + extern int _Py_normalize_encoding(const char *, char *, size_t); + + if (!_Py_normalize_encoding(s, lower, sizeof(lower))) + return s; + + if (strcmp(lower, "utf-8") == 0 || strcmp(lower, "utf8") == 0) return "utf-8"; - else if (strcmp(buf, "latin-1") == 0 || - strcmp(buf, "iso-8859-1") == 0 || - strcmp(buf, "iso-latin-1") == 0 || - strncmp(buf, "latin-1-", 8) == 0 || - strncmp(buf, "iso-8859-1-", 11) == 0 || - strncmp(buf, "iso-latin-1-", 12) == 0) + else if (strcmp(lower, "ascii") == 0) + return "ascii"; + else if (strcmp(lower, "latin-1") == 0 || + strcmp(lower, "iso-8859-1") == 0 || + strcmp(lower, "iso-latin-1") == 0 || + strncmp(lower, "latin-1-", 8) == 0 || + strncmp(lower, "iso-8859-1-", 11) == 0 || + strncmp(lower, "iso-latin-1-", 12) == 0) return "iso-8859-1"; else return s; } +static int +encoding_is_utf8(char *name) +{ + char *norm; + norm = get_normal_name(name); + if (strcmp(norm, "utf-8") == 0) + return 1; + if (strcmp(norm, "ascii") == 0) + return 1; + return 0; +} + /* Return the coding spec in S, or NULL if none is found. 
*/ static int @@ -293,9 +301,12 @@ check_coding_spec(const char* line, Py_s tok->read_coding_spec = 1; if (tok->encoding == NULL) { assert(tok->decoding_state == STATE_RAW); - if (strcmp(cs, "utf-8") == 0) { + + if (encoding_is_utf8(cs) != 0) { tok->encoding = cs; - } else { + tok->input_is_utf8 = 1; + } + else { r = set_readline(tok, cs); if (r) { tok->encoding = cs; @@ -379,6 +390,7 @@ check_bom(int get_char(struct tok_state tok->encoding = new_string("utf-8", 5, tok); if (!tok->encoding) return 0; + tok->input_is_utf8 = 1; /* No need to set_readline: input is already utf-8 */ return 1; } @@ -814,6 +826,7 @@ PyTokenizer_FromUTF8(const char *str, in return NULL; } strcpy(tok->encoding, "utf-8"); + tok->input_is_utf8 = 1; /* XXX: constify members. */ tok->buf = tok->cur = tok->end = tok->inp = (char*)str; @@ -826,7 +839,10 @@ struct tok_state * PyTokenizer_FromFile(FILE *fp, const char* enc, const char *ps1, const char *ps2) { - struct tok_state *tok = tok_new(); + struct tok_state *tok; + extern int _Py_normalize_encoding(const char *, char *, size_t); + + tok = tok_new(); if (tok == NULL) return NULL; if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) { @@ -848,6 +864,8 @@ PyTokenizer_FromFile(FILE *fp, const cha } strcpy(tok->encoding, enc); tok->decoding_state = STATE_NORMAL; + + tok->input_is_utf8 = encoding_is_utf8(tok->encoding); } return tok; } @@ -911,11 +929,12 @@ tok_nextc(struct tok_state *tok) return EOF; newtok = translated; } - if (tok->encoding && newtok && *newtok) { + if (!tok->input_is_utf8 && tok->encoding && newtok && *newtok) { /* Recode to UTF-8 */ Py_ssize_t buflen; const char* buf; - PyObject *u; + PyObject *u; + u = translate_into_utf8(newtok, tok->encoding); PyMem_FREE(newtok); if (!u) { tok->done = E_DECODE; diff -r 7b41f5e83732 Parser/tokenizer.h --- a/Parser/tokenizer.h Thu Nov 07 13:33:36 2013 +0100 +++ b/Parser/tokenizer.h Thu Nov 07 13:37:00 2013 +0100 @@ -65,6 +65,7 @@ struct
tok_state { const char* enc; /* Encoding for the current str. */ const char* str; const char* input; /* Tokenizer's newline translated copy of the string. */ + int input_is_utf8; /* input byte string is encoded to UTF-8? */ }; extern struct tok_state *PyTokenizer_FromString(const char *, int);