diff -r 2c97612859b3 Lib/test/test_source_encoding.py --- a/Lib/test/test_source_encoding.py Mon Nov 16 07:36:44 2015 -0500 +++ b/Lib/test/test_source_encoding.py Tue Nov 17 03:15:46 2015 +0200 @@ -1,11 +1,12 @@ # -*- coding: koi8-r -*- import unittest -from test.support import TESTFN, unlink, unload, rmtree +from test.support import TESTFN, unlink, unload, rmtree, script_helper, captured_stdout import importlib import os import sys import subprocess +import tempfile class SourceEncodingTest(unittest.TestCase): @@ -142,5 +143,136 @@ class SourceEncodingTest(unittest.TestCa msg=c.exception.args[0]) +BUFSIZ = 2**13 + +class AbstractSourceEncodingTest: + + def test_double_coding(self): + src = (b'#coding:iso8859-15 coding:latin1\n' + b'print(ascii("\xc3\xa4"))\n') + out = self.run_script(src) + self.assertEqual(out.rstrip(), br"'\xc3\u20ac'") + + def test_first_coding_line(self): + src = (b'#coding:iso8859-15\n' + b'print(ascii("\xc3\xa4"))\n') + out = self.run_script(src) + self.assertEqual(out.rstrip(), br"'\xc3\u20ac'") + + def test_second_coding_line(self): + src = (b'#\n' + b'#coding:iso8859-15\n' + b'print(ascii("\xc3\xa4"))\n') + out = self.run_script(src) + self.assertEqual(out.rstrip(), br"'\xc3\u20ac'") + + def test_long_first_coding_line(self): + src = (b'#' + b' '*BUFSIZ + b'coding:iso8859-15\n' + b'print(ascii("\xc3\xa4"))\n') + out = self.run_script(src) + self.assertEqual(out.rstrip(), br"'\xc3\u20ac'") + + def test_long_second_coding_line(self): + src = (b'#\n' + b'#' + b' '*BUFSIZ + b'coding:iso8859-15\n' + b'print(ascii("\xc3\xa4"))\n') + out = self.run_script(src) + self.assertEqual(out.rstrip(), br"'\xc3\u20ac'") + + def test_long_coding_line(self): + src = (b'#coding:iso-8859-15' + b' '*BUFSIZ + b'\n' + b'print(ascii("\xc3\xa4"))\n') + out = self.run_script(src) + self.assertEqual(out.rstrip(), br"'\xc3\u20ac'") + + def test_long_coding_name(self): + src = (b'#coding:iso-8859-1-' + b'x'*BUFSIZ + b'\n' + b'print(ascii("\xc3\xa4"))\n') + out = self.run_script(src) + self.assertEqual(out.rstrip(), br"'\xc3\xa4'") + + def test_long_first_utf8_line(self): + src = b'#' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n' + out = self.run_script(src) + src = b'# ' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n' + out = self.run_script(src) + + def test_long_second_utf8_line(self): + src = b'\n#' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n' + out = self.run_script(src) + src = b'\n# ' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n' + out = self.run_script(src) + + def test_first_non_utf8_coding_line(self): + src = (b'#coding:iso-8859-15 \xa4\n' + b'print(ascii("\xc3\xa4"))\n') + out = self.run_script(src) + self.assertEqual(out.rstrip(), br"'\xc3\u20ac'") + + def test_second_non_utf8_coding_line(self): + src = (b'\n' + b'#coding:iso-8859-15 \xa4\n' + b'print(ascii("\xc3\xa4"))\n') + out = self.run_script(src) + self.assertEqual(out.rstrip(), br"'\xc3\u20ac'") + + def test_non_utf8_shebang(self): + src = (b'#!/home/\xa4/bin/python\n' + b'#coding:iso-8859-15\n') + out = self.run_script(src) + + #def test_crlf(self): + #src = (b'print(ascii("""\r\n"""))\n') + #out = self.run_script(src) + #self.assertEqual(out.rstrip(), br"'\n'") + + #def test_crcrlf(self): + #src = (b'print(ascii("""\r\r\n"""))\n') + #out = self.run_script(src) + #self.assertEqual(out.rstrip(), br"'\n\n'") + + #def test_crcrcrlf(self): + #src = (b'print(ascii("""\r\r\r\n"""))\n') + #out = self.run_script(src) + #self.assertEqual(out.rstrip(), br"'\n\n\n'") + + #def test_crcrcrlf2(self): + #src = (b'#coding:iso-8859-1\n' + #b'print(ascii("""\r\r\r\n"""))\n') + #out = self.run_script(src) + #self.assertEqual(out.rstrip(), br"'\n\n\n'") + + #def test_null(self): + #src = (b'#\x00\n' + #b'print("ok")\n') + #out = self.run_script(src) + #self.assertEqual(out.rstrip(), br"ok") + + #def _test_null2(self): + #src = (b'#\n' + #b'#\n' + #b'#\x00\n' + #b'print("ok")\n') + #out = self.run_script(src) + #self.assertEqual(out.rstrip(), br"ok") + +class StringSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase): + + def run_script(self, src): + with captured_stdout() as stdout: + exec(src) + return stdout.getvalue().encode() + +class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase): + + def run_script(self, src): + with tempfile.TemporaryDirectory() as tmpd: + fn = os.path.join(tmpd, "test.py") + with open(fn, "wb") as fp: + fp.write(src) + res = script_helper.assert_python_ok(fn) + return res.out + + if __name__ == "__main__": unittest.main() diff -r 2c97612859b3 Parser/tokenizer.c --- a/Parser/tokenizer.c Mon Nov 16 07:36:44 2015 -0500 +++ b/Parser/tokenizer.c Tue Nov 17 03:15:46 2015 +0200 @@ -138,7 +138,6 @@ tok_new(void) tok->altindstack[0] = 0; tok->decoding_state = STATE_INIT; tok->decoding_erred = 0; - tok->read_coding_spec = 0; tok->enc = NULL; tok->encoding = NULL; tok->cont_line = 0; @@ -168,19 +167,30 @@ new_string(const char *s, Py_ssize_t len return result; } -#ifdef PGEN - -static char * -decoding_fgets(char *s, int size, struct tok_state *tok) +static int +tok_reserve_buf(struct tok_state *tok, Py_ssize_t size) { - return fgets(s, size, tok->fp); + Py_ssize_t cur = tok->cur - tok->buf; + Py_ssize_t oldsize = tok->inp - tok->buf; + Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1); + if (newsize > tok->end - tok->buf) { + char *newbuf = tok->buf; + Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf; + newbuf = (char *)PyMem_REALLOC(newbuf, newsize); + if (newbuf == NULL) { + tok->done = E_NOMEM; + return 0; + } + tok->buf = newbuf; + tok->cur = tok->buf + cur; + tok->inp = tok->buf + oldsize; + tok->end = tok->buf + newsize; + tok->start = start < 0 ? NULL : tok->buf + start; + } + return 1; } -static int -decoding_feof(struct tok_state *tok) -{ - return feof(tok->fp); -} +#ifdef PGEN static char * decode_str(const char *str, int exec_input, struct tok_state *tok) @@ -248,14 +258,14 @@ get_coding_spec(const char *s, char **sp } for (; i < size - 6; i++) { /* XXX inefficient search */ const char* t = s + i; - if (strncmp(t, "coding", 6) == 0) { + if (memcmp(t, "coding", 6) == 0) { const char* begin = NULL; t += 6; if (t[0] != ':' && t[0] != '=') continue; do { t++; - } while (t[0] == '\x20' || t[0] == '\t'); + } while (t[0] == ' ' || t[0] == '\t'); begin = t; while (Py_ISALNUM(t[0]) || @@ -275,6 +285,7 @@ get_coding_spec(const char *s, char **sp return 0; } *spec = r; + break; } } } @@ -291,11 +302,10 @@ check_coding_spec(const char* line, Py_s int set_readline(struct tok_state *, const char *)) { char *cs; - int r = 1; if (tok->cont_line) { /* It's a continuation line, so it can't be a coding spec. */ - tok->read_coding_spec = 1; + tok->decoding_state = STATE_NORMAL; return 1; } if (!get_coding_spec(line, &cs, size, tok)) @@ -308,37 +318,34 @@ check_coding_spec(const char* line, Py_s if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') { /* Stop checking coding spec after a line containing * anything except a comment. */ - tok->read_coding_spec = 1; + tok->decoding_state = STATE_NORMAL; break; } } return 1; } - tok->read_coding_spec = 1; + tok->decoding_state = STATE_NORMAL; if (tok->encoding == NULL) { - assert(tok->decoding_state == STATE_RAW); - if (strcmp(cs, "utf-8") == 0) { - tok->encoding = cs; - } else { - r = set_readline(tok, cs); - if (r) { - tok->encoding = cs; - tok->decoding_state = STATE_NORMAL; - } - else { - PyErr_Format(PyExc_SyntaxError, - "encoding problem: %s", cs); - PyMem_FREE(cs); - } + assert(tok->decoding_readline == NULL); + if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) { + error_ret(tok); + PyErr_Format(PyExc_SyntaxError, + "encoding problem: %s", cs); + PyMem_FREE(cs); + return 0; } + tok->encoding = cs; } else { /* then, compare cs with BOM */ - r = (strcmp(tok->encoding, cs) == 0); - if (!r) + if (strcmp(tok->encoding, cs) != 0) { + error_ret(tok); PyErr_Format(PyExc_SyntaxError, "encoding problem: %s with BOM", cs); + PyMem_FREE(cs); + return 0; + } PyMem_FREE(cs); } - return r; + return 1; } /* See whether the file starts with a BOM. If it does, @@ -353,7 +360,7 @@ check_bom(int get_char(struct tok_state { int ch1, ch2, ch3; ch1 = get_char(tok); - tok->decoding_state = STATE_RAW; + tok->decoding_state = STATE_SEEK_CODING; if (ch1 == EOF) { return 1; } else if (ch1 == 0xEF) { @@ -422,65 +429,38 @@ check_bom(int get_char(struct tok_state reached): see tok_nextc and its calls to decoding_fgets. */ -static char * -fp_readl(char *s, int size, struct tok_state *tok) +static int +tok_readline_recode(struct tok_state *tok) { - PyObject* bufobj; + PyObject* line; const char *buf; Py_ssize_t buflen; - /* Ask for one less byte so we can terminate it */ - assert(size > 0); - size--; - - if (tok->decoding_buffer) { - bufobj = tok->decoding_buffer; - Py_INCREF(bufobj); - } - else - { - bufobj = PyObject_CallObject(tok->decoding_readline, NULL); - if (bufobj == NULL) - goto error; - } - if (PyUnicode_CheckExact(bufobj)) - { - buf = _PyUnicode_AsStringAndSize(bufobj, &buflen); - if (buf == NULL) { + line = tok->decoding_buffer; + if (line == NULL) { + line = PyObject_CallObject(tok->decoding_readline, NULL); + if (line == NULL) { goto error; } } - else - { - buf = PyByteArray_AsString(bufobj); - if (buf == NULL) { - goto error; - } - buflen = PyByteArray_GET_SIZE(bufobj); + else { + tok->decoding_buffer = NULL; } - - Py_XDECREF(tok->decoding_buffer); - if (buflen > size) { - /* Too many chars, the rest goes into tok->decoding_buffer */ - tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size, - buflen-size); - if (tok->decoding_buffer == NULL) - goto error; - buflen = size; - } - else - tok->decoding_buffer = NULL; - - memcpy(s, buf, buflen); - s[buflen] = '\0'; - if (buflen == 0) /* EOF */ - s = NULL; - Py_DECREF(bufobj); - return s; + buf = PyUnicode_AsUTF8AndSize(line, &buflen); + if (buf == NULL) + goto error; + if (!tok_reserve_buf(tok, buflen + 1)) + return 0; + memcpy(tok->inp, buf, buflen); + tok->inp += buflen; + *tok->inp = '\0'; + Py_DECREF(line); + return 1; error: - Py_XDECREF(bufobj); - return error_ret(tok); + Py_XDECREF(line); + error_ret(tok); + return 0; } /* Set the readline function for TOK to a StreamReader's @@ -524,8 +504,8 @@ fp_setreadl(struct tok_state *tok, const if (stream == NULL) goto cleanup; + readline = _PyObject_GetAttrId(stream, &PyId_readline); Py_XDECREF(tok->decoding_readline); - readline = _PyObject_GetAttrId(stream, &PyId_readline); tok->decoding_readline = readline; if (pos > 0) { if (PyObject_CallObject(readline, NULL) == NULL) { @@ -580,51 +560,18 @@ static int valid_utf8(const unsigned cha return length; } -/* Read a line of input from TOK. Determine encoding - if necessary. */ - -static char * -decoding_fgets(char *s, int size, struct tok_state *tok) +/* Make sure we don't have any non-UTF-8 sequences in it. */ +static int +ensure_utf8(char *line, struct tok_state *tok) { - char *line = NULL; int badchar = 0; - for (;;) { - if (tok->decoding_state == STATE_NORMAL) { - /* We already have a codec associated with - this input. */ - line = fp_readl(s, size, tok); + unsigned char *c; + int length; + for (c = (unsigned char *)line; *c; c += length) + if (!(length = valid_utf8(c))) { + badchar = *c; break; - } else if (tok->decoding_state == STATE_RAW) { - /* We want a 'raw' read. */ - line = Py_UniversalNewlineFgets(s, size, - tok->fp, NULL); - break; - } else { - /* We have not yet determined the encoding. - If an encoding is found, use the file-pointer - reader functions from now on. */ - if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) - return error_ret(tok); - assert(tok->decoding_state != STATE_INIT); } - } - if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) { - if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) { - return error_ret(tok); - } - } -#ifndef PGEN - /* The default encoding is UTF-8, so make sure we don't have any - non-UTF-8 sequences in it. */ - if (line && !tok->encoding) { - unsigned char *c; - int length; - for (c = (unsigned char *)line; *c; c += length) - if (!(length = valid_utf8(c))) { - badchar = *c; - break; - } - } if (badchar) { /* Need to add 1 to the line number, since this line has not been counted, yet. */ @@ -634,30 +581,9 @@ decoding_fgets(char *s, int size, struct "but no encoding declared; " "see http://python.org/dev/peps/pep-0263/ for details", badchar, tok->filename, tok->lineno + 1); - return error_ret(tok); + return 0; } -#endif - return line; -} - -static int -decoding_feof(struct tok_state *tok) -{ - if (tok->decoding_state != STATE_NORMAL) { - return feof(tok->fp); - } else { - PyObject* buf = tok->decoding_buffer; - if (buf == NULL) { - buf = PyObject_CallObject(tok->decoding_readline, NULL); - if (buf == NULL) { - error_ret(tok); - return 1; - } else { - tok->decoding_buffer = buf; - } - } - return PyObject_Length(buf) == 0; - } + return 1; } /* Fetch a byte from TOK, using the string buffer. */ @@ -781,11 +707,11 @@ decode_str(const char *input, int single assumes a single line as input */ if (newl[0]) { if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) - return error_ret(tok); - if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) { + return NULL; + if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) { if (!check_coding_spec(newl[0]+1, newl[1] - newl[0], tok, buf_setreadl)) - return error_ret(tok); + return NULL; } } if (tok->enc != NULL) { @@ -834,16 +760,14 @@ PyTokenizer_FromUTF8(const char *str, in PyTokenizer_Free(tok); return NULL; } - tok->decoding_state = STATE_RAW; - tok->read_coding_spec = 1; + tok->decoding_state = STATE_NORMAL; tok->enc = NULL; tok->str = str; - tok->encoding = (char *)PyMem_MALLOC(6); + tok->encoding = new_string("utf-8", 5, tok); if (!tok->encoding) { PyTokenizer_Free(tok); return NULL; } - strcpy(tok->encoding, "utf-8"); /* XXX: constify members. */ tok->buf = tok->cur = tok->end = tok->inp = (char*)str; @@ -871,12 +795,11 @@ PyTokenizer_FromFile(FILE *fp, const cha if (enc != NULL) { /* Must copy encoding declaration since it gets copied into the parse tree. */ - tok->encoding = PyMem_MALLOC(strlen(enc)+1); + tok->encoding = new_string(enc, strlen(enc), tok); if (!tok->encoding) { PyTokenizer_Free(tok); return NULL; } - strcpy(tok->encoding, enc); tok->decoding_state = STATE_NORMAL; } return tok; @@ -905,197 +828,238 @@ PyTokenizer_Free(struct tok_state *tok) /* Get next char, updating state; error code goes into tok->done */ static int +tok_underflow_string(struct tok_state *tok) +{ + char *end = strchr(tok->inp, '\n'); + if (end != NULL) + end++; + else { + end = strchr(tok->inp, '\0'); + if (end == tok->inp) { + tok->done = E_EOF; + return 0; + } + } + if (tok->start == NULL) + tok->buf = tok->cur; + tok->lineno++; + tok->inp = end; + return 1; +} + +static int +tok_underflow_interactive(struct tok_state *tok) +{ + char *newtok = PyOS_Readline(stdin, stdout, tok->prompt); +#ifndef PGEN + if (newtok != NULL) { + char *translated = translate_newlines(newtok, 0, tok); + PyMem_FREE(newtok); + if (translated == NULL) + return 0; + newtok = translated; + } + if (tok->encoding && newtok && *newtok) { + /* Recode to UTF-8 */ + Py_ssize_t buflen; + const char* buf; + PyObject *u = translate_into_utf8(newtok, tok->encoding); + PyMem_FREE(newtok); + if (!u) { + tok->done = E_DECODE; + return 0; + } + buflen = PyBytes_GET_SIZE(u); + buf = PyBytes_AS_STRING(u); + newtok = PyMem_MALLOC(buflen+1); + strcpy(newtok, buf); + Py_DECREF(u); + } +#endif + if (tok->nextprompt != NULL) + tok->prompt = tok->nextprompt; + if (newtok == NULL) + tok->done = E_INTR; + else if (*newtok == '\0') { + PyMem_FREE(newtok); + tok->done = E_EOF; + } + else if (tok->start != NULL) { + size_t size = strlen(newtok); + tok->lineno++; + if (!tok_reserve_buf(tok, size + 1)) { + PyMem_FREE(tok->buf); + tok->buf = NULL; + PyMem_FREE(newtok); + return 0; + } + memcpy(tok->cur, newtok, size + 1); + PyMem_FREE(newtok); + tok->inp += size; + } + else { + tok->lineno++; + if (tok->buf != NULL) + PyMem_FREE(tok->buf); + tok->buf = newtok; + tok->cur = tok->buf; + tok->inp = strchr(tok->buf, '\0'); + tok->end = tok->inp + 1; + } + if (tok->done != E_OK) { + if (tok->prompt != NULL) + PySys_WriteStderr("\n"); + return 0; + } + return 1; +} + +static int +tok_readline_raw(struct tok_state *tok) +{ + do { + if (!tok_reserve_buf(tok, BUFSIZ)) + return 0; +#ifdef PGEN + if (fgets(tok->inp, (int)(tok->end - tok->inp), tok->fp) == NULL) +#else + if (Py_UniversalNewlineFgets(tok->inp, + (int)(tok->end - tok->inp), + tok->fp, NULL) == NULL) +#endif + return 1; + tok->inp = strchr(tok->inp, '\0'); + } while (tok->inp[-1] != '\n'); +#ifdef PGEN + /* replace "\r\n" with "\n" */ + /* For Mac leave the \r, giving a syntax error */ + if (tok->inp - tok->cur >= 2 && tok->inp[-2] == '\r') { + tok->inp--; + tok->inp[-1] = '\n'; + tok->inp[0] = '\0'; + } +#endif + return 1; +} + +/* Read a line of input from TOK. Determine encoding + if necessary. */ + +static int +tok_underflow_file(struct tok_state *tok) +{ + if (tok->start == NULL) + tok->cur = tok->inp = tok->buf; +#ifndef PGEN + if (tok->decoding_state == STATE_INIT) { + /* We have not yet determined the encoding. + If an encoding is found, use the file-pointer + reader functions from now on. */ + if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) { + error_ret(tok); + return 0; + } + assert(tok->decoding_state != STATE_INIT); + } +#endif + /* Read until '\n' or EOF */ +#ifndef PGEN + if (tok->decoding_readline != NULL) { + /* We already have a codec associated with this input. */ + if (!tok_readline_recode(tok)) + return 0; + } + else +#endif + { + /* We want a 'raw' read. */ + if (!tok_readline_raw(tok)) + return 0; + } + if (tok->inp == tok->cur) { + tok->done = E_EOF; + return 0; + } + if (tok->inp[-1] != '\n') { + /* Last line does not end in \n, fake one */ + *tok->inp++ = '\n'; + *tok->inp = '\0'; + } + + tok->lineno++; +#ifndef PGEN + if (tok->decoding_state != STATE_NORMAL) { + if (tok->lineno > 2) + tok->decoding_state = STATE_NORMAL; + else if (!check_coding_spec(tok->cur, tok->end - tok->cur, tok, fp_setreadl)) + return 0; + } + /* The default encoding is UTF-8, so make sure we don't have any + non-UTF-8 sequences in it. */ + if (!tok->encoding && (tok->decoding_state != STATE_NORMAL || tok->lineno >= 2)) + if (!ensure_utf8(tok->cur, tok)) + return 0; +#endif + assert(tok->done == E_OK); + return tok->done == E_OK; +} + +static void +print_escape(FILE *f, const char *s, Py_ssize_t size) +{ + if (s == NULL) { + fputs("NULL", f); + return; + } + putc('"', f); + while (size-- > 0) { + unsigned char c = *s++; + switch (c) { + case '\n': fputs("\\n", f); break; + case '\r': fputs("\\r", f); break; + case '\t': fputs("\\t", f); break; + case '\f': fputs("\\f", f); break; + case '\'': fputs("\\'", f); break; + case '"': fputs("\\\"", f); break; + default: + if (0x20 <= c && c <= 0x7f) + putc(c, f); + else + fprintf(f, "\\x%02x", c); + } + } + putc('"', f); +} + +static int tok_nextc(struct tok_state *tok) { + int rc; for (;;) { - if (tok->cur != tok->inp) { + if (tok->cur != tok->inp) return Py_CHARMASK(*tok->cur++); /* Fast path */ - } if (tok->done != E_OK) return EOF; - if (tok->fp == NULL) { - char *end = strchr(tok->inp, '\n'); - if (end != NULL) - end++; - else { - end = strchr(tok->inp, '\0'); - if (end == tok->inp) { - tok->done = E_EOF; - return EOF; - } - } - if (tok->start == NULL) - tok->buf = tok->cur; - tok->line_start = tok->cur; - tok->lineno++; - tok->inp = end; - return Py_CHARMASK(*tok->cur++); + if (tok->fp == NULL) + rc = tok_underflow_string(tok); + else if (tok->prompt != NULL) + rc = tok_underflow_interactive(tok); + else + rc = tok_underflow_file(tok); + if (Py_DebugFlag) { + printf("line[%d] = ", tok->lineno); + print_escape(stdout, tok->cur, tok->inp - tok->cur); + printf(" tok->done = %d\n", tok->done); } - if (tok->prompt != NULL) { - char *newtok = PyOS_Readline(stdin, stdout, tok->prompt); -#ifndef PGEN - if (newtok != NULL) { - char *translated = translate_newlines(newtok, 0, tok); - PyMem_FREE(newtok); - if (translated == NULL) - return EOF; - newtok = translated; - } - if (tok->encoding && newtok && *newtok) { - /* Recode to UTF-8 */ - Py_ssize_t buflen; - const char* buf; - PyObject *u = translate_into_utf8(newtok, tok->encoding); - PyMem_FREE(newtok); - if (!u) { - tok->done = E_DECODE; - return EOF; - } - buflen = PyBytes_GET_SIZE(u); - buf = PyBytes_AS_STRING(u); - newtok = PyMem_MALLOC(buflen+1); - strcpy(newtok, buf); - Py_DECREF(u); - } -#endif - if (tok->nextprompt != NULL) - tok->prompt = tok->nextprompt; - if (newtok == NULL) - tok->done = E_INTR; - else if (*newtok == '\0') { - PyMem_FREE(newtok); - tok->done = E_EOF; - } - else if (tok->start != NULL) { - size_t start = tok->start - tok->buf; - size_t oldlen = tok->cur - tok->buf; - size_t newlen = oldlen + strlen(newtok); - char *buf = tok->buf; - buf = (char *)PyMem_REALLOC(buf, newlen+1); - tok->lineno++; - if (buf == NULL) { - PyMem_FREE(tok->buf); - tok->buf = NULL; - PyMem_FREE(newtok); - tok->done = E_NOMEM; - return EOF; - } - tok->buf = buf; - tok->cur = tok->buf + oldlen; - tok->line_start = tok->cur; - strcpy(tok->buf + oldlen, newtok); - PyMem_FREE(newtok); - tok->inp = tok->buf + newlen; - tok->end = tok->inp + 1; - tok->start = tok->buf + start; - } - else { - tok->lineno++; - if (tok->buf != NULL) - PyMem_FREE(tok->buf); - tok->buf = newtok; - tok->cur = tok->buf; - tok->line_start = tok->buf; - tok->inp = strchr(tok->buf, '\0'); - tok->end = tok->inp + 1; - } - } - else { - int done = 0; - Py_ssize_t cur = 0; - char *pt; - if (tok->start == NULL) { - if (tok->buf == NULL) { - tok->buf = (char *) - PyMem_MALLOC(BUFSIZ); - if (tok->buf == NULL) { - tok->done = E_NOMEM; - return EOF; - } - tok->end = tok->buf + BUFSIZ; - } - if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf), - tok) == NULL) { - if (!tok->decoding_erred) - tok->done = E_EOF; - done = 1; - } - else { - tok->done = E_OK; - tok->inp = strchr(tok->buf, '\0'); - done = tok->inp[-1] == '\n'; - } - } - else { - cur = tok->cur - tok->buf; - if (decoding_feof(tok)) { - tok->done = E_EOF; - done = 1; - } - else - tok->done = E_OK; - } - tok->lineno++; - /* Read until '\n' or EOF */ - while (!done) { - Py_ssize_t curstart = tok->start == NULL ? -1 : - tok->start - tok->buf; - Py_ssize_t curvalid = tok->inp - tok->buf; - Py_ssize_t newsize = curvalid + BUFSIZ; - char *newbuf = tok->buf; - newbuf = (char *)PyMem_REALLOC(newbuf, - newsize); - if (newbuf == NULL) { - tok->done = E_NOMEM; - tok->cur = tok->inp; - return EOF; - } - tok->buf = newbuf; - tok->cur = tok->buf + cur; - tok->line_start = tok->cur; - tok->inp = tok->buf + curvalid; - tok->end = tok->buf + newsize; - tok->start = curstart < 0 ? NULL : - tok->buf + curstart; - if (decoding_fgets(tok->inp, - (int)(tok->end - tok->inp), - tok) == NULL) { - /* Break out early on decoding - errors, as tok->buf will be NULL - */ - if (tok->decoding_erred) - return EOF; - /* Last line does not end in \n, - fake one */ - strcpy(tok->inp, "\n"); - } - tok->inp = strchr(tok->inp, '\0'); - done = tok->inp[-1] == '\n'; - } - if (tok->buf != NULL) { - tok->cur = tok->buf + cur; - tok->line_start = tok->cur; - /* replace "\r\n" with "\n" */ - /* For Mac leave the \r, giving a syntax error */ - pt = tok->inp - 2; - if (pt >= tok->buf && *pt == '\r') { - *pt++ = '\n'; - *pt = '\0'; - tok->inp = pt; - } - } - } - if (tok->done != E_OK) { - if (tok->prompt != NULL) - PySys_WriteStderr("\n"); + if (!rc) { tok->cur = tok->inp; return EOF; } + tok->line_start = tok->cur; } /*NOTREACHED*/ } - /* Back-up one character */ static void @@ -1104,8 +1068,8 @@ tok_backup(struct tok_state *tok, int c) if (c != EOF) { if (--tok->cur < tok->buf) Py_FatalError("tok_backup: beginning of buffer"); - if (*tok->cur != c) - *tok->cur = c; + if ((int)(unsigned char)*tok->cur != c) + Py_FatalError("tok_backup: wrong character"); } } @@ -1868,7 +1832,7 @@ PyTokenizer_FindEncodingFilename(int fd, if (tok->encoding) { encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1); if (encoding) - strcpy(encoding, tok->encoding); + strcpy(encoding, tok->encoding); } PyTokenizer_Free(tok); return encoding; diff -r 2c97612859b3 Parser/tokenizer.h --- a/Parser/tokenizer.h Mon Nov 16 07:36:44 2015 -0500 +++ b/Parser/tokenizer.h Tue Nov 17 03:15:46 2015 +0200 @@ -14,8 +14,8 @@ extern "C" { enum decoding_state { STATE_INIT, - STATE_RAW, - STATE_NORMAL /* have a codec associated with input */ + STATE_SEEK_CODING, + STATE_NORMAL }; /* Tokenizer state */ @@ -54,7 +54,6 @@ struct tok_state { /* Stuff for PEP 0263 */ enum decoding_state decoding_state; int decoding_erred; /* whether erred in decoding */ - int read_coding_spec; /* whether 'coding:...' has been read */ char *encoding; /* Source encoding. */ int cont_line; /* whether we are in a continuation line. */ const char* line_start; /* pointer to start of current line */