diff -r ae69ebd41807 Lib/idlelib/IOBinding.py --- a/Lib/idlelib/IOBinding.py Fri Jan 03 15:52:22 2014 -0500 +++ b/Lib/idlelib/IOBinding.py Sat Jan 04 10:35:59 2014 +0200 @@ -64,6 +64,7 @@ ### 'encoding' is used below in encode(), check! coding_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII) +blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) def coding_spec(data): """Return the encoding declaration according to PEP 263. @@ -93,6 +94,8 @@ match = coding_re.match(line) if match is not None: break + if not blank_re.match(line): + return None else: return None name = match.group(1) diff -r ae69ebd41807 Lib/lib2to3/pgen2/tokenize.py --- a/Lib/lib2to3/pgen2/tokenize.py Fri Jan 03 15:52:22 2014 -0500 +++ b/Lib/lib2to3/pgen2/tokenize.py Sat Jan 04 10:35:59 2014 +0200 @@ -237,6 +237,7 @@ toks_append(tokval) cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII) +blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) def _get_normal_name(orig_enc): """Imitates get_normal_name in tokenizer.c.""" @@ -309,6 +310,8 @@ encoding = find_cookie(first) if encoding: return encoding, [first] + if not blank_re.match(first): + return default, [first] second = read_or_stop() if not second: diff -r ae69ebd41807 Lib/test/test_source_encoding.py --- a/Lib/test/test_source_encoding.py Fri Jan 03 15:52:22 2014 -0500 +++ b/Lib/test/test_source_encoding.py Sat Jan 04 10:35:59 2014 +0200 @@ -1,7 +1,8 @@ # -*- coding: koi8-r -*- import unittest -from test.support import TESTFN, unlink, unload +from test.support import TESTFN, unlink, unload, captured_stdout +from test.script_helper import assert_python_ok import importlib import os import sys @@ -130,6 +131,50 @@ self.assertTrue(c.exception.args[0].startswith(expected), msg=c.exception.args[0]) + def source_encoding_test(self, source, expected): + with captured_stdout() as out: + exec(source) + self.assertEqual(out.getvalue(), expected) + + filename = TESTFN + '.py' + self.addCleanup(unlink, filename) + with open(filename, 'wb') as f: + f.write(source) + rc, out, err = assert_python_ok(filename) + self.assertEqual(out, expected.encode('ascii')) + + def test_noncommented_first_line(self): + self.source_encoding_test( + ("print(ascii('\u00a31'), end=';')\n" + "# -*- coding: iso8859-15 -*-\n" + "print(ascii('\u20ac2'), end='')\n").encode('utf-8'), + r"'\xa31';'\u20ac2'") + + def test_commented_first_line(self): + self.source_encoding_test( + ("#print(ascii('\u00a31'), end=';')\n" + "# -*- coding: iso8859-15 -*-\n" + "print(ascii('\u20ac2'), end='')\n").encode('utf-8'), + r"'\xe2\x82\xac2'") + + def test_empty_first_line(self): + self.source_encoding_test( + ("\n" + "# -*- coding: iso8859-15 -*-\n" + "print(ascii('\u20ac2'), end='')\n").encode('utf-8'), + r"'\xe2\x82\xac2'") + + def test_skip_first_line(self): + source = ("print(ascii('\u00a31'), end=';')\n" + "# -*- coding: iso8859-15 -*-\n" + "print(ascii('\u20ac2'), end='')\n").encode('utf-8') + filename = TESTFN + '.py' + self.addCleanup(unlink, filename) + with open(filename, 'wb') as f: + f.write(source) + rc, out, err = assert_python_ok('-x', filename) + self.assertEqual(out, br"'\xe2\x82\xac2'") + if __name__ == "__main__": unittest.main() diff -r ae69ebd41807 Lib/test/test_tokenize.py --- a/Lib/test/test_tokenize.py Fri Jan 03 15:52:22 2014 -0500 +++ b/Lib/test/test_tokenize.py Sat Jan 04 10:35:59 2014 +0200 @@ -885,6 +885,39 @@ readline = self.get_readline(lines) self.assertRaises(SyntaxError, detect_encoding, readline) + def test_cookie_second_line_noncommented_first_line(self): + lines = ( + b"print('\xc2\xa3')\n", + b'# vim: set fileencoding=iso8859-15 :\n', + b"print('\xe2\x82\xac')\n" + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'utf-8') + expected = [b"print('\xc2\xa3')\n"] + self.assertEqual(consumed_lines, expected) + + def test_cookie_second_line_commented_first_line(self): + lines = ( + b"#print('\xc2\xa3')\n", + b'# vim: set fileencoding=iso8859-15 :\n', + b"print('\xe2\x82\xac')\n" + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'iso8859-15') + expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n'] + self.assertEqual(consumed_lines, expected) + + def test_cookie_second_line_empty_first_line(self): + lines = ( + b'\n', + b'# vim: set fileencoding=iso8859-15 :\n', + b"print('\xe2\x82\xac')\n" + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'iso8859-15') + expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n'] + self.assertEqual(consumed_lines, expected) + def test_latin1_normalization(self): # See get_normal_name() in tokenizer.c. encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix", diff -r ae69ebd41807 Lib/tokenize.py --- a/Lib/tokenize.py Fri Jan 03 15:52:22 2014 -0500 +++ b/Lib/tokenize.py Sat Jan 04 10:35:59 2014 +0200 @@ -32,6 +32,7 @@ import collections from io import TextIOWrapper cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII) +blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) import token __all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding", @@ -409,6 +410,8 @@ encoding = find_cookie(first) if encoding: return encoding, [first] + if not blank_re.match(first): + return default, [first] second = read_or_stop() if not second: diff -r ae69ebd41807 Parser/tokenizer.c --- a/Parser/tokenizer.c Fri Jan 03 15:52:22 2014 -0500 +++ b/Parser/tokenizer.c Sat Jan 04 10:35:59 2014 +0200 @@ -283,13 +283,27 @@ char *cs; int r = 1; - if (tok->cont_line) + if (tok->cont_line) { /* It's a continuation line, so it can't be a coding spec. */ + tok->read_coding_spec = 1; return 1; + } if (!get_coding_spec(line, &cs, size, tok)) return 0; - if (!cs) + if (!cs) { + Py_ssize_t i; + for (i = 0; i < size; i++) { + if (line[i] == '#' || line[i] == '\n' || line[i] == '\r') + break; + if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') { + /* Stop checking coding spec after a line containing + * anything except a comment. */ + tok->read_coding_spec = 1; + break; + } + } return 1; + } tok->read_coding_spec = 1; if (tok->encoding == NULL) { assert(tok->decoding_state == STATE_RAW); @@ -476,13 +490,17 @@ _Py_IDENTIFIER(open); _Py_IDENTIFIER(readline); int fd; + long pos; io = PyImport_ImportModuleNoBlock("io"); if (io == NULL) goto cleanup; fd = fileno(tok->fp); - if (lseek(fd, 0, SEEK_SET) == (off_t)-1) { + /* Due to buffering the file offset for fd can be different from the file + * position of tok->fp. */ + pos = ftell(tok->fp); + if (pos == -1 || lseek(fd, (off_t)pos, SEEK_SET) == (off_t)-1) { PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL); goto cleanup; } @@ -752,7 +770,7 @@ if (newl[0]) { if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) return error_ret(tok); - if (tok->enc == NULL && newl[1]) { + if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) { if (!check_coding_spec(newl[0]+1, newl[1] - newl[0], tok, buf_setreadl)) return error_ret(tok); diff -r ae69ebd41807 Tools/scripts/findnocoding.py --- a/Tools/scripts/findnocoding.py Fri Jan 03 15:52:22 2014 -0500 +++ b/Tools/scripts/findnocoding.py Sat Jan 04 10:35:59 2014 +0200 @@ -33,6 +33,7 @@ decl_re = re.compile(rb'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)') +blank_re = re.compile(rb'^[ \t\f]*(?:[#\r\n]|$)') def get_declaration(line): match = decl_re.match(line) @@ -58,7 +59,8 @@ line1 = infile.readline() line2 = infile.readline() - if get_declaration(line1) or get_declaration(line2): + if (get_declaration(line1) or + blank_re.match(line1) and get_declaration(line2)): # the file does have an encoding declaration, so trust it return False