Index: Doc/library/tokenize.rst
===================================================================
--- Doc/library/tokenize.rst	(revision 84486)
+++ Doc/library/tokenize.rst	(working copy)
@@ -83,7 +83,7 @@
 :func:`tokenize` needs to detect the encoding of source files it tokenizes. The
 function it uses to do this is available:
 
-.. function:: detect_encoding(readline)
+.. function:: detect_encoding(readline, default="utf-8")
 
    The :func:`detect_encoding` function is used to detect the encoding that
    should be used to decode a Python source file. It requires one argument,
@@ -98,8 +98,8 @@
    but disagree, a SyntaxError will be raised. Note that if the BOM is found,
    ``'utf-8-sig'`` will be returned as an encoding.
 
-   If no encoding is specified, then the default of ``'utf-8'`` will be
-   returned.
+   If no encoding is specified, then the default value will be returned
+   (default: ``'utf-8'``).
 
    :func:`detect_encoding` is useful for robustly reading Python source files.
    A common pattern for this follows::
@@ -130,7 +130,7 @@
        we're only showing 12 digits, and the 13th isn't close to 5, the rest of
        the output should be platform-independent.
 
-       >>> exec(s) #doctest: +ELLIPSIS
+       >>> exec(s)  #doctest: +ELLIPSIS
        -3.21716034272e-0...7
 
        Output from calculations with Decimal should be identical across all
@@ -140,7 +140,7 @@
        -3.217160342717258261933904529E-7
        """
        result = []
-       g = tokenize(BytesIO(s.encode('utf-8')).readline) # tokenize the string
+       g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
        for toknum, tokval, _, _, _ in g:
            if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
                result.extend([
Index: Lib/tokenize.py
===================================================================
--- Lib/tokenize.py	(revision 84486)
+++ Lib/tokenize.py	(working copy)
@@ -297,11 +297,11 @@
         return "iso-8859-1"
     return orig_enc
 
-def detect_encoding(readline):
+def detect_encoding(readline, default='utf-8'):
     """
     The detect_encoding() function is used to detect the encoding that should
-    be used to decode a Python source file. It requires one argment, readline,
-    in the same way as the tokenize() generator.
+    be used to decode a Python source file. It requires one argument,
+    readline, in the same way as the tokenize() generator.
 
     It will call readline a maximum of twice, and return the encoding used
     (as a string) and a list of any lines (left as bytes) it has read in.
@@ -312,11 +312,11 @@
     invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
     'utf-8-sig' is returned.
 
-    If no encoding is specified, then the default of 'utf-8' will be returned.
+    If no encoding is specified, then the default value will be returned
+    (default: 'utf-8').
     """
     bom_found = False
     encoding = None
-    default = 'utf-8'
     def read_or_stop():
         try:
             return readline()
Index: Lib/test/test_tokenize.py
===================================================================
--- Lib/test/test_tokenize.py	(revision 84486)
+++ Lib/test/test_tokenize.py	(working copy)
@@ -720,6 +720,15 @@
             return line
         return readline
 
+    def _check_detected(self, lines, encoding, consumed_lines):
+        # Check if the encoding is really declared in the file,
+        # with a BOM or a coding cookie.
+        for default in None, '', 42:
+            declared = default if (encoding is None) else encoding
+            found, consumed = detect_encoding(self.get_readline(lines), default)
+            self.assertEquals(found, declared)
+            self.assertEquals(consumed, consumed_lines)
+
     def test_no_bom_no_encoding_cookie(self):
         lines = (
             b'# something\n',
@@ -729,6 +738,7 @@
         encoding, consumed_lines = detect_encoding(self.get_readline(lines))
         self.assertEquals(encoding, 'utf-8')
         self.assertEquals(consumed_lines, list(lines[:2]))
+        self._check_detected(lines, None, consumed_lines)
 
     def test_bom_no_cookie(self):
         lines = (
@@ -740,6 +750,7 @@
         self.assertEquals(encoding, 'utf-8-sig')
         self.assertEquals(consumed_lines, [b'# something\n',
                                            b'print(something)\n'])
+        self._check_detected(lines, encoding, consumed_lines)
 
     def test_cookie_first_line_no_bom(self):
         lines = (
@@ -750,6 +761,7 @@
         encoding, consumed_lines = detect_encoding(self.get_readline(lines))
         self.assertEquals(encoding, 'iso-8859-1')
         self.assertEquals(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])
+        self._check_detected(lines, encoding, consumed_lines)
 
     def test_matched_bom_and_cookie_first_line(self):
         lines = (
@@ -760,6 +772,7 @@
         encoding, consumed_lines = detect_encoding(self.get_readline(lines))
         self.assertEquals(encoding, 'utf-8-sig')
         self.assertEquals(consumed_lines, [b'# coding=utf-8\n'])
+        self._check_detected(lines, encoding, consumed_lines)
 
     def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
         lines = (
@@ -769,6 +782,8 @@
         )
         readline = self.get_readline(lines)
         self.assertRaises(SyntaxError, detect_encoding, readline)
+        readline = self.get_readline(lines)
+        self.assertRaises(SyntaxError, detect_encoding, readline, 'ascii')
 
     def test_cookie_second_line_no_bom(self):
         lines = (
@@ -781,6 +796,7 @@
         self.assertEquals(encoding, 'ascii')
         expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
         self.assertEquals(consumed_lines, expected)
+        self._check_detected(lines, encoding, consumed_lines)
 
     def test_matched_bom_and_cookie_second_line(self):
         lines = (
@@ -793,6 +809,7 @@
         self.assertEquals(encoding, 'utf-8-sig')
         self.assertEquals(consumed_lines, [b'#! something\n',
                                            b'f# coding=utf-8\n'])
+        self._check_detected(lines, encoding, consumed_lines)
 
     def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
         lines = (
@@ -803,6 +820,8 @@
         )
         readline = self.get_readline(lines)
         self.assertRaises(SyntaxError, detect_encoding, readline)
+        readline = self.get_readline(lines)
+        self.assertRaises(SyntaxError, detect_encoding, readline, 'ascii')
 
     def test_latin1_normalization(self):
         # See get_normal_name() in tokenizer.c.
@@ -818,6 +837,7 @@
             rl = self.get_readline(lines)
             found, consumed_lines = detect_encoding(rl)
             self.assertEquals(found, "iso-8859-1")
+            self._check_detected(lines, found, consumed_lines)
 
     def test_utf8_normalization(self):
         # See get_normal_name() in tokenizer.c.
@@ -831,29 +851,37 @@
             rl = self.get_readline(lines)
             found, consumed_lines = detect_encoding(rl)
             self.assertEquals(found, "utf-8")
+            self._check_detected(lines, found, consumed_lines)
 
     def test_short_files(self):
         readline = self.get_readline((b'print(something)\n',))
         encoding, consumed_lines = detect_encoding(readline)
         self.assertEquals(encoding, 'utf-8')
         self.assertEquals(consumed_lines, [b'print(something)\n'])
+        self._check_detected([b'print(something)\n'], None, consumed_lines)
 
         encoding, consumed_lines = detect_encoding(self.get_readline(()))
         self.assertEquals(encoding, 'utf-8')
         self.assertEquals(consumed_lines, [])
+        self._check_detected((), None, consumed_lines)
 
         readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
         encoding, consumed_lines = detect_encoding(readline)
         self.assertEquals(encoding, 'utf-8-sig')
         self.assertEquals(consumed_lines, [b'print(something)\n'])
+        self._check_detected((b'\xef\xbb\xbfprint(something)\n',),
+                             encoding, consumed_lines)
 
         readline = self.get_readline((b'\xef\xbb\xbf',))
         encoding, consumed_lines = detect_encoding(readline)
         self.assertEquals(encoding, 'utf-8-sig')
         self.assertEquals(consumed_lines, [])
+        self._check_detected((b'\xef\xbb\xbf',), encoding, consumed_lines)
 
         readline = self.get_readline((b'# coding: bad\n',))
         self.assertRaises(SyntaxError, detect_encoding, readline)
+        readline = self.get_readline((b'# coding: bad\n',))
+        self.assertRaises(SyntaxError, detect_encoding, readline, 'ascii')
 
 
 class TestTokenize(TestCase):
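For review purposes, a minimal usage sketch of the `default` parameter this patch adds to tokenize.detect_encoding(); it assumes the patched module is in use, and "example.py" is only a placeholder path:

    import tokenize

    # Minimal sketch, assuming this patch is applied: detect_encoding() accepts
    # a second argument that is returned when the source declares neither a BOM
    # nor a coding cookie.  "example.py" is a placeholder path.
    with open("example.py", "rb") as f:
        encoding, consumed_lines = tokenize.detect_encoding(f.readline, "latin-1")

    print(encoding)        # declared encoding, or 'latin-1' if none was declared
    print(consumed_lines)  # raw lines (as bytes) read while detecting the encoding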