diff -r 3a57eafd8401 Lib/test/test_tokenize.py
--- a/Lib/test/test_tokenize.py	Tue May 24 09:15:14 2016 +0300
+++ b/Lib/test/test_tokenize.py	Tue May 24 22:53:56 2016 +0200
@@ -1,14 +1,13 @@
 from test import support
-from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
-                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
-                      open as tokenize_open, Untokenizer)
+import tokenize
 from io import BytesIO
-from unittest import TestCase, mock
+import unittest
+from unittest import mock
 import os
 import token
 
 
-class TokenizeTest(TestCase):
+class TokenizeTest(unittest.TestCase):
     # Tests for the tokenize module.
 
     # The tests can be really simple. Given a small fragment of source
@@ -20,10 +19,10 @@
         # The ENDMARKER is omitted.
         result = []
         f = BytesIO(s.encode('utf-8'))
-        for type, token, start, end, line in tokenize(f.readline):
-            if type == ENDMARKER:
+        for type, token, start, end, line in tokenize.tokenize(f.readline):
+            if type == tokenize.ENDMARKER:
                 break
-            type = tok_name[type]
+            type = tokenize.tok_name[type]
             result.append(f"    {type:10} {token!r:13} {start} {end}")
         self.assertEqual(result,
                          ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
@@ -61,7 +60,7 @@
         with self.assertRaisesRegex(IndentationError,
                                     "unindent does not match any "
                                     "outer indentation level"):
-            for tok in tokenize(readline):
+            for tok in tokenize.tokenize(readline):
                 pass
 
     def test_int(self):
@@ -883,20 +882,22 @@
 
 def decistmt(s):
     result = []
-    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
+    # Tokenize the string.
+    g = tokenize.tokenize(BytesIO(s.encode('utf-8')).readline)
     for toknum, tokval, _, _, _ in g:
-        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
+        # Replace NUMBER tokens.
+        if toknum == tokenize.NUMBER and '.' in tokval:
             result.extend([
-                (NAME, 'Decimal'),
-                (OP, '('),
-                (STRING, repr(tokval)),
-                (OP, ')')
+                (tokenize.NAME, 'Decimal'),
+                (tokenize.OP, '('),
+                (tokenize.STRING, repr(tokval)),
+                (tokenize.OP, ')')
             ])
         else:
             result.append((toknum, tokval))
-    return untokenize(result).decode('utf-8')
+    return tokenize.untokenize(result).decode('utf-8')
 
-class TestMisc(TestCase):
+class TestMisc(unittest.TestCase):
 
     def test_decistmt(self):
         # Substitute Decimals for floats in a string of statements.
@@ -919,7 +920,7 @@
                          Decimal('-3.217160342717258261933904529E-7'))
 
 
-class TestTokenizerAdheresToPep0263(TestCase):
+class TestTokenizerAdheresToPep0263(unittest.TestCase):
     """
     Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
     """
@@ -956,7 +957,7 @@
         self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')
 
 
-class Test_Tokenize(TestCase):
+class Test_Tokenize(unittest.TestCase):
 
     def test__tokenize_decodes_with_specified_encoding(self):
         literal = '"ЉЊЈЁЂ"'
@@ -971,7 +972,7 @@
                 return b''
 
         # skip the initial encoding token and the end token
-        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
+        tokens = list(tokenize._tokenize(readline, encoding='utf-8'))[1:-1]
         expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")
@@ -988,13 +989,13 @@
                 return b''
 
         # skip the end token
-        tokens = list(_tokenize(readline, encoding=None))[:-1]
+        tokens = list(tokenize._tokenize(readline, encoding=None))[:-1]
         expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "string not tokenized when encoding is None")
 
 
-class TestDetectEncoding(TestCase):
+class TestDetectEncoding(unittest.TestCase):
 
     def get_readline(self, lines):
         index = 0
@@ -1013,7 +1014,9 @@
             b'print(something)\n',
             b'do_something(else)\n'
         )
-        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        encoding, consumed_lines = tokenize.detect_encoding(
+            self.get_readline(lines),
+        )
         self.assertEqual(encoding, 'utf-8')
         self.assertEqual(consumed_lines, list(lines[:2]))
 
@@ -1023,7 +1026,9 @@
             b'print(something)\n',
             b'do_something(else)\n'
         )
-        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        encoding, consumed_lines = tokenize.detect_encoding(
+            self.get_readline(lines),
+        )
         self.assertEqual(encoding, 'utf-8-sig')
         self.assertEqual(consumed_lines,
                          [b'# something\n', b'print(something)\n'])
@@ -1034,7 +1039,9 @@
             b'print(something)\n',
             b'do_something(else)\n'
         )
-        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        encoding, consumed_lines = tokenize.detect_encoding(
+            self.get_readline(lines),
+        )
         self.assertEqual(encoding, 'iso-8859-1')
         self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])
 
@@ -1044,7 +1051,9 @@
             b'print(something)\n',
             b'do_something(else)\n'
         )
-        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        encoding, consumed_lines = tokenize.detect_encoding(
+            self.get_readline(lines),
+        )
         self.assertEqual(encoding, 'utf-8-sig')
         self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])
 
@@ -1055,7 +1064,7 @@
             b'do_something(else)\n'
         )
         readline = self.get_readline(lines)
-        self.assertRaises(SyntaxError, detect_encoding, readline)
+        self.assertRaises(SyntaxError, tokenize.detect_encoding, readline)
 
     def test_cookie_second_line_no_bom(self):
         lines = (
@@ -1064,7 +1073,9 @@
             b'print(something)\n',
             b'do_something(else)\n'
         )
-        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        encoding, consumed_lines = tokenize.detect_encoding(
+            self.get_readline(lines),
+        )
         self.assertEqual(encoding, 'ascii')
         expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
         self.assertEqual(consumed_lines, expected)
@@ -1076,7 +1087,9 @@
             b'print(something)\n',
             b'do_something(else)\n'
         )
-        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        encoding, consumed_lines = tokenize.detect_encoding(
+            self.get_readline(lines),
+        )
         self.assertEqual(encoding, 'utf-8-sig')
         self.assertEqual(consumed_lines, [b'#! something\n', b'f# coding=utf-8\n'])
 
@@ -1089,7 +1102,7 @@
             b'do_something(else)\n'
         )
         readline = self.get_readline(lines)
-        self.assertRaises(SyntaxError, detect_encoding, readline)
+        self.assertRaises(SyntaxError, tokenize.detect_encoding, readline)
 
     def test_cookie_second_line_noncommented_first_line(self):
         lines = (
@@ -1097,7 +1110,9 @@
             b'# vim: set fileencoding=iso8859-15 :\n',
             b"print('\xe2\x82\xac')\n"
         )
-        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        encoding, consumed_lines = tokenize.detect_encoding(
+            self.get_readline(lines),
+        )
         self.assertEqual(encoding, 'utf-8')
         expected = [b"print('\xc2\xa3')\n"]
         self.assertEqual(consumed_lines, expected)
@@ -1108,7 +1123,9 @@
             b'# vim: set fileencoding=iso8859-15 :\n',
             b"print('\xe2\x82\xac')\n"
         )
-        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        encoding, consumed_lines = tokenize.detect_encoding(
+            self.get_readline(lines),
+        )
         self.assertEqual(encoding, 'iso8859-15')
         expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
         self.assertEqual(consumed_lines, expected)
@@ -1119,7 +1136,9 @@
             b'# vim: set fileencoding=iso8859-15 :\n',
             b"print('\xe2\x82\xac')\n"
         )
-        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        encoding, consumed_lines = tokenize.detect_encoding(
+            self.get_readline(lines),
+        )
         self.assertEqual(encoding, 'iso8859-15')
         expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
         self.assertEqual(consumed_lines, expected)
@@ -1136,7 +1155,7 @@
                          b"print(things)\n",
                          b"do_something += 4\n")
                 rl = self.get_readline(lines)
-                found, consumed_lines = detect_encoding(rl)
+                found, consumed_lines = tokenize.detect_encoding(rl)
                 self.assertEqual(found, "iso-8859-1")
 
     def test_syntaxerror_latin1(self):
@@ -1146,7 +1165,7 @@
             b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
             )
         readline = self.get_readline(lines)
-        self.assertRaises(SyntaxError, detect_encoding, readline)
+        self.assertRaises(SyntaxError, tokenize.detect_encoding, readline)
 
 
     def test_utf8_normalization(self):
@@ -1159,36 +1178,36 @@
                          b"# coding: " + enc.encode("ascii") + b"\n",
                          b"1 + 3\n")
                 rl = self.get_readline(lines)
-                found, consumed_lines = detect_encoding(rl)
+                found, consumed_lines = tokenize.detect_encoding(rl)
                 self.assertEqual(found, "utf-8")
 
     def test_short_files(self):
         readline = self.get_readline((b'print(something)\n',))
-        encoding, consumed_lines = detect_encoding(readline)
+        encoding, consumed_lines = tokenize.detect_encoding(readline)
         self.assertEqual(encoding, 'utf-8')
         self.assertEqual(consumed_lines, [b'print(something)\n'])
 
-        encoding, consumed_lines = detect_encoding(self.get_readline(()))
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(()))
         self.assertEqual(encoding, 'utf-8')
         self.assertEqual(consumed_lines, [])
 
         readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
-        encoding, consumed_lines = detect_encoding(readline)
+        encoding, consumed_lines = tokenize.detect_encoding(readline)
         self.assertEqual(encoding, 'utf-8-sig')
         self.assertEqual(consumed_lines, [b'print(something)\n'])
 
         readline = self.get_readline((b'\xef\xbb\xbf',))
-        encoding, consumed_lines = detect_encoding(readline)
+        encoding, consumed_lines = tokenize.detect_encoding(readline)
         self.assertEqual(encoding, 'utf-8-sig')
         self.assertEqual(consumed_lines, [])
 
         readline = self.get_readline((b'# coding: bad\n',))
-        self.assertRaises(SyntaxError, detect_encoding, readline)
+        self.assertRaises(SyntaxError, tokenize.detect_encoding, readline)
 
     def test_false_encoding(self):
         # Issue 18873: "Encoding" detected in non-comment lines
         readline = self.get_readline((b'print("#coding=fake")',))
-        encoding, consumed_lines = detect_encoding(readline)
+        encoding, consumed_lines = tokenize.detect_encoding(readline)
         self.assertEqual(encoding, 'utf-8')
         self.assertEqual(consumed_lines, [b'print("#coding=fake")'])
 
@@ -1201,14 +1220,14 @@
             with open(filename, 'w', encoding=encoding) as fp:
                 print("# coding: %s" % encoding, file=fp)
                 print("print('euro:\u20ac')", file=fp)
-            with tokenize_open(filename) as fp:
+            with tokenize.open(filename) as fp:
                 self.assertEqual(fp.encoding, encoding)
                 self.assertEqual(fp.mode, 'r')
 
         # test BOM (no coding cookie)
         with open(filename, 'w', encoding='utf-8-sig') as fp:
             print("print('euro:\u20ac')", file=fp)
-        with tokenize_open(filename) as fp:
+        with tokenize.open(filename) as fp:
             self.assertEqual(fp.encoding, 'utf-8-sig')
             self.assertEqual(fp.mode, 'r')
 
@@ -1235,23 +1254,22 @@
         ins = Bunk(lines, path)
         # Make sure lacking a name isn't an issue.
         del ins.name
-        detect_encoding(ins.readline)
+        tokenize.detect_encoding(ins.readline)
         with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
             ins = Bunk(lines, path)
-            detect_encoding(ins.readline)
+            tokenize.detect_encoding(ins.readline)
 
     def test_open_error(self):
         # Issue #23840: open() must close the binary file on error
         m = BytesIO(b'#coding:xxx')
         with mock.patch('tokenize._builtin_open', return_value=m):
-            self.assertRaises(SyntaxError, tokenize_open, 'foobar')
+            self.assertRaises(SyntaxError, tokenize.open, 'foobar')
         self.assertTrue(m.closed)
 
 
-class TestTokenize(TestCase):
+class TestTokenize(unittest.TestCase):
 
     def test_tokenize(self):
-        import tokenize as tokenize_module
         encoding = object()
         encoding_used = None
         def mock_detect_encoding(readline):
@@ -1276,17 +1294,17 @@
                 return b''
             return str(counter).encode()
 
-        orig_detect_encoding = tokenize_module.detect_encoding
-        orig__tokenize = tokenize_module._tokenize
-        tokenize_module.detect_encoding = mock_detect_encoding
-        tokenize_module._tokenize = mock__tokenize
+        orig_detect_encoding = tokenize.detect_encoding
+        orig__tokenize = tokenize._tokenize
+        tokenize.detect_encoding = mock_detect_encoding
+        tokenize._tokenize = mock__tokenize
         try:
-            results = tokenize(mock_readline)
+            results = tokenize.tokenize(mock_readline)
             self.assertEqual(list(results),
                              [b'first', b'second', b'1', b'2', b'3', b'4'])
         finally:
-            tokenize_module.detect_encoding = orig_detect_encoding
-            tokenize_module._tokenize = orig__tokenize
+            tokenize.detect_encoding = orig_detect_encoding
+            tokenize._tokenize = orig__tokenize
 
         self.assertTrue(encoding_used, encoding)
 
@@ -1298,15 +1316,17 @@
         buf = '\n'.join(buf)
 
         # Test that 500 consequent, one-line defs is OK
-        toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
+        toks = list(tokenize.tokenize(BytesIO(buf.encode('utf-8')).readline))
         self.assertEqual(toks[-2].string, 'OK') # [-1] is always ENDMARKER
 
     def assertExactTypeEqual(self, opstr, *optypes):
-        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
+        tokens = list(
+            tokenize.tokenize(BytesIO(opstr.encode('utf-8')).readline)
+        )
         num_optypes = len(optypes)
         self.assertEqual(len(tokens), 2 + num_optypes)
         self.assertEqual(token.tok_name[tokens[0].exact_type],
-                         token.tok_name[ENCODING])
+                         token.tok_name[tokenize.ENCODING])
         for i in range(num_optypes):
             self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
                              token.tok_name[optypes[i]])
@@ -1358,11 +1378,11 @@
         self.assertExactTypeEqual('@=', token.ATEQUAL)
 
         self.assertExactTypeEqual('a**2+b**2==c**2',
-                                  NAME, token.DOUBLESTAR, NUMBER,
-                                  token.PLUS,
-                                  NAME, token.DOUBLESTAR, NUMBER,
-                                  token.EQEQUAL,
-                                  NAME, token.DOUBLESTAR, NUMBER)
+                                  tokenize.NAME, token.DOUBLESTAR,
+                                  tokenize.NUMBER, token.PLUS, tokenize.NAME,
+                                  token.DOUBLESTAR, tokenize.NUMBER,
+                                  token.EQEQUAL, tokenize.NAME,
+                                  token.DOUBLESTAR, tokenize.NUMBER)
 
         self.assertExactTypeEqual('{1, 2, 3}',
                                   token.LBRACE, token.NUMBER, token.COMMA,
@@ -1380,11 +1400,11 @@
         self.assertExactTypeEqual('@ ', token.AT)
 
 
-class UntokenizeTest(TestCase):
+class UntokenizeTest(unittest.TestCase):
 
     def test_bad_input_order(self):
         # raise if previous row
-        u = Untokenizer()
+        u = tokenize.Untokenizer()
         u.prev_row = 2
         u.prev_col = 2
         with self.assertRaises(ValueError) as cm:
@@ -1396,7 +1416,7 @@
 
     def test_backslash_continuation(self):
         # The problem is that \ leaves no token
-        u = Untokenizer()
+        u = tokenize.Untokenizer()
         u.prev_row = 1
         u.prev_col = 1
         u.tokens = []
@@ -1408,20 +1428,20 @@
         TestRoundtrip.check_roundtrip(self, 'a\n  b\n    c\n  \\\n  c\n')
 
     def test_iter_compat(self):
-        u = Untokenizer()
-        token = (NAME, 'Hello')
-        tokens = [(ENCODING, 'utf-8'), token]
+        u = tokenize.Untokenizer()
+        token = (tokenize.NAME, 'Hello')
+        tokens = [(tokenize.ENCODING, 'utf-8'), token]
         u.compat(token, iter([]))
         self.assertEqual(u.tokens, ["Hello "])
-        u = Untokenizer()
+        u = tokenize.Untokenizer()
         self.assertEqual(u.untokenize(iter([token])), 'Hello ')
-        u = Untokenizer()
+        u = tokenize.Untokenizer()
         self.assertEqual(u.untokenize(iter(tokens)), 'Hello ')
         self.assertEqual(u.encoding, 'utf-8')
-        self.assertEqual(untokenize(iter(tokens)), b'Hello ')
+        self.assertEqual(tokenize.untokenize(iter(tokens)), b'Hello ')
 
 
-class TestRoundtrip(TestCase):
+class TestRoundtrip(unittest.TestCase):
 
     def check_roundtrip(self, f):
         """
@@ -1442,17 +1462,17 @@
             code = f.read()
             f.close()
         readline = iter(code.splitlines(keepends=True)).__next__
-        tokens5 = list(tokenize(readline))
+        tokens5 = list(tokenize.tokenize(readline))
         tokens2 = [tok[:2] for tok in tokens5]
         # Reproduce tokens2 from pairs
-        bytes_from2 = untokenize(tokens2)
+        bytes_from2 = tokenize.untokenize(tokens2)
         readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__
-        tokens2_from2 = [tok[:2] for tok in tokenize(readline2)]
+        tokens2_from2 = [tok[:2] for tok in tokenize.tokenize(readline2)]
         self.assertEqual(tokens2_from2, tokens2)
         # Reproduce tokens2 from 5-tuples
-        bytes_from5 = untokenize(tokens5)
+        bytes_from5 = tokenize.untokenize(tokens5)
         readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__
-        tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
+        tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)]
         self.assertEqual(tokens2_from5, tokens2)
 
     def test_roundtrip(self):
@@ -1531,7 +1551,7 @@
 
         # Tokenize is broken on test_pep3131.py because regular expressions are
         # broken on the obscure unicode identifiers in it. *sigh*
-        # With roundtrip extended to test the 5-tuple mode of untokenize, 
+        # With roundtrip extended to test the 5-tuple mode of untokenize,
         # 7 more testfiles fail. Remove them also until the failure is diagnosed.
 
         testfiles.remove(os.path.join(tempdir, "test_pep3131.py"))
@@ -1550,7 +1570,10 @@
     def roundtrip(self, code):
         if isinstance(code, str):
             code = code.encode('utf-8')
-        return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')
+        return (
+            tokenize.untokenize(tokenize.tokenize(BytesIO(code).readline))
+            .decode('utf-8')
+        )
 
     def test_indentation_semantics_retained(self):
         """
@@ -1563,5 +1586,26 @@
         self.check_roundtrip(code)
 
 
+class MiscTestCase(unittest.TestCase):
+    def test__all__(self):
+        extra = {
+            "ISTERMINAL", "ISNONTERMINAL", "ISEOF"
+        }
+        blacklist = {
+            "BOM_UTF8", "cookie_re", "blank_re", "EXACT_TOKEN_TYPES",
+            "group", "any", "maybe",
+            "Whitespace", "Comment", "Ignore", "Name",
+            "Hexnumber", "Binnumber", "Octnumber", "Decnumber", "Intnumber",
+            "Exponent", "Pointfloat", "Expfloat", "Floatnumber", "Imagnumber",
+            "Number",
+            "StringPrefix", "Single", "Double", "Single3", "Double3", "Triple",
+            "String", "Operator", "Bracket", "Special", "Funny", "PlainToken",
+            "Token", "ContStr", "PseudoToken", "PseudoExtras",
+            "endpats", "single_quoted", "triple_quoted", "t", "u", "tabsize",
+            "StopTokenizing", "Untokenizer", "generate_tokens", "main",
+        }
+        support.check__all__(self, tokenize, extra=extra, blacklist=blacklist)
+
+
 if __name__ == "__main__":
     unittest.main()
diff -r 3a57eafd8401 Lib/tokenize.py
--- a/Lib/tokenize.py	Tue May 24 09:15:14 2016 +0300
+++ b/Lib/tokenize.py	Tue May 24 22:53:56 2016 +0200
@@ -38,8 +38,10 @@
 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 import token
-__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
-                           "NL", "untokenize", "ENCODING", "TokenInfo"]
+__all__ = token.__all__ + [
+    "COMMENT", "NL", "ENCODING", "TokenInfo", "TokenError", "detect_encoding",
+    "untokenize", "open", "tokenize"
+]
 del token
 
 COMMENT = N_TOKENS
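
Note (not part of the patch): the new MiscTestCase.test__all__ relies on test.support.check__all__ to compare tokenize.__all__ against the names the module actually exposes, with the extra and blacklist sets adjusting the expected list of public names. A rough standalone sketch of the weaker half of that property -- every name advertised in __all__ must actually resolve to an attribute of the module -- is shown below; the helper name is illustrative only and is not part of the patch or the stdlib.

    import tokenize

    def names_missing_from_module(module):
        # Names advertised in __all__ that the module does not actually define.
        return [name for name in module.__all__ if not hasattr(module, name)]

    if __name__ == "__main__":
        missing = names_missing_from_module(tokenize)
        # With the patch applied, "open", "TokenError", etc. are exported and
        # every entry in tokenize.__all__ resolves to a real attribute.
        assert not missing, "in __all__ but undefined: %r" % (missing,)
        print("tokenize.__all__ is consistent (%d names)" % len(tokenize.__all__))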