diff -r 3102951cc1ce Doc/library/tokenize.rst
--- a/Doc/library/tokenize.rst Wed Aug 31 00:43:55 2011 +0200
+++ b/Doc/library/tokenize.rst Tue Aug 30 23:21:04 2011 -0500
@@ -116,6 +116,13 @@
 
    .. versionadded:: 3.2
 
+.. function:: tokenize_basestring(source)
+
+   Tokenizes *source*, which must be a :class:`str` or :class:`bytes` object.
+
+   .. versionadded:: 3.3
+
+
 Example of a script rewriter that transforms float literals into Decimal
 objects::
 
diff -r 3102951cc1ce Lib/test/test_tokenize.py
--- a/Lib/test/test_tokenize.py Wed Aug 31 00:43:55 2011 +0200
+++ b/Lib/test/test_tokenize.py Tue Aug 30 23:21:04 2011 -0500
@@ -1,17 +1,17 @@
-doctests = """
+doctests_template = """
 Tests for the tokenize module.
 
 The tests can be really simple. Given a small fragment of source
 code, print out a table with tokens. The ENDMARK is omitted for
 brevity.
 
-    >>> dump_tokens("1 + 1")
+    >>> {dump_tokens}("1 + 1")
     ENCODING 'utf-8' (0, 0) (0, 0)
     NUMBER '1' (1, 0) (1, 1)
     OP '+' (1, 2) (1, 3)
     NUMBER '1' (1, 4) (1, 5)
 
-    >>> dump_tokens("if False:\\n"
+    >>> {dump_tokens}("if False:\\n"
     ... " # NL\\n"
     ... " True = False # NEWLINE\\n")
     ENCODING 'utf-8' (0, 0) (0, 0)
@@ -96,8 +96,8 @@
     ... "5,6)\\n"
     ... "y = [3, 4,\\n"
     ... "5]\\n"
-    ... "z = {'a': 5,\\n"
-    ... "'b':15, 'c':True}\\n"
+    ... "z = {{'a': 5,\\n"
+    ... "'b':15, 'c':True}}\\n"
     ... "x = len(y) + 5 - a[\\n"
     ... "3] - a[2]\\n"
     ... "+ len(z) - z[\\n"
@@ -106,33 +106,33 @@
 
 Ordinary integers and binary operators
 
-    >>> dump_tokens("0xff <= 255")
+    >>> {dump_tokens}("0xff <= 255")
     ENCODING 'utf-8' (0, 0) (0, 0)
     NUMBER '0xff' (1, 0) (1, 4)
     OP '<=' (1, 5) (1, 7)
     NUMBER '255' (1, 8) (1, 11)
-    >>> dump_tokens("0b10 <= 255")
+    >>> {dump_tokens}("0b10 <= 255")
     ENCODING 'utf-8' (0, 0) (0, 0)
     NUMBER '0b10' (1, 0) (1, 4)
     OP '<=' (1, 5) (1, 7)
     NUMBER '255' (1, 8) (1, 11)
-    >>> dump_tokens("0o123 <= 0O123")
+    >>> {dump_tokens}("0o123 <= 0O123")
     ENCODING 'utf-8' (0, 0) (0, 0)
     NUMBER '0o123' (1, 0) (1, 5)
     OP '<=' (1, 6) (1, 8)
     NUMBER '0O123' (1, 9) (1, 14)
-    >>> dump_tokens("1234567 > ~0x15")
+    >>> {dump_tokens}("1234567 > ~0x15")
     ENCODING 'utf-8' (0, 0) (0, 0)
     NUMBER '1234567' (1, 0) (1, 7)
     OP '>' (1, 8) (1, 9)
     OP '~' (1, 10) (1, 11)
     NUMBER '0x15' (1, 11) (1, 15)
-    >>> dump_tokens("2134568 != 1231515")
+    >>> {dump_tokens}("2134568 != 1231515")
     ENCODING 'utf-8' (0, 0) (0, 0)
     NUMBER '2134568' (1, 0) (1, 7)
     OP '!=' (1, 8) (1, 10)
     NUMBER '1231515' (1, 11) (1, 18)
-    >>> dump_tokens("(-124561-1) & 200000000")
+    >>> {dump_tokens}("(-124561-1) & 200000000")
     ENCODING 'utf-8' (0, 0) (0, 0)
     OP '(' (1, 0) (1, 1)
     OP '-' (1, 1) (1, 2)
@@ -142,18 +142,18 @@
     OP ')' (1, 10) (1, 11)
     OP '&' (1, 12) (1, 13)
     NUMBER '200000000' (1, 14) (1, 23)
-    >>> dump_tokens("0xdeadbeef != -1")
+    >>> {dump_tokens}("0xdeadbeef != -1")
     ENCODING 'utf-8' (0, 0) (0, 0)
     NUMBER '0xdeadbeef' (1, 0) (1, 10)
     OP '!=' (1, 11) (1, 13)
     OP '-' (1, 14) (1, 15)
     NUMBER '1' (1, 15) (1, 16)
-    >>> dump_tokens("0xdeadc0de & 12345")
+    >>> {dump_tokens}("0xdeadc0de & 12345")
     ENCODING 'utf-8' (0, 0) (0, 0)
     NUMBER '0xdeadc0de' (1, 0) (1, 10)
     OP '&' (1, 11) (1, 12)
     NUMBER '12345' (1, 13) (1, 18)
-    >>> dump_tokens("0xFF & 0x15 | 1234")
+    >>> {dump_tokens}("0xFF & 0x15 | 1234")
     ENCODING 'utf-8' (0, 0) (0, 0)
     NUMBER '0xFF' (1, 0) (1, 4)
     OP '&' (1, 5) (1, 6)
@@ -163,22 +163,22 @@
 
 Long integers
 
-    >>> dump_tokens("x = 0")
+    >>> {dump_tokens}("x = 0")
     ENCODING 'utf-8' (0, 0) (0, 0)
     NAME 'x' (1, 0) (1, 1)
     OP '=' (1, 2) (1, 3)
     NUMBER '0' (1, 4) (1, 5)
-    >>> dump_tokens("x = 0xfffffffffff")
+    >>> {dump_tokens}("x = 0xfffffffffff")
     ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'x' (1, 0) (1, 1)
    OP '=' (1, 2) (1, 3)
    NUMBER '0xffffffffff (1, 4) (1, 17)
-    >>> dump_tokens("x = 123141242151251616110")
+    >>> {dump_tokens}("x = 123141242151251616110")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'x' (1, 0) (1, 1)
    OP '=' (1, 2) (1, 3)
    NUMBER '123141242151 (1, 4) (1, 25)
-    >>> dump_tokens("x = -15921590215012591")
+    >>> {dump_tokens}("x = -15921590215012591")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'x' (1, 0) (1, 1)
    OP '=' (1, 2) (1, 3)
@@ -187,39 +187,39 @@
 
 Floating point numbers
 
-    >>> dump_tokens("x = 3.14159")
+    >>> {dump_tokens}("x = 3.14159")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'x' (1, 0) (1, 1)
    OP '=' (1, 2) (1, 3)
    NUMBER '3.14159' (1, 4) (1, 11)
-    >>> dump_tokens("x = 314159.")
+    >>> {dump_tokens}("x = 314159.")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'x' (1, 0) (1, 1)
    OP '=' (1, 2) (1, 3)
    NUMBER '314159.' (1, 4) (1, 11)
-    >>> dump_tokens("x = .314159")
+    >>> {dump_tokens}("x = .314159")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'x' (1, 0) (1, 1)
    OP '=' (1, 2) (1, 3)
    NUMBER '.314159' (1, 4) (1, 11)
-    >>> dump_tokens("x = 3e14159")
+    >>> {dump_tokens}("x = 3e14159")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'x' (1, 0) (1, 1)
    OP '=' (1, 2) (1, 3)
    NUMBER '3e14159' (1, 4) (1, 11)
-    >>> dump_tokens("x = 3E123")
+    >>> {dump_tokens}("x = 3E123")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'x' (1, 0) (1, 1)
    OP '=' (1, 2) (1, 3)
    NUMBER '3E123' (1, 4) (1, 9)
-    >>> dump_tokens("x+y = 3e-1230")
+    >>> {dump_tokens}("x+y = 3e-1230")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'x' (1, 0) (1, 1)
    OP '+' (1, 1) (1, 2)
    NAME 'y' (1, 2) (1, 3)
    OP '=' (1, 4) (1, 5)
    NUMBER '3e-1230' (1, 6) (1, 13)
-    >>> dump_tokens("x = 3.14e159")
+    >>> {dump_tokens}("x = 3.14e159")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'x' (1, 0) (1, 1)
    OP '=' (1, 2) (1, 3)
@@ -227,7 +227,7 @@
 
 String literals
 
-    >>> dump_tokens("x = ''; y = \\\"\\\"")
+    >>> {dump_tokens}("x = ''; y = \\\"\\\"")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'x' (1, 0) (1, 1)
    OP '=' (1, 2) (1, 3)
@@ -236,7 +236,7 @@
    NAME 'y' (1, 8) (1, 9)
    OP '=' (1, 10) (1, 11)
    STRING '""' (1, 12) (1, 14)
-    >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")
+    >>> {dump_tokens}("x = '\\\"'; y = \\\"'\\\"")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'x' (1, 0) (1, 1)
    OP '=' (1, 2) (1, 3)
@@ -245,28 +245,28 @@
    NAME 'y' (1, 9) (1, 10)
    OP '=' (1, 11) (1, 12)
    STRING '"\\'"' (1, 13) (1, 16)
-    >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
+    >>> {dump_tokens}("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'x' (1, 0) (1, 1)
    OP '=' (1, 2) (1, 3)
    STRING '"doesn\\'t "' (1, 4) (1, 14)
    NAME 'shrink' (1, 14) (1, 20)
    STRING '", does it"' (1, 20) (1, 31)
-    >>> dump_tokens("x = 'abc' + 'ABC'")
+    >>> {dump_tokens}("x = 'abc' + 'ABC'")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'x' (1, 0) (1, 1)
    OP '=' (1, 2) (1, 3)
    STRING "'abc'" (1, 4) (1, 9)
    OP '+' (1, 10) (1, 11)
    STRING "'ABC'" (1, 12) (1, 17)
-    >>> dump_tokens('y = "ABC" + "ABC"')
+    >>> {dump_tokens}('y = "ABC" + "ABC"')
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'y' (1, 0) (1, 1)
    OP '=' (1, 2) (1, 3)
    STRING '"ABC"' (1, 4) (1, 9)
    OP '+' (1, 10) (1, 11)
    STRING '"ABC"' (1, 12) (1, 17)
-    >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
+    >>> {dump_tokens}("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'x' (1, 0) (1, 1)
    OP '=' (1, 2) (1, 3)
@@ -277,7 +277,7 @@
    STRING "R'ABC'" (1, 22) (1, 28)
    OP '+' (1, 29) (1, 30)
    STRING "R'ABC'" (1, 31) (1, 37)
-    >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
+    >>> {dump_tokens}('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'y' (1, 0) (1, 1)
    OP '=' (1, 2) (1, 3)
@@ -291,7 +291,7 @@
 
 Operators
 
-    >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
+    >>> {dump_tokens}("def d22(a, b, c=2, d=2, *k): pass")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'def' (1, 0) (1, 3)
    NAME 'd22' (1, 4) (1, 7)
@@ -313,7 +313,7 @@
    OP ')' (1, 26) (1, 27)
    OP ':' (1, 27) (1, 28)
    NAME 'pass' (1, 29) (1, 33)
-    >>> dump_tokens("def d01v_(a=1, *k, **w): pass")
+    >>> {dump_tokens}("def d01v_(a=1, *k, **w): pass")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'def' (1, 0) (1, 3)
    NAME 'd01v_' (1, 4) (1, 9)
@@ -333,7 +333,7 @@
 
 Comparison
 
-    >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
+    >>> {dump_tokens}("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
    ... "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'if' (1, 0) (1, 2)
@@ -371,7 +371,7 @@
 
 Shift
 
-    >>> dump_tokens("x = 1 << 1 >> 5")
+    >>> {dump_tokens}("x = 1 << 1 >> 5")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'x' (1, 0) (1, 1)
    OP '=' (1, 2) (1, 3)
@@ -383,7 +383,7 @@
 
 Additive
 
-    >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
+    >>> {dump_tokens}("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'x' (1, 0) (1, 1)
    OP '=' (1, 2) (1, 3)
@@ -406,7 +406,7 @@
 
 Multiplicative
 
-    >>> dump_tokens("x = 1//1*1/5*12%0x12")
+    >>> {dump_tokens}("x = 1//1*1/5*12%0x12")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'x' (1, 0) (1, 1)
    OP '=' (1, 2) (1, 3)
@@ -424,7 +424,7 @@
 
 Unary
 
-    >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")
+    >>> {dump_tokens}("~1 ^ 1 & 1 |1 ^ -1")
    ENCODING 'utf-8' (0, 0) (0, 0)
    OP '~' (1, 0) (1, 1)
    NUMBER '1' (1, 1) (1, 2)
@@ -437,7 +437,7 @@
    OP '^' (1, 14) (1, 15)
    OP '-' (1, 16) (1, 17)
    NUMBER '1' (1, 17) (1, 18)
-    >>> dump_tokens("-1*1/1+1*1//1 - ---1**1")
+    >>> {dump_tokens}("-1*1/1+1*1//1 - ---1**1")
    ENCODING 'utf-8' (0, 0) (0, 0)
    OP '-' (1, 0) (1, 1)
    NUMBER '1' (1, 1) (1, 2)
@@ -461,7 +461,7 @@
 
 Selector
 
-    >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")
+    >>> {dump_tokens}("import sys, time\\nx = sys.modules['time'].time()")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'import' (1, 0) (1, 6)
    NAME 'sys' (1, 7) (1, 10)
@@ -483,7 +483,7 @@
 
 Methods
 
-    >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")
+    >>> {dump_tokens}("@staticmethod\\ndef foo(x,y): pass")
    ENCODING 'utf-8' (0, 0) (0, 0)
    OP '@' (1, 0) (1, 1)
    NAME 'staticmethod (1, 1) (1, 13)
@@ -535,7 +535,7 @@
 
 Evil tabs
 
-    >>> dump_tokens("def f():\\n\\tif x\\n \\tpass")
+    >>> {dump_tokens}("def f():\\n\\tif x\\n \\tpass")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'def' (1, 0) (1, 3)
    NAME 'f' (1, 4) (1, 5)
@@ -554,7 +554,7 @@
 
 Non-ascii identifiers
 
-    >>> dump_tokens("Örter = 'places'\\ngrün = 'green'")
+    >>> {dump_tokens}("Örter = 'places'\\ngrün = 'green'")
    ENCODING 'utf-8' (0, 0) (0, 0)
    NAME 'Örter' (1, 0) (1, 5)
    OP '=' (1, 6) (1, 7)
@@ -565,21 +565,22 @@
    STRING "'green'" (2, 7) (2, 14)
 """
 
+UTF8_ENCODING = "ENCODING 'utf-8' (0, 0) (0, 0)"
+
 from test import support
 from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                       STRING, ENDMARKER, tok_name, detect_encoding,
-                      open as tokenize_open)
+                      open as tokenize_open, tokenize_basestring)
 from io import BytesIO
 from unittest import TestCase
 import os, sys, glob
 
-def dump_tokens(s):
+def dump_tokens(s, _tokenize):
     """Print out the tokens in s in a table format.
 
     The ENDMARKER is omitted.
""" - f = BytesIO(s.encode('utf-8')) - for type, token, start, end, line in tokenize(f.readline): + for type, token, start, end, line in _tokenize(s): if type == ENDMARKER: break type = tok_name[type] @@ -641,6 +642,24 @@ result.append((toknum, tokval)) return untokenize(result).decode('utf-8') +# Helper functions for testing tokenization of byte IO streams. +def tokenize_bytes_io(s): + f = BytesIO(s.encode('utf-8')) + return tokenize(f.readline) +bytes_io_dump_tokens = lambda s: dump_tokens(s, tokenize_bytes_io) +bytes_io_doctests = doctests_template.format(dump_tokens='bytes_io_dump_tokens') + +# Helper functions for testing tokenization of byte strings. +def tokenize_bytes(s): + return tokenize_basestring(bytes(s, 'utf-8')) +bytes_dump_tokens = lambda s: dump_tokens(s, tokenize_bytes) +bytes_doctests = doctests_template.format(dump_tokens='bytes_dump_tokens') + +# Helper functions for testing tokenization of strings. +str_dump_tokens = lambda s: dump_tokens(s, tokenize_basestring) +str_doctests = doctests_template.format(dump_tokens='str_dump_tokens') +# Strings don't have encodings. +str_doctests = str_doctests.replace('\n ' + UTF8_ENCODING + '\n', '\n') class TestTokenizerAdheresToPep0263(TestCase): """ @@ -923,7 +942,20 @@ self.assertTrue(encoding_used, encoding) -__test__ = {"doctests" : doctests, 'decistmt': decistmt} +class TestTokenizeBasestring(TestCase): + + def test_type_error(self): + with self.assertRaises(TypeError): + tokenize_basestring(187) + with self.assertRaises(TypeError): + tokenize_basestring(list()) + +__test__ = { + "bytes_io_doctests" : bytes_io_doctests, + "bytes_doctests" : bytes_doctests, + "str_doctests" : str_doctests, + 'decistmt': decistmt +} def test_main(): from test import test_tokenize @@ -932,6 +964,7 @@ support.run_unittest(Test_Tokenize) support.run_unittest(TestDetectEncoding) support.run_unittest(TestTokenize) + support.run_unittest(TestTokenizeBasestring) if __name__ == "__main__": test_main() diff -r 3102951cc1ce Lib/tokenize.py --- a/Lib/tokenize.py Wed Aug 31 00:43:55 2011 +0200 +++ b/Lib/tokenize.py Tue Aug 30 23:21:04 2011 -0500 @@ -35,7 +35,8 @@ import token __all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding", - "NL", "untokenize", "ENCODING", "TokenInfo"] + "NL", "untokenize", "ENCODING", "TokenInfo", + "tokenize_basestring"] del token COMMENT = N_TOKENS @@ -525,6 +526,16 @@ yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '') +def tokenize_basestring(source): + import io + + if isinstance(source, str): + return _tokenize(io.StringIO(source).readline, None) + elif isinstance(source, bytes): + return tokenize(io.BytesIO(source).readline) + else: + raise TypeError("expected a 'str' or 'bytes' object") + # An undocumented, backwards compatible, API for all the places in the standard # library that expect to be able to use tokenize with strings def generate_tokens(readline):