diff --git a/Doc/library/tokenize.rst b/Doc/library/tokenize.rst --- a/Doc/library/tokenize.rst +++ b/Doc/library/tokenize.rst @@ -15,6 +15,11 @@ implemented in Python. The scanner in t as well, making it useful for implementing "pretty-printers," including colorizers for on-screen displays. +To simplify token stream handling, all :ref:`operators` and :ref:`delimiters` +tokens are returned using the generic :data:`token.OP` token type. The exact +type can be determined by checking the ``exact_type`` property on the +:term:`named tuple` returned from :func:`tokenize.tokenize`. + Tokenizing Input ---------------- @@ -36,9 +41,17 @@ The primary entry point is a :term:`gene returned as a :term:`named tuple` with the field names: ``type string start end line``. + The returned :term:`named tuple` has an additional property named + ``exact_type`` that contains the exact operator type for + :data:`token.OP` tokens. For all other token types ``exact_type`` + equals the named tuple ``type`` field. + .. versionchanged:: 3.1 Added support for named tuples. + .. versionchanged:: 3.3 + Added support for ``exact_type``. + :func:`tokenize` determines the source encoding of the file by looking for a UTF-8 BOM or encoding cookie, according to :pep:`263`. @@ -131,7 +144,13 @@ It is as simple as: .. code-block:: sh - python -m tokenize [filename.py] + python -m tokenize [-e] [filename.py] + +The following options are accepted: + +.. cmdoption:: -e, --exact + + display token names using the exact type If :file:`filename.py` is specified its contents are tokenized to stdout. Otherwise, tokenization is performed on stdin. @@ -215,3 +234,29 @@ the name of the token, and the final col 4,10-4,11: OP ')' 4,11-4,12: NEWLINE '\n' 5,0-5,0: ENDMARKER '' + +The exact token type names can be displayed using the ``-e`` option: + +.. 
code-block:: sh + + $ python -m tokenize -e hello.py + 0,0-0,0: ENCODING 'utf-8' + 1,0-1,3: NAME 'def' + 1,4-1,13: NAME 'say_hello' + 1,13-1,14: LPAR '(' + 1,14-1,15: RPAR ')' + 1,15-1,16: COLON ':' + 1,16-1,17: NEWLINE '\n' + 2,0-2,4: INDENT ' ' + 2,4-2,9: NAME 'print' + 2,9-2,10: LPAR '(' + 2,10-2,25: STRING '"Hello, World!"' + 2,25-2,26: RPAR ')' + 2,26-2,27: NEWLINE '\n' + 3,0-3,1: NL '\n' + 4,0-4,0: DEDENT '' + 4,0-4,9: NAME 'say_hello' + 4,9-4,10: LPAR '(' + 4,10-4,11: RPAR ')' + 4,11-4,12: NEWLINE '\n' + 5,0-5,0: ENDMARKER '' diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -567,11 +567,12 @@ Non-ascii identifiers from test import support from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP, - STRING, ENDMARKER, tok_name, detect_encoding, + STRING, ENDMARKER, ENCODING, tok_name, detect_encoding, open as tokenize_open) from io import BytesIO from unittest import TestCase import os, sys, glob +import token def dump_tokens(s): """Print out the tokens in s in a table format. 
@@ -922,6 +923,78 @@ class TestTokenize(TestCase): self.assertTrue(encoding_used, encoding) + def assertExactTypeEqual(self, opstr, *optypes): + tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline)) + num_optypes = len(optypes) + self.assertEqual(len(tokens), 2 + num_optypes) + self.assertEqual(token.tok_name[tokens[0].exact_type], + token.tok_name[ENCODING]) + for i in range(num_optypes): + self.assertEqual(token.tok_name[tokens[i + 1].exact_type], + token.tok_name[optypes[i]]) + self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type], + token.tok_name[token.ENDMARKER]) + + def test_exact_type(self): + self.assertExactTypeEqual('()', token.LPAR, token.RPAR) + self.assertExactTypeEqual('[]', token.LSQB, token.RSQB) + self.assertExactTypeEqual(':', token.COLON) + self.assertExactTypeEqual(',', token.COMMA) + self.assertExactTypeEqual(';', token.SEMI) + self.assertExactTypeEqual('+', token.PLUS) + self.assertExactTypeEqual('-', token.MINUS) + self.assertExactTypeEqual('*', token.STAR) + self.assertExactTypeEqual('/', token.SLASH) + self.assertExactTypeEqual('|', token.VBAR) + self.assertExactTypeEqual('&', token.AMPER) + self.assertExactTypeEqual('<', token.LESS) + self.assertExactTypeEqual('>', token.GREATER) + self.assertExactTypeEqual('=', token.EQUAL) + self.assertExactTypeEqual('.', token.DOT) + self.assertExactTypeEqual('%', token.PERCENT) + self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE) + self.assertExactTypeEqual('==', token.EQEQUAL) + self.assertExactTypeEqual('!=', token.NOTEQUAL) + self.assertExactTypeEqual('<=', token.LESSEQUAL) + self.assertExactTypeEqual('>=', token.GREATEREQUAL) + self.assertExactTypeEqual('~', token.TILDE) + self.assertExactTypeEqual('^', token.CIRCUMFLEX) + self.assertExactTypeEqual('<<', token.LEFTSHIFT) + self.assertExactTypeEqual('>>', token.RIGHTSHIFT) + self.assertExactTypeEqual('**', token.DOUBLESTAR) + self.assertExactTypeEqual('+=', token.PLUSEQUAL) + self.assertExactTypeEqual('-=', 
token.MINEQUAL) + self.assertExactTypeEqual('*=', token.STAREQUAL) + self.assertExactTypeEqual('/=', token.SLASHEQUAL) + self.assertExactTypeEqual('%=', token.PERCENTEQUAL) + self.assertExactTypeEqual('&=', token.AMPEREQUAL) + self.assertExactTypeEqual('|=', token.VBAREQUAL) + self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL) + self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL) + self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL) + self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL) + self.assertExactTypeEqual('//', token.DOUBLESLASH) + self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL) + self.assertExactTypeEqual('@', token.AT) + + self.assertExactTypeEqual('a**2+b**2==c**2', + NAME, token.DOUBLESTAR, NUMBER, + token.PLUS, + NAME, token.DOUBLESTAR, NUMBER, + token.EQEQUAL, + NAME, token.DOUBLESTAR, NUMBER) + self.assertExactTypeEqual('{1, 2, 3}', + token.LBRACE, + token.NUMBER, token.COMMA, + token.NUMBER, token.COMMA, + token.NUMBER, + token.RBRACE) + self.assertExactTypeEqual('^(x & 0x1)', + token.CIRCUMFLEX, + token.LPAR, + token.NAME, token.AMPER, token.NUMBER, + token.RPAR) __test__ = {"doctests" : doctests, 'decistmt': decistmt} diff --git a/Lib/tokenize.py b/Lib/tokenize.py --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -45,6 +45,51 @@ tok_name[NL] = 'NL' ENCODING = N_TOKENS + 2 tok_name[ENCODING] = 'ENCODING' N_TOKENS += 3 +EXACT_TOKEN_TYPES = { + '(': LPAR, + ')': RPAR, + '[': LSQB, + ']': RSQB, + ':': COLON, + ',': COMMA, + ';': SEMI, + '+': PLUS, + '-': MINUS, + '*': STAR, + '/': SLASH, + '|': VBAR, + '&': AMPER, + '<': LESS, + '>': GREATER, + '=': EQUAL, + '.': DOT, + '%': PERCENT, + '{': LBRACE, + '}': RBRACE, + '==': EQEQUAL, + '!=': NOTEQUAL, + '<=': LESSEQUAL, + '>=': GREATEREQUAL, + '~': TILDE, + '^': CIRCUMFLEX, + '<<': LEFTSHIFT, + '>>': RIGHTSHIFT, + '**': DOUBLESTAR, + '+=': PLUSEQUAL, + '-=': MINEQUAL, + '*=': STAREQUAL, + '/=': SLASHEQUAL, + '%=': PERCENTEQUAL, 
+ '&=': AMPEREQUAL, + '|=': VBAREQUAL, + '^=': CIRCUMFLEXEQUAL, + '<<=': LEFTSHIFTEQUAL, + '>>=': RIGHTSHIFTEQUAL, + '**=': DOUBLESTAREQUAL, + '//': DOUBLESLASH, + '//=': DOUBLESLASHEQUAL, + '@': AT +} class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')): def __repr__(self): @@ -52,6 +97,13 @@ class TokenInfo(collections.namedtuple(' return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' % self._replace(type=annotated_type)) + @property + def exact_type(self): + if self.type == OP and self.string in EXACT_TOKEN_TYPES: + return EXACT_TOKEN_TYPES[self.string] + else: + return self.type + def group(*choices): return '(' + '|'.join(choices) + ')' def any(*choices): return group(*choices) + '*' def maybe(*choices): return group(*choices) + '?' @@ -549,6 +601,8 @@ def main(): parser.add_argument(dest='filename', nargs='?', metavar='filename.py', help='the file to tokenize; defaults to stdin') + parser.add_argument('-e', '--exact', dest='exact', action='store_true', + help='display token names using the exact type') args = parser.parse_args() try: @@ -563,9 +617,12 @@ def main(): # Output the tokenization for token in tokens: + token_type = token.type + if args.exact: + token_type = token.exact_type token_range = "%d,%d-%d,%d:" % (token.start + token.end) print("%-20s%-15s%-15r" % - (token_range, tok_name[token.type], token.string)) + (token_range, tok_name[token_type], token.string)) except IndentationError as err: line, column = err.args[1][1:3] error(err.args[0], filename, (line, column))