diff --git a/Doc/library/tokenize.rst b/Doc/library/tokenize.rst
--- a/Doc/library/tokenize.rst
+++ b/Doc/library/tokenize.rst
@@ -15,6 +15,9 @@
 as well, making it useful for implementing "pretty-printers," including
 colorizers for on-screen displays.
 
+Tokenizing Input
+----------------
+
 The primary entry point is a :term:`generator`:
 
 .. function:: tokenize(readline)
@@ -116,6 +119,26 @@
 
    .. versionadded:: 3.2
 
+.. _tokenize-cli:
+
+Command-Line Usage
+------------------
+
+.. versionadded:: 3.3
+
+The :mod:`tokenize` module can be executed as a script from the command line.
+It is as simple as:
+
+.. code-block:: sh
+
+   python -m tokenize [filename.py]
+
+If :file:`filename.py` is specified, its contents are tokenized to stdout.
+Otherwise, tokenization is performed on stdin.
+
+Examples
+--------
+
 Example of a script rewriter that transforms float literals into Decimal
 objects::
 
@@ -158,3 +181,37 @@
             result.append((toknum, tokval))
     return untokenize(result).decode('utf-8')
 
+Example of tokenizing from the command line. The script::
+
+    def say_hello():
+        print("Hello, World!")
+
+    say_hello()
+
+will be tokenized to the following output, where the first column is the range
+of the line/column coordinates where the token is found, the second column is
+the name of the token, and the final column is the value of the token (if any):
+
+.. code-block:: sh
+
+    $ python -m tokenize hello.py
+    0,0-0,0:            ENCODING       'utf-8'
+    1,0-1,3:            NAME           'def'
+    1,4-1,13:           NAME           'say_hello'
+    1,13-1,14:          OP             '('
+    1,14-1,15:          OP             ')'
+    1,15-1,16:          OP             ':'
+    1,16-1,17:          NEWLINE        '\n'
+    2,0-2,4:            INDENT         '    '
+    2,4-2,9:            NAME           'print'
+    2,9-2,10:           OP             '('
+    2,10-2,25:          STRING         '"Hello, World!"'
+    2,25-2,26:          OP             ')'
+    2,26-2,27:          NEWLINE        '\n'
+    3,0-3,1:            NL             '\n'
+    4,0-4,0:            DEDENT         ''
+    4,0-4,9:            NAME           'say_hello'
+    4,9-4,10:           OP             '('
+    4,10-4,11:          OP             ')'
+    4,11-4,12:          NEWLINE        '\n'
+    5,0-5,0:            ENDMARKER      ''
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -530,27 +530,54 @@
 def generate_tokens(readline):
     return _tokenize(readline, None)
 
+def main():
+    import argparse
+
+    def error(message, filename=None, location=None):
+        if location:
+            args = (filename,) + location + (message,)
+            print("%s:%d:%d: error: %s" % args)
+        elif filename:
+            print("%s: error: %s" % (filename, message))
+        else:
+            print("error: %s" % message)
+        sys.exit(1)
+
+    # Parse the arguments and options
+    parser = argparse.ArgumentParser(prog='python -m tokenize')
+    parser.add_argument(dest='filename', nargs='?',
+                        metavar='filename.py',
+                        help='the file to tokenize; defaults to stdin')
+    args = parser.parse_args()
+
+    try:
+        # Tokenize the input
+        if args.filename:
+            filename = args.filename
+            with builtins.open(args.filename, 'rb') as f:
+                tokens = list(tokenize(f.readline))
+        else:
+            filename = "<stdin>"
+            tokens = _tokenize(sys.stdin.readline, None)
+
+        # Output the tokenization
+        for token in tokens:
+            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
+            print("%-20s%-15s%-15r" %
+                  (token_range, tok_name[token.type], token.string))
+    except IndentationError as err:
+        line, column = err.args[1][1:3]
+        error(err.args[0], filename, (line, column))
+    except TokenError as err:
+        line, column = err.args[1]
+        error(err.args[0], filename, (line, column))
+    except SyntaxError as err:
+        error(err, filename)
+    except IOError as err:
+        error(err)
+    except:
+        print("unexpected error:", sys.exc_info()[0])
+        raise
+
 if __name__ == "__main__":
-    # Quick sanity check
-    s = b'''def parseline(self, line):
-        """Parse the line into a command name and a string containing
-        the arguments. Returns a tuple containing (command, args, line).
-        'command' and 'args' may be None if the line couldn't be parsed.
-        """
-        line = line.strip()
-        if not line:
-            return None, None, line
-        elif line[0] == '?':
-            line = 'help ' + line[1:]
-        elif line[0] == '!':
-            if hasattr(self, 'do_shell'):
-                line = 'shell ' + line[1:]
-            else:
-                return None, None, line
-        i, n = 0, len(line)
-        while i < n and line[i] in self.identchars: i = i+1
-        cmd, arg = line[:i], line[i:].strip()
-        return cmd, arg, line
-    '''
-    for tok in tokenize(iter(s.splitlines()).__next__):
-        print(tok)
+    main()
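
For anyone trying the patch, here is a minimal sketch of the programmatic
equivalent of ``python -m tokenize hello.py`` using only the module's public
API; it is not part of the patch, and ``hello.py`` is just the illustrative
file from the documentation example above::

    import tokenize
    from token import tok_name

    # Roughly what "python -m tokenize hello.py" prints: the same
    # range/name/value columns produced by main() in the patch.
    # Assumes a file named hello.py exists in the current directory.
    with open("hello.py", "rb") as f:
        for tok in tokenize.tokenize(f.readline):
            token_range = "%d,%d-%d,%d:" % (tok.start + tok.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[tok.type], tok.string))

The file is opened in binary mode because :func:`tokenize.tokenize` expects a
``readline`` that returns bytes (it detects the source encoding itself), which
is also why ``main()`` in the patch uses ``builtins.open(..., 'rb')``.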