Index: Misc/ACKS =================================================================== --- Misc/ACKS (revision 61560) +++ Misc/ACKS (working copy) @@ -752,3 +752,5 @@ Mike Zarnstorff Siebren van der Zee Uwe Zessin +Trent Nelson +Michael Foord Index: Misc/NEWS =================================================================== --- Misc/NEWS (revision 61560) +++ Misc/NEWS (working copy) @@ -41,6 +41,12 @@ - Issue #1202: zlib.crc32 and zlib.adler32 now return an unsigned value. +- Issue #719888: Updated tokenize to use a bytes API. generate_tokens has been + renamed tokenize and now works with bytes rather than strings. A new + detect_encoding function has been added for determining source file encoding + according to PEP-0263. Token sequences returned by tokenize always start + with an ENCODING token which specifies the encoding used to decode the file. + This token is used to encode the output of untokenize back to bytes. What's New in Python 3.0a3? =========================== @@ -175,7 +181,6 @@ - Issue #1578: Problems in win_getpass. - Build ----- Index: Tools/i18n/pygettext.py =================================================================== --- Tools/i18n/pygettext.py (revision 61560) +++ Tools/i18n/pygettext.py (working copy) @@ -631,7 +631,9 @@ try: eater.set_filename(filename) try: - tokenize.tokenize(fp.readline, eater) + tokens = tokenize.generate_tokens(fp.readline) + for _token in tokens: + eater(*_token) except tokenize.TokenError as e: print('%s: %s, line %d, column %d' % ( e.args[0], filename, e.args[1][0], e.args[1][1]), Index: Tools/scripts/checkappend.py =================================================================== --- Tools/scripts/checkappend.py (revision 61560) +++ Tools/scripts/checkappend.py (working copy) @@ -103,7 +103,9 @@ def run(self): try: - tokenize.tokenize(self.file.readline, self.tokeneater) + tokens = tokenize.generate_tokens(self.file.readline) + for _token in tokens: + self.tokeneater(*_token) except tokenize.TokenError as msg: errprint("%r: Token Error: %s" % (self.fname, msg)) self.nerrors = self.nerrors + 1 Index: Tools/scripts/reindent.py =================================================================== --- Tools/scripts/reindent.py (revision 61560) +++ Tools/scripts/reindent.py (working copy) @@ -173,7 +173,9 @@ self.stats = [] def run(self): - tokenize.tokenize(self.getline, self.tokeneater) + tokens = tokenize.generate_tokens(self.getline) + for _token in tokens: + self.tokeneater(*_token) # Remove trailing empty lines. lines = self.lines while lines and lines[-1] == "\n": Index: Doc/ACKS.txt =================================================================== --- Doc/ACKS.txt (revision 61560) +++ Doc/ACKS.txt (working copy) @@ -209,3 +209,5 @@ * Moshe Zadka * Milan Zamazal * Cheng Zhang + * Trent Nelson + * Michael Foord Index: Doc/library/tokenize.rst =================================================================== --- Doc/library/tokenize.rst (revision 61560) +++ Doc/library/tokenize.rst (working copy) @@ -9,50 +9,34 @@ The :mod:`tokenize` module provides a lexical scanner for Python source code, -implemented in Python. The scanner in this module returns comments as tokens as -well, making it useful for implementing "pretty-printers," including colorizers -for on-screen displays. +implemented in Python. The scanner in this module returns comments as tokens +as well, making it useful for implementing "pretty-printers," including +colorizers for on-screen displays. The primary entry point is a :term:`generator`: -.. 
function:: generate_tokens(readline) +.. function:: tokenize(readline) - The :func:`generate_tokens` generator requires one argument, *readline*, which + The :func:`tokenize` generator requires one argument, *readline*, which must be a callable object which provides the same interface as the :meth:`readline` method of built-in file objects (see section - :ref:`bltin-file-objects`). Each call to the function should return one line of - input as a string. + :ref:`bltin-file-objects`). Each call to the function should return one + line of input as bytes. - The generator produces 5-tuples with these members: the token type; the token - string; a 2-tuple ``(srow, scol)`` of ints specifying the row and column where - the token begins in the source; a 2-tuple ``(erow, ecol)`` of ints specifying - the row and column where the token ends in the source; and the line on which the - token was found. The line passed is the *logical* line; continuation lines are - included. + The generator produces 5-tuples with these members: the token type; the + token string; a 2-tuple ``(srow, scol)`` of ints specifying the row and + column where the token begins in the source; a 2-tuple ``(erow, ecol)`` of + ints specifying the row and column where the token ends in the source; and + the line on which the token was found. The line passed is the *logical* + line; continuation lines are included. + + tokenize determines the source encoding of the file by looking for a utf-8 + bom or encoding cookie, according to :pep:`263`. -An older entry point is retained for backward compatibility: - -.. function:: tokenize(readline[, tokeneater]) - - The :func:`tokenize` function accepts two parameters: one representing the input - stream, and one providing an output mechanism for :func:`tokenize`. - - The first parameter, *readline*, must be a callable object which provides the - same interface as the :meth:`readline` method of built-in file objects (see - section :ref:`bltin-file-objects`). Each call to the function should return one - line of input as a string. Alternately, *readline* may be a callable object that - signals completion by raising :exc:`StopIteration`. - - The second parameter, *tokeneater*, must also be a callable object. It is - called once for each token, with five arguments, corresponding to the tuples - generated by :func:`generate_tokens`. - - All constants from the :mod:`token` module are also exported from -:mod:`tokenize`, as are two additional token type values that might be passed to -the *tokeneater* function by :func:`tokenize`: +:mod:`tokenize`, as are three additional token type values: .. data:: COMMENT @@ -62,55 +46,94 @@ .. data:: NL Token value used to indicate a non-terminating newline. The NEWLINE token - indicates the end of a logical line of Python code; NL tokens are generated when - a logical line of code is continued over multiple physical lines. + indicates the end of a logical line of Python code; NL tokens are generated + when a logical line of code is continued over multiple physical lines. -Another function is provided to reverse the tokenization process. This is useful -for creating tools that tokenize a script, modify the token stream, and write -back the modified script. +.. data:: ENCODING + Token value that indicates the encoding used to decode the source bytes + into text. The first token returned by :func:`tokenize` will always be an + ENCODING token. + + +Another function is provided to reverse the tokenization process. 
This is +useful for creating tools that tokenize a script, modify the token stream, and +write back the modified script. + + .. function:: untokenize(iterable) - Converts tokens back into Python source code. The *iterable* must return - sequences with at least two elements, the token type and the token string. Any - additional sequence elements are ignored. + Converts tokens back into Python source code. The *iterable* must return + sequences with at least two elements, the token type and the token string. + Any additional sequence elements are ignored. + + The reconstructed script is returned as a single string. The result is + guaranteed to tokenize back to match the input so that the conversion is + lossless and round-trips are assured. The guarantee applies only to the + token type and token string as the spacing between tokens (column + positions) may change. + + It returns bytes, encoded using the ENCODING token, which is the first + token sequence output by :func:`tokenize`. - The reconstructed script is returned as a single string. The result is - guaranteed to tokenize back to match the input so that the conversion is - lossless and round-trips are assured. The guarantee applies only to the token - type and token string as the spacing between tokens (column positions) may - change. +:func:`tokenize` needs to detect the encoding of source files it tokenizes. The +function it uses to do this is available: +.. function:: detect_encoding(readline) + + The :func:`detect_encoding` function is used to detect the encoding that + should be used to decode a Python source file. It requires one argument, + readline, in the same way as the :func:`tokenize` generator. + + It will call readline a maximum of twice, and return the encoding used + (as a string) and a list of any lines (not decoded from bytes) it has read + in. + + It detects the encoding from the presence of a utf-8 bom or an encoding + cookie as specified in :pep:`263`. If both a bom and a cookie are present, + but disagree, a SyntaxError will be raised. + + If no encoding is specified, then the default of 'utf-8' will be returned. + + Example of a script re-writer that transforms float literals into Decimal objects:: - def decistmt(s): - """Substitute Decimals for floats in a string of statements. + def decistmt(s): + """Substitute Decimals for floats in a string of statements. + + >>> from decimal import Decimal + >>> s = 'print(+21.3e-5*-.1234/81.7)' + >>> decistmt(s) + "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))" + + The format of the exponent is inherited from the platform C library. + Known cases are "e-007" (Windows) and "e-07" (not Windows). Since + we're only showing 12 digits, and the 13th isn't close to 5, the + rest of the output should be platform-independent. + + >>> exec(s) #doctest: +ELLIPSIS + -3.21716034272e-0...7 + + Output from calculations with Decimal should be identical across all + platforms. + + >>> exec(decistmt(s)) + -3.217160342717258261933904529E-7 + """ + result = [] + g = tokenize(BytesIO(s.encode('utf-8')).readline) # tokenize the string + for toknum, tokval, _, _, _ in g: + if toknum == NUMBER and '.' 
in tokval: # replace NUMBER tokens + result.extend([ + (NAME, 'Decimal'), + (OP, '('), + (STRING, repr(tokval)), + (OP, ')') + ]) + else: + result.append((toknum, tokval)) + return untokenize(result).decode('utf-8') - >>> from decimal import Decimal - >>> s = 'print(+21.3e-5*-.1234/81.7)' - >>> decistmt(s) - "print(+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))" - - >>> exec(s) - -3.21716034272e-007 - >>> exec(decistmt(s)) - -3.217160342717258261933904529E-7 - - """ - result = [] - g = generate_tokens(StringIO(s).readline) # tokenize the string - for toknum, tokval, _, _, _ in g: - if toknum == NUMBER and '.' in tokval: # replace NUMBER tokens - result.extend([ - (NAME, 'Decimal'), - (OP, '('), - (STRING, repr(tokval)), - (OP, ')') - ]) - else: - result.append((toknum, tokval)) - return untokenize(result) - Index: Lib/idlelib/EditorWindow.py =================================================================== --- Lib/idlelib/EditorWindow.py (revision 61560) +++ Lib/idlelib/EditorWindow.py (working copy) @@ -1437,7 +1437,9 @@ _tokenize.tabsize = self.tabwidth try: try: - _tokenize.tokenize(self.readline, self.tokeneater) + tokens = _tokenize.generate_tokens(self.readline) + for token in tokens: + self.tokeneater(*token) except _tokenize.TokenError: # since we cut off the tokenizer early, we can trigger # spurious errors Index: Lib/tokenize.py =================================================================== --- Lib/tokenize.py (revision 61560) +++ Lib/tokenize.py (working copy) @@ -1,8 +1,11 @@ """Tokenization help for Python programs. -generate_tokens(readline) is a generator that breaks a stream of -text into Python tokens. It accepts a readline-like method which is called -repeatedly to get the next line of input (or "" for EOF). It generates +tokenize(readline) is a generator that breaks a stream of +bytes into Python tokens. It decodes the bytes according to +PEP-0263 for determining source file encoding. + +It accepts a readline-like method which is called +repeatedly to get the next line of input (or b"" for EOF). It generates 5-tuples with these members: the token type (see token.py) @@ -13,32 +16,32 @@ It is designed to match the working of the Python tokenizer exactly, except that it produces COMMENT tokens for comments and gives type OP for all -operators +operators. 
Additionally, all token lists start with an ENCODING token +which tells you which encoding was used to decode the bytes stream.""" -Older entry points - tokenize_loop(readline, tokeneater) - tokenize(readline, tokeneater=printtoken) -are the same, except instead of generating tokens, tokeneater is a callback -function to which the 5 fields described above are passed as 5 arguments, -each time a new token is found.""" - __author__ = 'Ka-Ping Yee ' -__credits__ = \ - 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro, Raymond Hettinger' +__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' + 'Skip Montanaro, Raymond Hettinger, Trent Nelson, ' + 'Michael Foord') -import string, re +import re, string, sys from token import * +from codecs import lookup +from itertools import chain, repeat +cookie_re = re.compile("coding[:=]\s*([-\w.]+)") import token __all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize", - "generate_tokens", "NL", "untokenize"] + "detect_encoding", "NL", "untokenize", "ENCODING"] del token COMMENT = N_TOKENS tok_name[COMMENT] = 'COMMENT' NL = N_TOKENS + 1 tok_name[NL] = 'NL' -N_TOKENS += 2 +ENCODING = N_TOKENS + 2 +tok_name[ENCODING] = 'ENCODING' +N_TOKENS += 3 def group(*choices): return '(' + '|'.join(choices) + ')' def any(*choices): return group(*choices) + '*' @@ -132,40 +135,14 @@ class StopTokenizing(Exception): pass -def printtoken(type, token, startrowcol, endrowcol, line): # for testing - (srow, scol), (erow, ecol) = startrowcol, endrowcol - print("%d,%d-%d,%d:\t%s\t%s" % \ - (srow, scol, erow, ecol, tok_name[type], repr(token))) -def tokenize(readline, tokeneater=printtoken): - """ - The tokenize() function accepts two parameters: one representing the - input stream, and one providing an output mechanism for tokenize(). - - The first parameter, readline, must be a callable object which provides - the same interface as the readline() method of built-in file objects. - Each call to the function should return one line of input as a string. - - The second parameter, tokeneater, must also be a callable object. It is - called once for each token, with five arguments, corresponding to the - tuples generated by generate_tokens(). - """ - try: - tokenize_loop(readline, tokeneater) - except StopTokenizing: - pass - -# backwards compatible interface -def tokenize_loop(readline, tokeneater): - for token_info in generate_tokens(readline): - tokeneater(*token_info) - class Untokenizer: def __init__(self): self.tokens = [] self.prev_row = 1 self.prev_col = 0 + self.encoding = None def add_whitespace(self, start): row, col = start @@ -180,6 +157,9 @@ self.compat(t, iterable) break tok_type, token, start, end, line = t + if tok_type == ENCODING: + self.encoding = token + continue self.add_whitespace(start) self.tokens.append(token) self.prev_row, self.prev_col = end @@ -193,12 +173,16 @@ indents = [] toks_append = self.tokens.append toknum, tokval = token + if toknum in (NAME, NUMBER): tokval += ' ' if toknum in (NEWLINE, NL): startline = True for tok in iterable: toknum, tokval = tok[:2] + if toknum == ENCODING: + self.encoding = tokval + continue if toknum in (NAME, NUMBER): tokval += ' ' @@ -216,8 +200,11 @@ startline = False toks_append(tokval) + def untokenize(iterable): """Transform tokens back into Python source code. + It returns a bytes object, encoded using the ENCODING + token, which is the first token sequence output by tokenize. 
Each element returned by the iterable must be a token sequence with at least two elements, a token number and token value. If @@ -227,24 +214,89 @@ Untokenized source will match input source exactly Round-trip invariant for limited intput: - # Output text will tokenize the back to the input - t1 = [tok[:2] for tok in generate_tokens(f.readline)] + # Output bytes will tokenize back to the input + t1 = [tok[:2] for tok in tokenize(f.readline)] newcode = untokenize(t1) - readline = iter(newcode.splitlines(1)).__next__ - t2 = [tok[:2] for tokin generate_tokens(readline)] + readline = BytesIO(newcode).readline + t2 = [tok[:2] for tok in tokenize(readline)] assert t1 == t2 """ ut = Untokenizer() - return ut.untokenize(iterable) + out = ut.untokenize(iterable) + if ut.encoding is not None: + out = out.encode(ut.encoding) + return out -def generate_tokens(readline): + +def detect_encoding(readline): """ - The generate_tokens() generator requires one argment, readline, which + The detect_encoding() function is used to detect the encoding that should + be used to decode a Python source file. It requires one argument, readline, + in the same way as the tokenize() generator. + + It will call readline a maximum of twice, and return the encoding used + (as a string) and a list of any lines (left as bytes) it has read + in. + + It detects the encoding from the presence of a utf-8 bom or an encoding + cookie as specified in pep-0263. If both a bom and a cookie are present, + but disagree, a SyntaxError will be raised. + + If no encoding is specified, then the default of 'utf-8' will be returned. + """ + utf8_bom = b'\xef\xbb\xbf' + bom_found = False + encoding = None + def read_or_stop(): + try: + return readline() + except StopIteration: + return b'' + + def find_cookie(line): + try: + line_string = line.decode('ascii') + except UnicodeDecodeError: + pass + else: + matches = cookie_re.findall(line_string) + if matches: + encoding = matches[0] + if bom_found and lookup(encoding).name != 'utf-8': + # This behaviour mimics the Python interpreter + raise SyntaxError('encoding problem: utf-8') + return encoding + + first = read_or_stop() + if first.startswith(utf8_bom): + bom_found = True + first = first[3:] + if not first: + return 'utf-8', [] + + encoding = find_cookie(first) + if encoding: + return encoding, [first] + + second = read_or_stop() + if not second: + return 'utf-8', [first] + + encoding = find_cookie(second) + if encoding: + return encoding, [first, second] + + return 'utf-8', [first, second] + + +def tokenize(readline): + """ + The tokenize() generator requires one argument, readline, which must be a callable object which provides the same interface as the readline() method of built-in file objects. Each call to the function - should return one line of input as a string. Alternately, readline + should return one line of input as bytes. Alternately, readline can be a callable function terminating with StopIteration: - readline = open(myfile).__next__ # Example of alternate readline + readline = open(myfile, 'rb').__next__ # Example of alternate readline The generator produces 5-tuples with these members: the token type; the token string; a 2-tuple (srow, scol) of ints specifying the row and @@ -252,18 +304,38 @@ ints specifying the row and column where the token ends in the source; and the line on which the token was found. The line passed is the logical line; continuation lines are included. 
+ + The first token sequence will always be an ENCODING token + which tells you which encoding was used to decode the bytes stream. """ + encoding, consumed = detect_encoding(readline) + def readline_generator(): + while True: + try: + yield readline() + except StopIteration: + return + chained = chain(consumed, readline_generator()) + return _tokenize(chained.__next__, encoding) + + +def _tokenize(readline, encoding): lnum = parenlev = continued = 0 namechars, numchars = string.ascii_letters + '_', '0123456789' contstr, needcont = '', 0 contline = None indents = [0] - + + if encoding is not None: + yield (ENCODING, encoding, (0, 0), (0, 0), '') while 1: # loop over lines in stream try: line = readline() except StopIteration: - line = '' + line = b'' + + if encoding is not None: + line = line.decode(encoding) lnum = lnum + 1 pos, max = 0, len(line) @@ -385,7 +457,9 @@ yield (DEDENT, '', (lnum, 0), (lnum, 0), '') yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '') -if __name__ == '__main__': # testing - import sys - if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline) - else: tokenize(sys.stdin.readline) + +# An undocumented, backwards compatible, API for all the places in the standard +# library that expect to be able to use tokenize with strings +def generate_tokens(readline): + return _tokenize(readline, None) + \ No newline at end of file Index: Lib/inspect.py =================================================================== --- Lib/inspect.py (revision 61560) +++ Lib/inspect.py (working copy) @@ -657,7 +657,9 @@ """Extract the block of code at the top of the given list of lines.""" blockfinder = BlockFinder() try: - tokenize.tokenize(iter(lines).__next__, blockfinder.tokeneater) + tokens = tokenize.generate_tokens(iter(lines).__next__) + for _token in tokens: + blockfinder.tokeneater(*_token) except (EndOfBlock, IndentationError): pass return lines[:blockfinder.last] Index: Lib/test/test_tokenize.py =================================================================== --- Lib/test/test_tokenize.py (revision 61560) +++ Lib/test/test_tokenize.py (working copy) @@ -1,13 +1,14 @@ +# -*- coding: utf-8 -*- + doctests = """ Tests for the tokenize module. - >>> import glob, random, sys - The tests can be really simple. Given a small fragment of source code, print out a table with tokens. The ENDMARK is omitted for brevity. >>> dump_tokens("1 + 1") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '1' (1, 0) (1, 1) OP '+' (1, 2) (1, 3) NUMBER '1' (1, 4) (1, 5) @@ -15,6 +16,7 @@ >>> dump_tokens("if False:\\n" ... " # NL\\n" ... " True = False # NEWLINE\\n") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'if' (1, 0) (1, 2) NAME 'False' (1, 3) (1, 8) OP ':' (1, 8) (1, 9) @@ -34,27 +36,12 @@ ... x += 2 ... x += 5 ... \""" - >>> for tok in generate_tokens(StringIO(indent_error_file).readline): pass + >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline + >>> for tok in tokenize(readline): pass Traceback (most recent call last): ... IndentationError: unindent does not match any outer indentation level -Test roundtrip for `untokenize`. `f` is an open file or a string. The source -code in f is tokenized, converted back to source code via tokenize.untokenize(), -and tokenized again from the latter. The test fails if the second tokenization -doesn't match the first. - - >>> def roundtrip(f): - ... if isinstance(f, str): f = StringIO(f) - ... token_list = list(generate_tokens(f.readline)) - ... f.close() - ... tokens1 = [tok[:2] for tok in token_list] - ... new_text = untokenize(tokens1) - ... 
readline = iter(new_text.splitlines(1)).__next__ - ... tokens2 = [tok[:2] for tok in generate_tokens(readline)] - ... return tokens1 == tokens2 - ... - There are some standard formattig practises that are easy to get right. >>> roundtrip("if x == 1:\\n" @@ -67,14 +54,14 @@ Some people use different formatting conventions, which makes untokenize a little trickier. Note that this test involves trailing whitespace after the colon. Note that we use hex escapes to make the -two trailing blanks apperant in the expected output. +two trailing blanks apparent in the expected output. >>> roundtrip("if x == 1 : \\n" ... " print(x)\\n") True >>> f = test_support.findfile("tokenize_tests.txt") - >>> roundtrip(open(f)) + >>> roundtrip(open(f, 'rb')) True >>> roundtrip("if x == 1:\\n" @@ -122,27 +109,33 @@ Ordinary integers and binary operators >>> dump_tokens("0xff <= 255") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0xff' (1, 0) (1, 4) OP '<=' (1, 5) (1, 7) NUMBER '255' (1, 8) (1, 11) >>> dump_tokens("0b10 <= 255") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0b10' (1, 0) (1, 4) OP '<=' (1, 5) (1, 7) NUMBER '255' (1, 8) (1, 11) >>> dump_tokens("0o123 <= 0O123") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0o123' (1, 0) (1, 5) OP '<=' (1, 6) (1, 8) NUMBER '0O123' (1, 9) (1, 14) >>> dump_tokens("1234567 > ~0x15") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '1234567' (1, 0) (1, 7) OP '>' (1, 8) (1, 9) OP '~' (1, 10) (1, 11) NUMBER '0x15' (1, 11) (1, 15) >>> dump_tokens("2134568 != 1231515") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '2134568' (1, 0) (1, 7) OP '!=' (1, 8) (1, 10) NUMBER '1231515' (1, 11) (1, 18) >>> dump_tokens("(-124561-1) & 200000000") + ENCODING 'utf-8' (0, 0) (0, 0) OP '(' (1, 0) (1, 1) OP '-' (1, 1) (1, 2) NUMBER '124561' (1, 2) (1, 8) @@ -152,15 +145,18 @@ OP '&' (1, 12) (1, 13) NUMBER '200000000' (1, 14) (1, 23) >>> dump_tokens("0xdeadbeef != -1") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0xdeadbeef' (1, 0) (1, 10) OP '!=' (1, 11) (1, 13) OP '-' (1, 14) (1, 15) NUMBER '1' (1, 15) (1, 16) >>> dump_tokens("0xdeadc0de & 12345") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0xdeadc0de' (1, 0) (1, 10) OP '&' (1, 11) (1, 12) NUMBER '12345' (1, 13) (1, 18) >>> dump_tokens("0xFF & 0x15 | 1234") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0xFF' (1, 0) (1, 4) OP '&' (1, 5) (1, 6) NUMBER '0x15' (1, 7) (1, 11) @@ -170,18 +166,22 @@ Long integers >>> dump_tokens("x = 0") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '0' (1, 4) (1, 5) >>> dump_tokens("x = 0xfffffffffff") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '0xffffffffff (1, 4) (1, 17) >>> dump_tokens("x = 123141242151251616110") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '123141242151 (1, 4) (1, 25) >>> dump_tokens("x = -15921590215012591") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) OP '-' (1, 4) (1, 5) @@ -190,32 +190,39 @@ Floating point numbers >>> dump_tokens("x = 3.14159") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '3.14159' (1, 4) (1, 11) >>> dump_tokens("x = 314159.") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '314159.' 
(1, 4) (1, 11) >>> dump_tokens("x = .314159") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '.314159' (1, 4) (1, 11) >>> dump_tokens("x = 3e14159") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '3e14159' (1, 4) (1, 11) >>> dump_tokens("x = 3E123") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '3E123' (1, 4) (1, 9) >>> dump_tokens("x+y = 3e-1230") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '+' (1, 1) (1, 2) NAME 'y' (1, 2) (1, 3) OP '=' (1, 4) (1, 5) NUMBER '3e-1230' (1, 6) (1, 13) >>> dump_tokens("x = 3.14e159") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '3.14e159' (1, 4) (1, 12) @@ -223,6 +230,7 @@ String literals >>> dump_tokens("x = ''; y = \\\"\\\"") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) STRING "''" (1, 4) (1, 6) @@ -231,6 +239,7 @@ OP '=' (1, 10) (1, 11) STRING '""' (1, 12) (1, 14) >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) STRING '\\'"\\'' (1, 4) (1, 7) @@ -239,24 +248,28 @@ OP '=' (1, 11) (1, 12) STRING '"\\'"' (1, 13) (1, 16) >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) STRING '"doesn\\'t "' (1, 4) (1, 14) NAME 'shrink' (1, 14) (1, 20) STRING '", does it"' (1, 20) (1, 31) >>> dump_tokens("x = 'abc' + 'ABC'") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) STRING "'abc'" (1, 4) (1, 9) OP '+' (1, 10) (1, 11) STRING "'ABC'" (1, 12) (1, 17) >>> dump_tokens('y = "ABC" + "ABC"') + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'y' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) STRING '"ABC"' (1, 4) (1, 9) OP '+' (1, 10) (1, 11) STRING '"ABC"' (1, 12) (1, 17) >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) STRING "r'abc'" (1, 4) (1, 10) @@ -267,6 +280,7 @@ OP '+' (1, 29) (1, 30) STRING "R'ABC'" (1, 31) (1, 37) >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"') + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'y' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) STRING 'r"abc"' (1, 4) (1, 10) @@ -280,6 +294,7 @@ Operators >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'def' (1, 0) (1, 3) NAME 'd22' (1, 4) (1, 7) OP '(' (1, 7) (1, 8) @@ -301,6 +316,7 @@ OP ':' (1, 27) (1, 28) NAME 'pass' (1, 29) (1, 33) >>> dump_tokens("def d01v_(a=1, *k, **w): pass") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'def' (1, 0) (1, 3) NAME 'd01v_' (1, 4) (1, 9) OP '(' (1, 9) (1, 10) @@ -321,6 +337,7 @@ >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " + ... 
"1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'if' (1, 0) (1, 2) NUMBER '1' (1, 3) (1, 4) OP '<' (1, 5) (1, 6) @@ -357,6 +374,7 @@ Shift >>> dump_tokens("x = 1 << 1 >> 5") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '1' (1, 4) (1, 5) @@ -368,6 +386,7 @@ Additive >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '1' (1, 4) (1, 5) @@ -390,6 +409,7 @@ Multiplicative >>> dump_tokens("x = 1//1*1/5*12%0x12") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '1' (1, 4) (1, 5) @@ -407,6 +427,7 @@ Unary >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1") + ENCODING 'utf-8' (0, 0) (0, 0) OP '~' (1, 0) (1, 1) NUMBER '1' (1, 1) (1, 2) OP '^' (1, 3) (1, 4) @@ -419,6 +440,7 @@ OP '-' (1, 16) (1, 17) NUMBER '1' (1, 17) (1, 18) >>> dump_tokens("-1*1/1+1*1//1 - ---1**1") + ENCODING 'utf-8' (0, 0) (0, 0) OP '-' (1, 0) (1, 1) NUMBER '1' (1, 1) (1, 2) OP '*' (1, 2) (1, 3) @@ -442,6 +464,7 @@ Selector >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'import' (1, 0) (1, 6) NAME 'sys' (1, 7) (1, 10) OP ',' (1, 10) (1, 11) @@ -463,6 +486,7 @@ Methods >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass") + ENCODING 'utf-8' (0, 0) (0, 0) OP '@' (1, 0) (1, 1) NAME 'staticmethod (1, 1) (1, 13) NEWLINE '\\n' (1, 13) (1, 14) @@ -485,42 +509,43 @@ True >>> roundtrip("# Comment \\\\nx = 0") True - - >>> - >>> tempdir = os.path.dirname(f) or os.curdir - >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py")) - >>> if not test_support.is_resource_enabled("compiler"): - ... testfiles = random.sample(testfiles, 10) - ... - >>> for testfile in testfiles: - ... if not roundtrip(open(testfile)): break - ... else: True - True """ - from test import test_support -from tokenize import (tokenize, untokenize, generate_tokens, NUMBER, NAME, OP, - STRING, ENDMARKER, tok_name) -from io import StringIO -import os +from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP, + STRING, ENDMARKER, tok_name, detect_encoding) +from io import BytesIO +from unittest import TestCase +import os, sys, glob def dump_tokens(s): """Print out the tokens in s in a table format. The ENDMARKER is omitted. """ - f = StringIO(s) - for type, token, start, end, line in generate_tokens(f.readline): + f = BytesIO(s.encode('utf-8')) + for type, token, start, end, line in tokenize(f.readline): if type == ENDMARKER: break type = tok_name[type] print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals()) -def roundtrip(s): - f = StringIO(s) - source = untokenize(generate_tokens(f.readline)) - print(source, end="") +def roundtrip(f): + """ + Test roundtrip for `untokenize`. `f` is an open file or a string. + The source code in f is tokenized, converted back to source code via + tokenize.untokenize(), and tokenized again from the latter. The test + fails if the second tokenization doesn't match the first. + """ + if isinstance(f, str): + f = BytesIO(f.encode('utf-8')) + token_list = list(tokenize(f.readline)) + f.close() + tokens1 = [tok[:2] for tok in token_list] + new_bytes = untokenize(tokens1) + readline = (line for line in new_bytes.splitlines(1)).__next__ + tokens2 = [tok[:2] for tok in tokenize(readline)] + return tokens1 == tokens2 # This is an example from the docs, set up as a doctest. 
def decistmt(s): @@ -545,9 +570,8 @@ >>> exec(decistmt(s)) -3.217160342717258261933904529E-7 """ - result = [] - g = generate_tokens(StringIO(s).readline) # tokenize the string + g = tokenize(BytesIO(s.encode('utf-8')).readline) # tokenize the string for toknum, tokval, _, _, _ in g: if toknum == NUMBER and '.' in tokval: # replace NUMBER tokens result.extend([ @@ -558,14 +582,249 @@ ]) else: result.append((toknum, tokval)) - return untokenize(result) + return untokenize(result).decode('utf-8') +class TestTokenizerAdheresToPep0263(TestCase): + """ + Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263. + """ + + def _testFile(self, filename): + path = os.path.join(os.path.dirname(__file__), filename) + return roundtrip(open(path, 'rb')) + + def test_utf8_coding_cookie_and_no_utf8_bom(self): + f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt' + self.assertTrue(self._testFile(f)) + + def test_latin1_coding_cookie_and_utf8_bom(self): + """ + As per PEP 0263, if a file starts with a utf-8 BOM signature, the only + allowed encoding for the comment is 'utf-8'. The text file used in + this test starts with a BOM signature, but specifies latin1 as the + coding, so verify that a SyntaxError is raised, which matches the + behaviour of the interpreter when it encounters a similar condition. + """ + f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt' + self.failUnlessRaises(SyntaxError, self._testFile, f) + + def test_no_coding_cookie_and_utf8_bom(self): + f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt' + self.assertTrue(self._testFile(f)) + + def test_utf8_coding_cookie_and_utf8_bom(self): + f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt' + self.assertTrue(self._testFile(f)) + + +class Test_Tokenize(TestCase): + + def test__tokenize_decodes_with_specified_encoding(self): + literal = '"ЉЊЈЁЂ"' + line = literal.encode('utf-8') + first = False + def readline(): + nonlocal first + if not first: + first = True + return line + else: + return b'' + + # skip the initial encoding token and the end token + tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1] + expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] + self.assertEquals(tokens, expected_tokens, + "bytes not decoded with encoding") + + def test__tokenize_does_not_decode_with_encoding_none(self): + literal = '"ЉЊЈЁЂ"' + first = False + def readline(): + nonlocal first + if not first: + first = True + return literal + else: + return b'' + + # skip the end token + tokens = list(_tokenize(readline, encoding=None))[:-1] + expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] + self.assertEquals(tokens, expected_tokens, + "string not tokenized when encoding is None") + + +class TestDetectEncoding(TestCase): + + def get_readline(self, lines): + index = 0 + def readline(): + nonlocal index + if index == len(lines): + raise StopIteration + line = lines[index] + index += 1 + return line + return readline + + def test_no_bom_no_encoding_cookie(self): + lines = ( + b'# something\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEquals(encoding, 'utf-8') + self.assertEquals(consumed_lines, list(lines[:2])) + + def test_bom_no_cookie(self): + lines = ( + b'\xef\xbb\xbf# something\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEquals(encoding, 'utf-8') + 
self.assertEquals(consumed_lines, + [b'# something\n', b'print(something)\n']) + + def test_cookie_first_line_no_bom(self): + lines = ( + b'# -*- coding: latin-1 -*-\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEquals(encoding, 'latin-1') + self.assertEquals(consumed_lines, [b'# -*- coding: latin-1 -*-\n']) + + def test_matched_bom_and_cookie_first_line(self): + lines = ( + b'\xef\xbb\xbf# coding=utf-8\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEquals(encoding, 'utf-8') + self.assertEquals(consumed_lines, [b'# coding=utf-8\n']) + + def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self): + lines = ( + b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n', + b'print(something)\n', + b'do_something(else)\n' + ) + readline = self.get_readline(lines) + self.assertRaises(SyntaxError, detect_encoding, readline) + + def test_cookie_second_line_no_bom(self): + lines = ( + b'#! something\n', + b'# vim: set fileencoding=ascii :\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEquals(encoding, 'ascii') + expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n'] + self.assertEquals(consumed_lines, expected) + + def test_matched_bom_and_cookie_second_line(self): + lines = ( + b'\xef\xbb\xbf#! something\n', + b'f# coding=utf-8\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEquals(encoding, 'utf-8') + self.assertEquals(consumed_lines, + [b'#! something\n', b'f# coding=utf-8\n']) + + def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self): + lines = ( + b'\xef\xbb\xbf#! 
something\n', + b'# vim: set fileencoding=ascii :\n', + b'print(something)\n', + b'do_something(else)\n' + ) + readline = self.get_readline(lines) + self.assertRaises(SyntaxError, detect_encoding, readline) + + def test_short_files(self): + readline = self.get_readline((b'print(something)\n',)) + encoding, consumed_lines = detect_encoding(readline) + self.assertEquals(encoding, 'utf-8') + self.assertEquals(consumed_lines, [b'print(something)\n']) + + encoding, consumed_lines = detect_encoding(self.get_readline(())) + self.assertEquals(encoding, 'utf-8') + self.assertEquals(consumed_lines, []) + + readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',)) + encoding, consumed_lines = detect_encoding(readline) + self.assertEquals(encoding, 'utf-8') + self.assertEquals(consumed_lines, [b'print(something)\n']) + + readline = self.get_readline((b'\xef\xbb\xbf',)) + encoding, consumed_lines = detect_encoding(readline) + self.assertEquals(encoding, 'utf-8') + self.assertEquals(consumed_lines, []) + + +class TestTokenize(TestCase): + + def test_tokenize(self): + import tokenize as tokenize_module + encoding = object() + encoding_used = None + def mock_detect_encoding(readline): + return encoding, ['first', 'second'] + + def mock__tokenize(readline, encoding): + nonlocal encoding_used + encoding_used = encoding + out = [] + while True: + next_line = readline() + if next_line: + out.append(next_line) + continue + return out + + counter = 0 + def mock_readline(): + nonlocal counter + counter += 1 + if counter == 5: + return b'' + return counter + + orig_detect_encoding = tokenize_module.detect_encoding + orig__tokenize = tokenize_module._tokenize + tokenize_module.detect_encoding = mock_detect_encoding + tokenize_module._tokenize = mock__tokenize + try: + results = tokenize(mock_readline) + self.assertEquals(list(results), ['first', 'second', 1, 2, 3, 4]) + finally: + tokenize_module.detect_encoding = orig_detect_encoding + tokenize_module._tokenize = orig__tokenize + + self.assertTrue(encoding_used, encoding) + + __test__ = {"doctests" : doctests, 'decistmt': decistmt} def test_main(): from test import test_tokenize test_support.run_doctest(test_tokenize, True) + test_support.run_unittest(TestTokenizerAdheresToPep0263) + test_support.run_unittest(Test_Tokenize) + test_support.run_unittest(TestDetectEncoding) + test_support.run_unittest(TestTokenize) if __name__ == "__main__": test_main()
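
For reference, a minimal sketch of how the pieces added by this patch fit together: detect_encoding() reports the encoding and the raw lines it consumed, tokenize() emits an ENCODING token first, and untokenize() uses that token to encode its result back to bytes. The sample source below is made up purely for illustration, and the exact spacing of the untokenize() output may differ from the input:

    from io import BytesIO
    from tokenize import tokenize, untokenize, detect_encoding, ENCODING

    # Source as bytes, with an explicit coding cookie on the first line.
    source = b"# -*- coding: latin-1 -*-\nx = 3.14159\n"

    # detect_encoding() reads at most two lines and returns the encoding
    # plus the (still undecoded) lines it consumed.
    encoding, consumed = detect_encoding(BytesIO(source).readline)
    assert encoding == 'latin-1'
    assert consumed == [b"# -*- coding: latin-1 -*-\n"]

    # tokenize() takes a readline that yields bytes; its first token is
    # always the ENCODING token carrying the detected encoding.
    tokens = list(tokenize(BytesIO(source).readline))
    assert tokens[0][:2] == (ENCODING, 'latin-1')

    # untokenize() picks the encoding up from that token, so full token
    # tuples round-trip back to bytes rather than str.
    assert isinstance(untokenize(tokens), bytes)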