diff -r f51921883f50 Doc/library/tokenize.rst
--- a/Doc/library/tokenize.rst	Sun Oct 04 01:19:36 2015 -0400
+++ b/Doc/library/tokenize.rst	Mon Oct 05 09:04:22 2015 +0300
@@ -29,8 +29,9 @@ The primary entry point is a :term:`gene
 
    The :func:`tokenize` generator requires one argument, *readline*, which
    must be a callable object which provides the same interface as the
-   :meth:`io.IOBase.readline` method of file objects. Each call to the
-   function should return one line of input as bytes.
+   :meth:`io.IOBase.readline` or the :meth:`io.TextIOBase.readline` method
+   of file objects. Each call to the function should return one line of input
+   as bytes or text.
 
    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple ``(srow, scol)`` of ints specifying the row and
@@ -52,8 +53,11 @@ The primary entry point is a :term:`gene
    .. versionchanged:: 3.3
       Added support for ``exact_type``.
 
-   :func:`tokenize` determines the source encoding of the file by looking for a
-   UTF-8 BOM or encoding cookie, according to :pep:`263`.
+   .. versionchanged:: 3.6
+      Added support for text input.
+
+   :func:`tokenize` determines the source encoding of binary files by
+   looking for a UTF-8 BOM or encoding cookie, according to :pep:`263`.
 
 
 All constants from the :mod:`token` module are also exported from
@@ -74,8 +78,8 @@ All constants from the :mod:`token` modu
 .. data:: ENCODING
 
    Token value that indicates the encoding used to decode the source bytes
-   into text. The first token returned by :func:`tokenize` will always be an
-   ENCODING token.
+   into text. For binary files, the first token returned by :func:`tokenize`
+   will always be an ENCODING token. For text files, this token is not used.
 
 
 Another function is provided to reverse the tokenization process. This is
@@ -89,15 +93,14 @@ write back the modified script.
    sequences with at least two elements, the token type and the token string.
    Any additional sequence elements are ignored.
 
-   The reconstructed script is returned as a single string. The result is
+   The reconstructed script is returned as a single string or bytes, encoded
+   using the ENCODING token if it is the first token sequence output by
+   :func:`tokenize`. The result is
    guaranteed to tokenize back to match the input so that the conversion is
    lossless and round-trips are assured. The guarantee applies only to the
    token type and token string as the spacing between tokens (column
    positions) may change.
 
-   It returns bytes, encoded using the ENCODING token, which is the first
-   token sequence output by :func:`tokenize`.
-
 
 :func:`tokenize` needs to detect the encoding of source files it tokenizes. The
 function it uses to do this is available:
@@ -108,9 +111,12 @@ function it uses to do this is available
    should be used to decode a Python source file. It requires one argument,
    readline, in the same way as the :func:`tokenize` generator.
 
-   It will call readline a maximum of twice, and return the encoding used
-   (as a string) and a list of any lines (not decoded from bytes) it has read
-   in.
+   If readline returns a string, ``detect_encoding`` returns ``None`` as the
+   encoding and a list containing this string.
+
+   Otherwise, it will call readline a maximum of twice, and return the
+   encoding used (as a string) and a list of any lines (not decoded from
+   bytes) it has read in.
 
    It detects the encoding from the presence of a UTF-8 BOM or an encoding
   cookie as specified in :pep:`263`. If both a BOM and a cookie are present,
@@ -123,6 +129,9 @@ function it uses to do this is available
    Use :func:`open` to open Python source files: it uses
    :func:`detect_encoding` to detect the file encoding.
 
+   .. versionchanged:: 3.6
+      Added support for text input.
+
 
 .. function:: open(filename)
 
@@ -211,7 +220,7 @@ objects::
             -3.217160342717258261933904529E-7
         """
         result = []
-        g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
+        g = tokenize(StringIO(s).readline)  # tokenize the string
        for toknum, tokval, _, _, _ in g:
            if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
                result.extend([
@@ -222,7 +231,7 @@ objects::
                ])
            else:
                result.append((toknum, tokval))
-        return untokenize(result).decode('utf-8')
+        return untokenize(result)
 
 Example of tokenizing from the command line. The script::
 
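(Illustration only, not part of the patch: a minimal sketch of the behaviour the documentation changes above describe, assuming the patched tokenize module. On an unpatched Python, tokenize.tokenize() still requires a bytes readline.)

    import io
    import tokenize

    source = "x = 1 + 2\n"

    # Text readline: no ENCODING token is emitted and untokenize() returns str.
    text_tokens = list(tokenize.tokenize(io.StringIO(source).readline))
    assert text_tokens[0].type != tokenize.ENCODING
    assert isinstance(tokenize.untokenize(text_tokens), str)

    # Bytes readline: unchanged behaviour -- ENCODING comes first and
    # untokenize() returns bytes encoded with that encoding.
    byte_tokens = list(tokenize.tokenize(io.BytesIO(source.encode('utf-8')).readline))
    assert byte_tokens[0].type == tokenize.ENCODING
    assert isinstance(tokenize.untokenize(byte_tokens), bytes)
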
diff -r f51921883f50 Lib/cgitb.py
--- a/Lib/cgitb.py	Sun Oct 04 01:19:36 2015 -0400
+++ b/Lib/cgitb.py	Mon Oct 05 09:04:22 2015 +0300
@@ -80,7 +80,7 @@ def lookup(name, frame, locals):
 def scanvars(reader, frame, locals):
     """Scan one logical line of Python and look up values of variables used."""
     vars, lasttoken, parent, prefix, value = [], None, None, '', __UNDEF__
-    for ttype, token, start, end, line in tokenize.generate_tokens(reader):
+    for ttype, token, start, end, line in tokenize.tokenize(reader):
         if ttype == tokenize.NEWLINE: break
         if ttype == tokenize.NAME and token not in keyword.kwlist:
             if lasttoken == '.':
diff -r f51921883f50 Lib/gettext.py
--- a/Lib/gettext.py	Sun Oct 04 01:19:36 2015 -0400
+++ b/Lib/gettext.py	Mon Oct 05 09:04:22 2015 +0300
@@ -66,7 +66,7 @@ def c2py(plural):
     """
     # Security check, allow only the "n" identifier
     import token, tokenize
-    tokens = tokenize.generate_tokens(io.StringIO(plural).readline)
+    tokens = tokenize.tokenize(io.StringIO(plural).readline)
     try:
         danger = [x for x in tokens if x[0] == token.NAME and x[1] != 'n']
     except tokenize.TokenError:
diff -r f51921883f50 Lib/idlelib/EditorWindow.py
--- a/Lib/idlelib/EditorWindow.py	Sun Oct 04 01:19:36 2015 -0400
+++ b/Lib/idlelib/EditorWindow.py	Mon Oct 05 09:04:22 2015 +0300
@@ -1626,7 +1626,7 @@ class IndentSearcher(object):
         _tokenize.tabsize = self.tabwidth
         try:
             try:
-                tokens = _tokenize.generate_tokens(self.readline)
+                tokens = _tokenize.tokenize(self.readline)
                 for token in tokens:
                     self.tokeneater(*token)
             except (_tokenize.TokenError, SyntaxError):
diff -r f51921883f50 Lib/idlelib/ScriptBinding.py
--- a/Lib/idlelib/ScriptBinding.py	Sun Oct 04 01:19:36 2015 -0400
+++ b/Lib/idlelib/ScriptBinding.py	Mon Oct 05 09:04:22 2015 +0300
@@ -67,7 +67,7 @@ class ScriptBinding:
         # XXX: tabnanny should work on binary files as well
         with tokenize.open(filename) as f:
             try:
-                tabnanny.process_tokens(tokenize.generate_tokens(f.readline))
+                tabnanny.process_tokens(tokenize.tokenize(f.readline))
             except tokenize.TokenError as msg:
                 msgtxt, (lineno, start) = msg.args
                 self.editwin.gotoline(lineno)
diff -r f51921883f50 Lib/inspect.py
--- a/Lib/inspect.py	Sun Oct 04 01:19:36 2015 -0400
+++ b/Lib/inspect.py	Mon Oct 05 09:04:22 2015 +0300
@@ -894,7 +894,7 @@ def getblock(lines):
     """Extract the block of code at the top of the given list of lines."""
     blockfinder = BlockFinder()
     try:
-        tokens = tokenize.generate_tokens(iter(lines).__next__)
+        tokens = tokenize.tokenize(iter(lines).__next__)
         for _token in tokens:
             blockfinder.tokeneater(*_token)
     except (EndOfBlock, IndentationError):
diff -r f51921883f50 Lib/pyclbr.py
--- a/Lib/pyclbr.py	Sun Oct 04 01:19:36 2015 -0400
+++ b/Lib/pyclbr.py	Mon Oct 05 09:04:22 2015 +0300
@@ -158,7 +158,7 @@ def _readmodule(module, path, inpackage=
 
     stack = [] # stack of (class, indent) pairs
 
-    g = tokenize.generate_tokens(f.readline)
+    g = tokenize.tokenize(f.readline)
     try:
         for tokentype, token, start, _end, _line in g:
             if tokentype == DEDENT:
diff -r f51921883f50 Lib/tabnanny.py
--- a/Lib/tabnanny.py	Sun Oct 04 01:19:36 2015 -0400
+++ b/Lib/tabnanny.py	Mon Oct 05 09:04:22 2015 +0300
@@ -103,7 +103,7 @@ def check(file):
         print("checking %r ..." % file)
 
     try:
-        process_tokens(tokenize.generate_tokens(f.readline))
+        process_tokens(tokenize.tokenize(f.readline))
 
     except tokenize.TokenError as msg:
         errprint("%r: Token Error: %s" % (file, msg))
diff -r f51921883f50 Lib/tokenize.py
--- a/Lib/tokenize.py	Sun Oct 04 01:19:36 2015 -0400
+++ b/Lib/tokenize.py	Mon Oct 05 09:04:22 2015 +0300
@@ -318,8 +318,8 @@ class Untokenizer:
 
 def untokenize(iterable):
     """Transform tokens back into Python source code.
-    It returns a bytes object, encoded using the ENCODING
-    token, which is the first token sequence output by tokenize.
+    It returns a string or a bytes object, encoded using the ENCODING
+    token, if it is the first token sequence output by tokenize.
 
     Each element returned by the iterable must be a token sequence
     with at least two elements, a token number and token value. If
@@ -423,6 +423,8 @@ def detect_encoding(readline):
         return encoding
 
     first = read_or_stop()
+    if isinstance(first, str):
+        return None, [first]
     if first.startswith(BOM_UTF8):
         bom_found = True
         first = first[3:]
@@ -468,7 +470,7 @@ def tokenize(readline):
     The tokenize() generator requires one argment, readline, which
     must be a callable object which provides the same interface as the
     readline() method of built-in file objects. Each call to the function
-    should return one line of input as bytes. Alternately, readline
+    should return one line of input as bytes or text. Alternately, readline
     can be a callable function terminating with StopIteration:
         readline = open(myfile, 'rb').__next__  # Example of alternate readline
 
@@ -479,16 +481,16 @@ def tokenize(readline):
     and the line on which the token was found. The line passed is the
     logical line; continuation lines are included.
 
-    The first token sequence will always be an ENCODING token
-    which tells you which encoding was used to decode the bytes stream.
+    If readline() returns bytes the first token sequence will always be an
+    ENCODING token which tells you which encoding was used to decode the bytes
+    stream.
     """
     # This import is here to avoid problems when the itertools module is not
     # built yet and tokenize is imported.
     from itertools import chain, repeat
     encoding, consumed = detect_encoding(readline)
-    rl_gen = iter(readline, b"")
-    empty = repeat(b"")
-    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
+    rl_gen = iter(readline, consumed[:0])
+    return _tokenize(chain(consumed, rl_gen).__next__, encoding)
 
 
 def _tokenize(readline, encoding):
@@ -710,6 +712,8 @@ def _tokenize(readline, encoding):
 # An undocumented, backwards compatible, API for all the places in the standard
 # library that expect to be able to use tokenize with strings
 def generate_tokens(readline):
+    import warnings
+    warnings.warn("use tokenize()", DeprecationWarning, stacklevel=2)
     return _tokenize(readline, None)
 
 def main():
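(Illustration only, not part of the patch: a quick check of the two Lib/tokenize.py behaviour changes above, again assuming the patched module. detect_encoding() short-circuits on text input, and generate_tokens() now emits a DeprecationWarning.)

    import io
    import tokenize
    import warnings

    # Text readline: no encoding is detected; the line already read is handed back undecoded.
    encoding, lines = tokenize.detect_encoding(io.StringIO("pass\n").readline)
    assert encoding is None
    assert lines == ["pass\n"]

    # generate_tokens() still works but warns that tokenize() should be used instead.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        list(tokenize.generate_tokens(io.StringIO("pass\n").readline))
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)
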
diff -r f51921883f50 Lib/trace.py
--- a/Lib/trace.py	Sun Oct 04 01:19:36 2015 -0400
+++ b/Lib/trace.py	Mon Oct 05 09:04:22 2015 +0300
@@ -418,7 +418,7 @@ def _find_strings(filename, encoding=Non
     # Add this special case so that the test in the loop passes.
     prev_ttype = token.INDENT
     with open(filename, encoding=encoding) as f:
-        tok = tokenize.generate_tokens(f.readline)
+        tok = tokenize.tokenize(f.readline)
         for ttype, tstr, start, end, line in tok:
             if ttype == token.STRING:
                 if prev_ttype == token.INDENT:
diff -r f51921883f50 Tools/scripts/cleanfuture.py
--- a/Tools/scripts/cleanfuture.py	Sun Oct 04 01:19:36 2015 -0400
+++ b/Tools/scripts/cleanfuture.py	Mon Oct 05 09:04:22 2015 +0300
@@ -162,7 +162,7 @@ class FutureFinder:
         OP = tokenize.OP
         changed = self.changed
-        get = tokenize.generate_tokens(self.getline).__next__
+        get = tokenize.tokenize(self.getline).__next__
         type, token, (srow, scol), (erow, ecol), line = get()
 
         # Chew up initial comments and blank lines (if any).
diff -r f51921883f50 Tools/scripts/finddiv.py
--- a/Tools/scripts/finddiv.py	Sun Oct 04 01:19:36 2015 -0400
+++ b/Tools/scripts/finddiv.py	Mon Oct 05 09:04:22 2015 +0300
@@ -55,7 +55,7 @@ def process(filename, listnames):
     except IOError as msg:
         sys.stderr.write("Can't open: %s\n" % msg)
         return 1
-    g = tokenize.generate_tokens(fp.readline)
+    g = tokenize.tokenize(fp.readline)
     lastrow = None
     for type, token, (row, col), end, line in g:
         if token in ("/", "/="):
diff -r f51921883f50 Tools/scripts/fixdiv.py
--- a/Tools/scripts/fixdiv.py	Sun Oct 04 01:19:36 2015 -0400
+++ b/Tools/scripts/fixdiv.py	Mon Oct 05 09:04:22 2015 +0300
@@ -214,7 +214,7 @@ def process(filename, list):
     f = FileContext(fp)
     list.sort()
     index = 0 # list[:index] has been processed, list[index:] is still to do
-    g = tokenize.generate_tokens(f.readline)
+    g = tokenize.tokenize(f.readline)
     while 1:
         startlineno, endlineno, slashes = lineinfo = scanline(g)
         if startlineno is None:
diff -r f51921883f50 Tools/scripts/highlight.py
--- a/Tools/scripts/highlight.py	Sun Oct 04 01:19:36 2015 -0400
+++ b/Tools/scripts/highlight.py	Mon Oct 05 09:04:22 2015 +0300
@@ -34,7 +34,7 @@ def analyze_python(source):
     kind = tok_str = ''
     tok_type = tokenize.COMMENT
     written = (1, 0)
-    for tok in tokenize.generate_tokens(readline):
+    for tok in tokenize.tokenize(readline):
         prev_tok_type, prev_tok_str = tok_type, tok_str
         tok_type, tok_str, (srow, scol), (erow, ecol), logical_lineno = tok
         kind = ''
diff -r f51921883f50 Tools/scripts/reindent.py
--- a/Tools/scripts/reindent.py	Sun Oct 04 01:19:36 2015 -0400
+++ b/Tools/scripts/reindent.py	Mon Oct 05 09:04:22 2015 +0300
@@ -195,7 +195,7 @@ class Reindenter:
         self.newlines = f.newlines
 
     def run(self):
-        tokens = tokenize.generate_tokens(self.getline)
+        tokens = tokenize.tokenize(self.getline)
         for _token in tokens:
             self.tokeneater(*_token)
         # Remove trailing empty lines.
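(Illustration only, not part of the patch: the Lib/ and Tools/scripts/ changes above are all the same mechanical substitution. With the patch applied, a caller that used to feed a text readline to the undocumented generate_tokens() can pass it straight to tokenize(); the count_names() helper below is a made-up example of that migration.)

    import io
    import tokenize

    def count_names(source_text):
        """Count NAME tokens in a string of Python source (assumes the patched tokenize)."""
        readline = io.StringIO(source_text).readline
        return sum(1 for tok in tokenize.tokenize(readline)  # was: tokenize.generate_tokens(readline)
                   if tok.type == tokenize.NAME)

    print(count_names("spam = eggs + ham\n"))  # -> 3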