diff -r 6d278f426417 Doc/library/tokenize.rst
--- a/Doc/library/tokenize.rst Fri Jul 05 18:05:29 2013 -1000
+++ b/Doc/library/tokenize.rst Sun Jul 07 13:18:24 2013 +0200
@@ -87,7 +87,10 @@
 
    Converts tokens back into Python source code.  The *iterable* must return
    sequences with at least two elements, the token type and the token string.
-   Any additional sequence elements are ignored.
+   If five elements are present, it is assumed that they are formatted like
+   the output of :func:`tokenize` and the additional elements are used to
+   restore the original whitespace.  Otherwise, any additional sequence
+   elements are ignored.
 
    The reconstructed script is returned as a single string.  The result is
    guaranteed to tokenize back to match the input so that the conversion is
@@ -96,7 +99,8 @@
    positions) may change.
 
    It returns bytes, encoded using the ENCODING token, which is the first
-   token sequence output by :func:`tokenize`.
+   token sequence output by :func:`tokenize`.  If the ENCODING token is not
+   present, the output is encoded using 'utf-8'.
 
 
 :func:`tokenize` needs to detect the encoding of source files it tokenizes.  The
diff -r 6d278f426417 Lib/test/test_tokenize.py
--- a/Lib/test/test_tokenize.py Fri Jul 05 18:05:29 2013 -1000
+++ b/Lib/test/test_tokenize.py Sun Jul 07 13:18:24 2013 +0200
@@ -641,7 +641,7 @@
                      open as tokenize_open)
 from io import BytesIO
 from unittest import TestCase
-import os, sys, glob
+import os, re, sys, glob
 import token
 
 def dump_tokens(s):
@@ -1113,7 +1113,139 @@
         # See http://bugs.python.org/issue16152
         self.assertExactTypeEqual('@ ', token.AT)
 
-__test__ = {"doctests" : doctests, 'decistmt': decistmt}
+
+class TestUntokenize(TestCase):
+
+    def get_tokens(self, source):
+        self.assertIsInstance(source, bytes)
+        readline = BytesIO(source).readline
+        tokens = tokenize(readline)
+        return tokens
+
+    def get_tokens_compat(self, source, tuple_elements):
+        tokens = self.get_tokens(source)
+        return (token[:tuple_elements] for token in tokens)
+
+    def checkUntokenize(self, input_, tuple_elements=None, pattern=None,
+                        drop_enc=False):
+        tokens = self.get_tokens(input_)
+        if tuple_elements is not None:
+            tokens = (tuple_elements(token) for token in tokens)
+        if drop_enc:  # Skip the ENCODING token.
+            encoding_token = next(tokens)
+            self.assertEqual(encoding_token[0], ENCODING)
+            self.assertEqual(encoding_token[1], 'utf-8')
+        source = untokenize(tokens)
+        self.assertIsInstance(source, bytes)
+        # The 'compat' implementation (2-element tuples as input) produces
+        # different whitespace than the original source file, so a regexp
+        # is used to compare the output.
+        if pattern is None:
+            self.assertEqual(source, input_)
+        else:
+            self.assertRegex(source.decode('utf-8'), pattern)
+
+    def assertUntokenizeResult(self, input_, pattern):
+        # Run against original tokens.
+        self.checkUntokenize(input_)
+        self.checkUntokenize(input_, drop_enc=True)
+
+        # Run against 2-element tokens.
+        tuple_elements = lambda token: token[:2]
+        self.checkUntokenize(input_, tuple_elements=tuple_elements,
+                             pattern=pattern)
+        self.checkUntokenize(input_, tuple_elements=tuple_elements,
+                             pattern=pattern, drop_enc=True)
+
+        # Run against tokens of unexpected length.
+        tuple_elements = lambda token: token[:2] + (None, )
+        self.checkUntokenize(input_, tuple_elements=tuple_elements,
+                             pattern=pattern)
+        self.checkUntokenize(input_, tuple_elements=tuple_elements,
+                             pattern=pattern, drop_enc=True)
+
+        tuple_elements = lambda token: token[:2] + (None, ) * 2
+        self.checkUntokenize(input_, tuple_elements=tuple_elements,
+                             pattern=pattern)
+        self.checkUntokenize(input_, tuple_elements=tuple_elements,
+                             pattern=pattern, drop_enc=True)
+
+        tuple_elements = lambda token: token[:2] + (None, ) * 4
+        self.checkUntokenize(input_, tuple_elements=tuple_elements,
+                             pattern=pattern)
+        self.checkUntokenize(input_, tuple_elements=tuple_elements,
+                             pattern=pattern, drop_enc=True)
+
+        tuple_elements = lambda token: token[:2] + (None, ) * 5
+        self.checkUntokenize(input_, tuple_elements=tuple_elements,
+                             pattern=pattern)
+        self.checkUntokenize(input_, tuple_elements=tuple_elements,
+                             pattern=pattern, drop_enc=True)
+
+    def test_untokenize_simple(self):
+        input_ = b"1 + 2"
+        pattern = re.compile(r'^1\ ?\+\ ?2\ ?$')
+        self.assertUntokenizeResult(input_, pattern)
+
+    def test_untokenize_medium(self):
+        input_ = (
+            b"def div(a, b):\n"
+            b"    if b == 0:\n"
+            b"        return None\n"
+            b"    else:\n"
+            b"        return a / b\n")
+        pattern = re.compile(
+            r"^def div\ ?\(a\ ?,\ ?b\ ?\):\n"
+            r"    if b\ ?==\ ?0\ ?:\n"
+            r"        return None\ ?\n"
+            r"    else\ ?:\n"
+            r"        return a\ ?/\ ?b\ ?\n$")
+        self.assertUntokenizeResult(input_, pattern)
+
+    def test_untokenize_complex(self):
+        input_ = (
+            b'#!/usr/bin/env python3\n'
+            b'# -*- coding: utf-8 -*-\n'
+            b'import functools\n'
+            b'\n'
+            b'class Example:\n'
+            b'    """docstring"""\n'
+            b'    def __init__(self, prop):\n'
+            b'        self.prop = prop\n'
+            b'\n'
+            b'    def __call__(self, func):\n'
+            b'        @functools.wraps(func)\n'
+            b'        def wrapper(param):\n'
+            b'            # Comment.\n'
+            b'            if param:\n'
+            b'                return param\n'
+            b'            else:\n'
+            b'                return -1\n'
+            b'        return wrapper\n')
+        pattern = re.compile(
+            r'^#!/usr/bin/env python3\n'
+            r'# -\*- coding: utf-8 -\*-\n'
+            r'import functools\ ?\n'
+            r'\n'
+            r'class Example\ ?:\n'
+            r'    """docstring"""\n'
+            r'    def __init__\ ?\(self\ ?,\ ?prop\ ?\):\n'
+            r'        self\ ?\.prop\ ?=\ ?prop\ ?\n'
+            r'\n'
+            r'    def __call__\ ?\(self\ ?,\ ?func\ ?\):\n'
+            r'        @functools\ ?\.wraps\ ?\(func\ ?\)\n'
+            r'        def wrapper\ ?\(param\ ?\):\n'
+            r'\ *# Comment\.\n'
+            r'            if param\ ?:\n'
+            r'                return param\ ?\n'
+            r'            else\ ?:\n'
+            r'                return -1\ ?\n'
+            r'        return wrapper\ ?\n$')
+        self.assertUntokenizeResult(input_, pattern)
+
+
+__test__ = {"doctests": doctests, 'decistmt': decistmt}
+
 
 def test_main():
     from test import test_tokenize
@@ -1122,6 +1254,7 @@
     support.run_unittest(Test_Tokenize)
     support.run_unittest(TestDetectEncoding)
     support.run_unittest(TestTokenize)
+    support.run_unittest(TestUntokenize)
 
 if __name__ == "__main__":
     test_main()
diff -r 6d278f426417 Lib/tokenize.py
--- a/Lib/tokenize.py Fri Jul 05 18:05:29 2013 -1000
+++ b/Lib/tokenize.py Sun Jul 07 13:18:24 2013 +0200
@@ -25,6 +25,7 @@
                'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
                'Michael Foord')
 import builtins
+import itertools
 import re
 import sys
 from token import *
@@ -228,14 +229,14 @@
 
     def add_whitespace(self, start):
         row, col = start
-        assert row <= self.prev_row
+        assert row >= self.prev_row
         col_offset = col - self.prev_col
         if col_offset:
             self.tokens.append(" " * col_offset)
 
     def untokenize(self, iterable):
         for t in iterable:
-            if len(t) == 2:
+            if len(t) != 5:
                 self.compat(t, iterable)
                 break
             tok_type, token, start, end, line = t
@@ -254,14 +255,10 @@
         startline = False
         indents = []
         toks_append = self.tokens.append
-        toknum, tokval = token
-
-        if toknum in (NAME, NUMBER):
-            tokval += ' '
-        if toknum in (NEWLINE, NL):
-            startline = True
         prevstring = False
-        for tok in iterable:
+        # We still need to process the first token, which was already read.
+        for tok in itertools.chain([token], iterable):
             toknum, tokval = tok[:2]
             if toknum == ENCODING:
                 self.encoding = tokval
@@ -296,10 +293,14 @@
     """Transform tokens back into Python source code.
     It returns a bytes object, encoded using the ENCODING
     token, which is the first token sequence output by tokenize.
+    If the ENCODING token is not present, the output is encoded with 'utf-8'.
 
     Each element returned by the iterable must be a token sequence
-    with at least two elements, a token number and token value.  If
-    only two tokens are passed, the resulting output is poor.
+    with at least two elements, a token number and token value.
+    If five elements are present, it is assumed that they are formatted like
+    the output of tokenize and the additional elements are used to restore
+    the original whitespace.  If a different number of elements is present,
+    the resulting output has poor whitespace.
 
     Round-trip invariant for full input:
         Untokenized source will match input source exactly
@@ -314,9 +315,10 @@
     """
     ut = Untokenizer()
     out = ut.untokenize(iterable)
-    if ut.encoding is not None:
-        out = out.encode(ut.encoding)
-    return out
+
+    # Use utf-8 as the default encoding if no ENCODING token was provided.
+    encoding = ut.encoding if ut.encoding is not None else 'utf-8'
+    return out.encode(encoding)
 
 
 def _get_normal_name(orig_enc):
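
A quick illustration of the round-trip behaviour the patch documents. This is an informal sketch, not part of the patch itself; it only relies on the documented guarantees (full 5-tuples restore whitespace exactly, shorter tuples fall back to the compat path, and the result re-tokenizes to the same token types and strings):

    # Sketch: round-tripping source through tokenize()/untokenize().
    from io import BytesIO
    from tokenize import tokenize, untokenize

    source = b"1 + 2\n"
    tokens = list(tokenize(BytesIO(source).readline))

    # Full token tuples: original whitespace is restored exactly.
    assert untokenize(tokens) == source

    # 2-element tuples: spacing may differ, but the output tokenizes back
    # to the same (type, string) pairs, as the docs guarantee.
    compact = [(tok.type, tok.string) for tok in tokens]
    roundtrip = untokenize(compact)
    assert ([tok[:2] for tok in tokenize(BytesIO(roundtrip).readline)]
            == [tok[:2] for tok in tokens])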