diff -r 1141648fa655 Doc/library/tokenize.rst
--- a/Doc/library/tokenize.rst	Sat Oct 06 13:49:34 2012 +0200
+++ b/Doc/library/tokenize.rst	Sat Oct 06 14:51:00 2012 +0200
@@ -86,14 +86,17 @@ write back the modified script.
 .. function:: untokenize(iterable)
 
     Converts tokens back into Python source code. The *iterable* must return
-    sequences with at least two elements, the token type and the token string.
-    Any additional sequence elements are ignored.
+    sequences with either two or five elements: the token type and the token
+    string, optionally followed by the source location as in the result of
+    :func:`tokenize`.
 
     The reconstructed script is returned as a single string. The result is
     guaranteed to tokenize back to match the input so that the conversion is
     lossless and round-trips are assured. The guarantee applies only to the
-    token type and token string as the spacing between tokens (column
-    positions) may change.
+    token type and token string as the spacing between tokens (column positions)
+    may change. If the iterable provides five elements per token, whitespace
+    in the output will be much closer to the original, but some constructs (like
+    line continuations with backslashes) will not be recovered literally.
 
     It returns bytes, encoded using the ENCODING token, which is the first
     token sequence output by :func:`tokenize`.
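
For illustration only (not part of the patch), here is a minimal sketch of the two calling modes the documentation change above describes. The source string and variable names are made up, and it assumes the patched module:

    # Hypothetical example, assuming the patched tokenize module.
    from io import BytesIO
    from tokenize import tokenize, untokenize

    source = b"if x == 1:\n    print(x)\n"
    tokens = list(tokenize(BytesIO(source).readline))

    # Five-element tuples: whitespace in the output stays close to the input.
    full = untokenize(tokens)

    # Two-element tuples ("compat" mode): spacing may differ, but the result
    # still tokenizes back to the same (type, string) pairs.
    limited = untokenize([tok[:2] for tok in tokens])
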
diff -r 1141648fa655 Lib/test/test_tokenize.py
--- a/Lib/test/test_tokenize.py	Sat Oct 06 13:49:34 2012 +0200
+++ b/Lib/test/test_tokenize.py	Sat Oct 06 14:51:00 2012 +0200
@@ -662,18 +662,29 @@ def roundtrip(f):
     The source code in f is tokenized, converted back to source code via
     tokenize.untokenize(), and tokenized again from the latter. The test
     fails if the second tokenization doesn't match the first.
+
+    We also check that the argument to untokenize() can be an iterator,
+    see bug #8478.
     """
     if isinstance(f, str):
         f = BytesIO(f.encode('utf-8'))
+    code = f.read()
+    readline = (line for line in code.splitlines(keepends=True)).__next__
     try:
-        token_list = list(tokenize(f.readline))
+        token_list = list(tokenize(readline))
     finally:
         f.close()
+    # "compat" mode with only two-element tuples
     tokens1 = [tok[:2] for tok in token_list]
-    new_bytes = untokenize(tokens1)
+    tokeniter = iter(tokens1)
+    new_bytes = untokenize(tokeniter)
     readline = (line for line in new_bytes.splitlines(keepends=True)).__next__
     tokens2 = [tok[:2] for tok in tokenize(readline)]
-    return tokens1 == tokens2
+    # new mode with (more or less) correct whitespace handling
+    new_bytes2 = untokenize(iter(token_list))
+    readline = (line for line in new_bytes2.splitlines(keepends=True)).__next__
+    tokens3 = [tok[:2] for tok in tokenize(readline)]
+    return tokens1 == tokens2 == tokens3
 
 # This is an example from the docs, set up as a doctest.
 def decistmt(s):
diff -r 1141648fa655 Lib/tokenize.py
--- a/Lib/tokenize.py	Sat Oct 06 13:49:34 2012 +0200
+++ b/Lib/tokenize.py	Sat Oct 06 14:51:00 2012 +0200
@@ -31,6 +31,7 @@ from token import *
 from codecs import lookup, BOM_UTF8
 import collections
 from io import TextIOWrapper
+from itertools import chain
 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
 
 import token
@@ -228,15 +229,18 @@ class Untokenizer:
 
     def add_whitespace(self, start):
         row, col = start
-        assert row <= self.prev_row
+        assert row >= self.prev_row
        col_offset = col - self.prev_col
         if col_offset:
             self.tokens.append(" " * col_offset)
 
     def untokenize(self, iterable):
-        for t in iterable:
+        it = iter(iterable)
+        for t in it:
             if len(t) == 2:
-                self.compat(t, iterable)
+                # we've already consumed the first element of the iterator,
+                # so have to supply it back here
+                self.compat(t, chain([t], it))
                 break
             tok_type, token, start, end, line = t
             if tok_type == ENCODING:
@@ -299,10 +303,11 @@ def untokenize(iterable):
 
     Each element returned by the iterable must be a token sequence
     with at least two elements, a token number and token value.  If
-    only two tokens are passed, the resulting output is poor.
+    only two elements are passed, the resulting output is poor.
 
     Round-trip invariant for full input:
-        Untokenized source will match input source exactly
+        Untokenized source will match input source up to backslash
+        continuations
 
     Round-trip invariant for limited intput:
         # Output bytes will tokenize the back to the input
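
For completeness, a minimal sketch (also not part of the patch) of the round-trip exercised by the new test, passing an iterator as in bug #8478. Names are illustrative and it assumes the patched module:

    # Hypothetical example, assuming the patched tokenize module.
    from io import BytesIO
    from tokenize import tokenize, untokenize

    source = b"print('hello')\n"
    token_list = list(tokenize(BytesIO(source).readline))
    tokens1 = [tok[:2] for tok in token_list]

    # Passing an iterator rather than a list: the loop in
    # Untokenizer.untokenize() has already consumed the first element
    # before delegating to compat(), so the patch feeds it back via
    # chain([t], it).
    new_bytes = untokenize(iter(tokens1))

    readline = iter(new_bytes.splitlines(keepends=True)).__next__
    tokens2 = [tok[:2] for tok in tokenize(readline)]
    assert tokens1 == tokens2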