diff -r 1b8ba1346e67 Doc/library/tokenize.rst
--- a/Doc/library/tokenize.rst Tue Feb 04 23:02:36 2014 +1000
+++ b/Doc/library/tokenize.rst Wed Feb 05 10:52:45 2014 +0000
@@ -85,18 +85,29 @@
 
 .. function:: untokenize(iterable)
 
-   Converts tokens back into Python source code. The *iterable* must return
-   sequences with at least two elements, the token type and the token string.
-   Any additional sequence elements are ignored.
+   Convert tokens back into Python source code.
+   The *iterable* must yield sequences of tokens. The reconstructed
+   source code is returned as a bytes object, encoded using the
+   ENCODING token, which is the first token output by :func:`tokenize`.
 
-   The reconstructed script is returned as a single string. The result is
-   guaranteed to tokenize back to match the input so that the conversion is
-   lossless and round-trips are assured. The guarantee applies only to the
-   token type and token string as the spacing between tokens (column
-   positions) may change.
+   :func:`untokenize` has two modes of operation. In the first mode, each
+   input token must be a sequence of length 2, whose members are the
+   token type and the token string. In this mode, the reconstructed
+   source code is guaranteed to tokenize back to match the input, so
+   that the conversion is lossless and round-trips are assured. This
+   guarantee applies only to the token type and token string, as the
+   spacing between tokens (column positions) may change.
 
-   It returns bytes, encoded using the ENCODING token, which is the first
-   token sequence output by :func:`tokenize`.
+   Otherwise, each input token must be a sequence of length 5 with
+   these members: the token type; the token string; a 2-tuple ``(srow,
+   scol)`` of ints specifying the row and column where the token begins
+   in the source; a 2-tuple ``(erow, ecol)`` of ints specifying the row
+   and column where the token ends in the source; and the line on
+   which the token was found, as returned by :func:`tokenize`. In this mode
+   of operation, spaces are added to the result so that each token
+   appears at the given row and column, if possible. The output
+   satisfies the above round-trip property, and in addition, for
+   valid Python source code, ``untokenize(tokenize(source)) == source``.
 
 
 :func:`tokenize` needs to detect the encoding of source files it tokenizes. The
diff -r 1b8ba1346e67 Lib/test/test_tokenize.py
--- a/Lib/test/test_tokenize.py Tue Feb 04 23:02:36 2014 +1000
+++ b/Lib/test/test_tokenize.py Wed Feb 05 10:52:45 2014 +0000
@@ -2,7 +2,7 @@
 Tests for the tokenize module.
 
 The tests can be really simple. Given a small fragment of source
-code, print out a table with tokens. The ENDMARK is omitted for
+code, print out a table with tokens. The ENDMARKER is omitted for
 brevity.
 
     >>> dump_tokens("1 + 1")
@@ -558,18 +558,36 @@
 
 Backslash means line continuation, except for comments
 
-    >>> roundtrip("x=1+\\\\n"
-    ...           "1\\n"
-    ...           "# This is a comment\\\\n"
-    ...           "# This also\\n")
-    True
-    >>> roundtrip("# Comment \\\\nx = 0")
-    True
+    >>> dump_tokens("x=1+\\\\\\n"
+    ...             "1\\n"
+    ...             "# This is a comment\\\\\\n"
+    ...             "# This also\\n")
"# This also\\n") + ENCODING 'utf-8' (0, 0) (0, 0) + NAME 'x' (1, 0) (1, 1) + OP '=' (1, 1) (1, 2) + NUMBER '1' (1, 2) (1, 3) + OP '+' (1, 3) (1, 4) + NUMBER '1' (2, 0) (2, 1) + NEWLINE '\\n' (2, 1) (2, 2) + COMMENT '# This is a (3, 0) (3, 20) + NL '\\n' (3, 20) (3, 21) + COMMENT '# This also' (4, 0) (4, 11) + NL '\\n' (4, 11) (4, 12) + + >>> dump_tokens("# Comment \\\\\\nx = 0") + ENCODING 'utf-8' (0, 0) (0, 0) + COMMENT '# Comment \\\\ (1, 0) (1, 11) + NL '\\n' (1, 11) (1, 12) + NAME 'x' (2, 0) (2, 1) + OP '=' (2, 2) (2, 3) + NUMBER '0' (2, 4) (2, 5) Two string literals on the same line - >>> roundtrip("'' ''") - True + >>> dump_tokens("'' ''") + ENCODING 'utf-8' (0, 0) (0, 0) + STRING "''" (1, 0) (1, 2) + STRING "''" (1, 3) (1, 5) Test roundtrip on random python modules. pass the '-ucpu' option to process the full directory. @@ -648,6 +666,8 @@ """Print out the tokens in s in a table format. The ENDMARKER is omitted. + + Also, check the round-trip property of s. """ f = BytesIO(s.encode('utf-8')) for type, token, start, end, line in tokenize(f.readline): @@ -655,6 +675,7 @@ break type = tok_name[type] print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals()) + assert roundtrip(s) def roundtrip(f): """ @@ -669,11 +690,24 @@ token_list = list(tokenize(f.readline)) finally: f.close() + + # The test is repeated for the two modes of `untokenize`: in + # "compatibility" mode we truncate each token to its first two + # elements (type, token). tokens1 = [tok[:2] for tok in token_list] new_bytes = untokenize(tokens1) - readline = (line for line in new_bytes.splitlines(keepends=True)).__next__ + readline = iter(new_bytes.splitlines(keepends=True)).__next__ tokens2 = [tok[:2] for tok in tokenize(readline)] - return tokens1 == tokens2 + + # In "full" mode we pass the tokens unchanged. + new_bytes = untokenize(token_list) + readline = iter(new_bytes.splitlines(keepends=True)).__next__ + # For the moment we only compare the truncated tokens, leaving + # whitespace differences undetected. TODO: test the full tokens + # instead. + tokens3 = [tok[:2] for tok in tokenize(readline)] + + return tokens1 == tokens2 == tokens3 # This is an example from the docs, set up as a doctest. def decistmt(s): diff -r 1b8ba1346e67 Lib/tokenize.py --- a/Lib/tokenize.py Tue Feb 04 23:02:36 2014 +1000 +++ b/Lib/tokenize.py Wed Feb 05 10:52:45 2014 +0000 @@ -227,14 +227,21 @@ self.prev_col = 0 self.encoding = None - def add_whitespace(self, start): + def add_whitespace(self, start, tok_type, prev_tok_type): row, col = start - assert row <= self.prev_row + assert row >= self.prev_row + if (row > self.prev_row + and tok_type not in (DEDENT, ENDMARKER) + and prev_tok_type not in (NEWLINE, NL)): + # Line must have been backslash-continued. + self.tokens.append(" \\\n") + self.prev_col = 0 col_offset = col - self.prev_col if col_offset: self.tokens.append(" " * col_offset) def untokenize(self, iterable): + prev_tok_type = None for t in iterable: if len(t) == 2: self.compat(t, iterable) @@ -243,12 +250,13 @@ if tok_type == ENCODING: self.encoding = token continue - self.add_whitespace(start) + self.add_whitespace(start, tok_type, prev_tok_type) self.tokens.append(token) self.prev_row, self.prev_col = end if tok_type in (NEWLINE, NL): self.prev_row += 1 self.prev_col = 0 + prev_tok_type = tok_type return "".join(self.tokens) def compat(self, token, iterable): @@ -294,24 +302,30 @@ def untokenize(iterable): - """Transform tokens back into Python source code. 
-    It returns a bytes object, encoded using the ENCODING
-    token, which is the first token sequence output by tokenize.
+    """Convert tokens back into Python source code.
+    The iterable must yield sequences of tokens. The reconstructed
+    source code is returned as a bytes object, encoded using the
+    ENCODING token, which is the first token output by tokenize().
 
-    Each element returned by the iterable must be a token sequence
-    with at least two elements, a token number and token value. If
-    only two tokens are passed, the resulting output is poor.
+    untokenize() has two modes of operation. In the first mode, each
+    input token must be a sequence of length 2, whose members are the
+    token type and the token string. In this mode, the reconstructed
+    source code is guaranteed to tokenize back to match the input, so
+    that the conversion is lossless and round-trips are assured. This
+    guarantee applies only to the token type and token string, as the
+    spacing between tokens (column positions) may change.
 
-    Round-trip invariant for full input:
-        Untokenized source will match input source exactly
+    Otherwise, each input token must be a sequence of length 5 with
+    these members: the token type; the token string; a 2-tuple (srow,
+    scol) of ints specifying the row and column where the token begins
+    in the source; a 2-tuple (erow, ecol) of ints specifying the row
+    and column where the token ends in the source; and the line on
+    which the token was found, as returned by tokenize(). In this mode
+    of operation, spaces are added to the result so that each token
+    appears at the given row and column, if possible. The output
+    satisfies the above round-trip property, and in addition, for
+    valid Python source code, untokenize(tokenize(source)) == source.
 
-    Round-trip invariant for limited intput:
-        # Output bytes will tokenize the back to the input
-        t1 = [tok[:2] for tok in tokenize(f.readline)]
-        newcode = untokenize(t1)
-        readline = BytesIO(newcode).readline
-        t2 = [tok[:2] for tok in tokenize(readline)]
-        assert t1 == t2
     """
     ut = Untokenizer()
     out = ut.untokenize(iterable)
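
Reviewer's sketch (not part of the patch): the snippet below exercises the two
modes of untokenize() that the revised documentation describes. It assumes the
patched tokenize module is the one imported; tokens_of() is a local helper
written for this note, and the example source strings are arbitrary. The
second assertion covers a backslash-continued line and therefore depends on
the new add_whitespace() logic; the unpatched Untokenizer would fail there,
since its old assertion (row <= prev_row) rejects tokens that start on a
later row.

    from io import BytesIO
    from tokenize import tokenize, untokenize

    def tokens_of(source):
        # Helper (not part of tokenize): fully tokenize a bytes object.
        return list(tokenize(BytesIO(source).readline))

    # Full mode: 5-tuples carry start/end positions, so untokenize() can
    # restore the original spacing and, per the revised docstring, the
    # source round-trips exactly.
    plain = b"x = 1\nif x:\n    y = 2  # comment\n"
    assert untokenize(tokens_of(plain)) == plain

    # Backslash-continued line: the exact round trip relies on the patched
    # add_whitespace(), which emits " \\\n" when a token starts on a new
    # row without an intervening NEWLINE or NL token.
    continued = b"x = 1 + \\\n    2\n"
    assert untokenize(tokens_of(continued)) == continued

    # Compatibility mode: 2-tuples of (type, string) only guarantee that
    # re-tokenizing the output reproduces the same (type, string) pairs;
    # column positions may change.
    toks = tokens_of(plain)
    compat_bytes = untokenize([tok[:2] for tok in toks])
    assert [tok[:2] for tok in tokens_of(compat_bytes)] == [tok[:2] for tok in toks]

This mirrors what the updated roundtrip() helper in test_tokenize.py checks,
except that the full-mode output is compared byte-for-byte here rather than
token-by-token.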