diff -r 854d05c13a8e Lib/test/test_tokenize.py
--- a/Lib/test/test_tokenize.py	Tue Feb 04 18:18:27 2014 +0100
+++ b/Lib/test/test_tokenize.py	Wed Feb 05 01:50:31 2014 +0000
@@ -2,7 +2,7 @@
 Tests for the tokenize module.
 
 The tests can be really simple. Given a small fragment of source
-code, print out a table with tokens. The ENDMARK is omitted for
+code, print out a table with tokens. The ENDMARKER is omitted for
 brevity.
 
     >>> dump_tokens("1 + 1")
@@ -648,6 +648,8 @@
     """Print out the tokens in s in a table format.
 
     The ENDMARKER is omitted.
+
+    Also, check the round-trip property of s.
     """
     f = BytesIO(s.encode('utf-8'))
     for type, token, start, end, line in tokenize(f.readline):
@@ -655,6 +657,7 @@
             break
         type = tok_name[type]
         print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())
+    assert roundtrip(s)
 
 def roundtrip(f):
     """
@@ -669,11 +672,24 @@
         token_list = list(tokenize(f.readline))
     finally:
         f.close()
+
+    # The test is repeated for the two modes of `untokenize`: in
+    # "compatibility" mode we truncate each token to its first two
+    # elements (type, token).
     tokens1 = [tok[:2] for tok in token_list]
     new_bytes = untokenize(tokens1)
-    readline = (line for line in new_bytes.splitlines(keepends=True)).__next__
+    readline = iter(new_bytes.splitlines(keepends=True)).__next__
     tokens2 = [tok[:2] for tok in tokenize(readline)]
-    return tokens1 == tokens2
+
+    # In "full" mode we pass the tokens unchanged.
+    new_bytes = untokenize(token_list)
+    readline = iter(new_bytes.splitlines(keepends=True)).__next__
+    # For the moment we only compare the truncated tokens, leaving
+    # whitespace differences undetected. TODO: test the full tokens
+    # instead.
+    tokens3 = [tok[:2] for tok in tokenize(readline)]
+
+    return tokens1 == tokens2 == tokens3
 
 # This is an example from the docs, set up as a doctest.
 def decistmt(s):
diff -r 854d05c13a8e Lib/tokenize.py
--- a/Lib/tokenize.py	Tue Feb 04 18:18:27 2014 +0100
+++ b/Lib/tokenize.py	Wed Feb 05 01:50:31 2014 +0000
@@ -227,14 +227,21 @@
         self.prev_col = 0
         self.encoding = None
 
-    def add_whitespace(self, start):
+    def add_whitespace(self, start, tok_type, prev_tok_type):
         row, col = start
-        assert row <= self.prev_row
+        assert row >= self.prev_row
+        if (row > self.prev_row
+            and tok_type not in (DEDENT, ENDMARKER)
+            and prev_tok_type not in (NEWLINE, NL)):
+            # Line must have been backslash-continued.
+            self.tokens.append(" \\\n")
+            self.prev_col = 0
         col_offset = col - self.prev_col
         if col_offset:
             self.tokens.append(" " * col_offset)
 
     def untokenize(self, iterable):
+        prev_tok_type = None
         for t in iterable:
             if len(t) == 2:
                 self.compat(t, iterable)
@@ -243,12 +250,13 @@
             if tok_type == ENCODING:
                 self.encoding = token
                 continue
-            self.add_whitespace(start)
+            self.add_whitespace(start, tok_type, prev_tok_type)
             self.tokens.append(token)
             self.prev_row, self.prev_col = end
             if tok_type in (NEWLINE, NL):
                 self.prev_row += 1
                 self.prev_col = 0
+            prev_tok_type = tok_type
         return "".join(self.tokens)
 
     def compat(self, token, iterable):
@@ -298,20 +306,20 @@
     It returns a bytes object, encoded using the ENCODING
     token, which is the first token sequence output by tokenize.
 
-    Each element returned by the iterable must be a token sequence
-    with at least two elements, a token number and token value. If
-    only two tokens are passed, the resulting output is poor.
+    untokenize has two modes of operation. If each input token is a
+    sequence of length 2 (type, token) then the output satisfies the
+    following round-trip property:
 
-    Round-trip invariant for full input:
-        Untokenized source will match input source exactly
+        tokenize(untokenize(tokens)) == tokens
 
-    Round-trip invariant for limited intput:
-        # Output bytes will tokenize the back to the input
-        t1 = [tok[:2] for tok in tokenize(f.readline)]
-        newcode = untokenize(t1)
-        readline = BytesIO(newcode).readline
-        t2 = [tok[:2] for tok in tokenize(readline)]
-        assert t1 == t2
+    If each input token is a sequence of length 5 (type, token, start,
+    end, line), as returned by tokenize, then spaces are added to the
+    result so that each token appears at the position indicated by
+    start and end, if possible. In this mode the output satisfies the
+    above round-trip property, and in addition:
+
+        untokenize(tokenize(source)) == source
+
     """
     ut = Untokenizer()
     out = ut.untokenize(iterable)
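
Below is a minimal sketch (not part of the patch) that exercises the two
round-trip properties described in the new untokenize docstring, using a
backslash-continued line, which is the case the add_whitespace() change is
meant to handle. It assumes the patched tokenize module is importable; the
sample source is arbitrary.

    from io import BytesIO
    from tokenize import tokenize, untokenize

    source = b"x = 1 + \\\n    2\n"

    # Full mode: 5-tuples in; with the patch applied, the original source
    # should come back out exactly.
    tokens = list(tokenize(BytesIO(source).readline))
    assert untokenize(tokens) == source

    # Compatibility mode: 2-tuples in, an equal token stream back out
    # (the spelling of the regenerated source may differ).
    pairs = [tok[:2] for tok in tokens]
    new_source = untokenize(pairs)
    new_pairs = [tok[:2] for tok in tokenize(BytesIO(new_source).readline)]
    assert pairs == new_pairs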