diff -r 74e79b2c114a Doc/library/tokenize.rst
--- a/Doc/library/tokenize.rst Fri Aug 05 23:05:35 2011 +0200
+++ b/Doc/library/tokenize.rst Fri Aug 05 23:50:26 2011 +0100
@@ -67,21 +67,27 @@
    write back the modified script.
 
 
-.. function:: untokenize(iterable)
+.. function:: untokenize(tokens)
 
-   Converts tokens back into Python source code. The *iterable* must return
-   sequences with at least two elements, the token type and the token string.
-   Any additional sequence elements are ignored.
+   Convert *tokens* (a sequence of tokens) back into Python source
+   code. Return a bytes object, encoded using the encoding specified by
+   the last ENCODING token in *tokens*, or UTF-8 if no ENCODING token
+   is found.
 
-   The reconstructed script is returned as a single string. The result is
-   guaranteed to tokenize back to match the input so that the conversion is
-   lossless and round-trips are assured. The guarantee applies only to the
-   token type and token string as the spacing between tokens (column
-   positions) may change.
+   The result satisfies the following roundtrip property for any list
+   of tokens `T`::
 
-   It returns bytes, encoded using the ENCODING token, which is the first
-   token sequence output by :func:`tokenize`.
+      >>> t = tokenize(io.BytesIO(untokenize(T)).readline)
+      >>> assert all(t1[:2] == t2[:2] for t1, t2 in zip(T, t))
 
+   :func:`untokenize` has two modes. If the input tokens are sequences
+   of length 2 (`type`, `string`) then spaces are added as necessary to
+   preserve the roundtrip property.
+
+   If the input tokens are sequences of length 4 or more (`type`,
+   `string`, `start`, `end`), as returned by :func:`tokenize`, then
+   spaces are added so that each token appears in the result at the
+   position indicated by `start` and `end`, if possible.
 
 :func:`tokenize` needs to detect the encoding of source files it tokenizes. The
 function it uses to do this is available:
diff -r 74e79b2c114a Lib/test/test_tokenize.py
--- a/Lib/test/test_tokenize.py Fri Aug 05 23:05:35 2011 +0200
+++ b/Lib/test/test_tokenize.py Fri Aug 05 23:50:26 2011 +0100
@@ -2,7 +2,7 @@
 Tests for the tokenize module.
 
 The tests can be really simple. Given a small fragment of source
-code, print out a table with tokens. The ENDMARK is omitted for
+code, print out a table with tokens. The ENDMARKER is omitted for
 brevity.
 
     >>> dump_tokens("1 + 1")
@@ -560,6 +560,18 @@
     NAME       'grün'        (2, 0) (2, 4)
     OP         '='           (2, 5) (2, 6)
     STRING     "'green'"     (2, 7) (2, 14)
+
+Untokenization of backslash-continued lines
+
+    >>> roundtrip("1 and \\\\\\n not 2\\n")
+    True
+
+Untokenization without an ENCODING token
+
+    >>> untokenize([(NAME, 'hello')])
+    b'hello '
+    >>> untokenize(iter([(NAME, 'hello')]))
+    b'hello '
 """
 
 from test import support
@@ -584,22 +596,34 @@
 
 def roundtrip(f):
     """
-    Test roundtrip for `untokenize`. `f` is an open file or a string.
-    The source code in f is tokenized, converted back to source code via
-    tokenize.untokenize(), and tokenized again from the latter. The test
-    fails if the second tokenization doesn't match the first.
+    Test the roundtrip property of `untokenize` and return True if it
+    passes in both modes (normal and compatibility). The first argument
+    `f` is a string, or an object with a `readline` method, such as an
+    open file. The source code in `f` is tokenized, converted back to
+    source code via `untokenize`, and then re-tokenized from the
+    latter. The test succeeds if the second tokenization matches the
+    first.
""" if isinstance(f, str): f = BytesIO(f.encode('utf-8')) try: - token_list = list(tokenize(f.readline)) + tokens1 = list(tokenize(f.readline)) finally: f.close() - tokens1 = [tok[:2] for tok in token_list] - new_bytes = untokenize(tokens1) - readline = (line for line in new_bytes.splitlines(1)).__next__ - tokens2 = [tok[:2] for tok in tokenize(readline)] - return tokens1 == tokens2 + # Full mode. + bytes1 = untokenize(tokens1) + tokens2 = list(tokenize(iter(bytes1.splitlines(1)).__next__)) + if any(t1[:2] != t2[:2] for t1, t2 in zip(tokens1, tokens2)): + print([(t1, t2) for t1, t2 in zip(tokens1, tokens2) if t1[:2] != t2[:2]][0]) + return False + # Compatibility mode (only pass a 2-tuple of type and string). + bytes2 = untokenize(t[:2] for t in tokens1) + tokens3 = list(tokenize(iter(bytes2.splitlines(1)).__next__)) + if any(t1[:2] != t3[:2] for t1, t3 in zip(tokens1, tokens3)): + print([(t1, t2) for t1, t2 in zip(tokens1, tokens3) if t1[:2] != t2[:2]][0]) + return False + # Both modes pass. + return True # This is an example from the docs, set up as a doctest. def decistmt(s): diff -r 74e79b2c114a Lib/tokenize.py --- a/Lib/tokenize.py Fri Aug 05 23:05:35 2011 +0200 +++ b/Lib/tokenize.py Fri Aug 05 23:50:26 2011 +0100 @@ -158,44 +158,45 @@ self.tokens = [] self.prev_row = 1 self.prev_col = 0 - self.encoding = None + self.encoding = 'utf-8' - def add_whitespace(self, start): + def add_whitespace(self, tok_type, start): row, col = start - assert row <= self.prev_row + assert row >= self.prev_row col_offset = col - self.prev_col - if col_offset: + if col_offset > 0: self.tokens.append(" " * col_offset) + elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER): + # Line was backslash-continued. + self.tokens.append(" ") - def untokenize(self, iterable): + def untokenize(self, tokens): + iterable = iter(tokens) for t in iterable: if len(t) == 2: self.compat(t, iterable) break - tok_type, token, start, end, line = t + tok_type, token, start, end, *_ = t if tok_type == ENCODING: self.encoding = token continue - self.add_whitespace(start) + self.add_whitespace(tok_type, start) self.tokens.append(token) self.prev_row, self.prev_col = end if tok_type in (NEWLINE, NL): self.prev_row += 1 self.prev_col = 0 - return "".join(self.tokens) + return "".join(self.tokens).encode(self.encoding) def compat(self, token, iterable): + # This import is here to avoid problems when the itertools + # module is not built yet and tokenize is imported. + from itertools import chain startline = False + prevstring = False indents = [] - toks_append = self.tokens.append - toknum, tokval = token - if toknum in (NAME, NUMBER): - tokval += ' ' - if toknum in (NEWLINE, NL): - startline = True - prevstring = False - for tok in iterable: + for tok in chain([token], iterable): toknum, tokval = tok[:2] if toknum == ENCODING: self.encoding = tokval @@ -221,36 +222,33 @@ elif toknum in (NEWLINE, NL): startline = True elif startline and indents: - toks_append(indents[-1]) + self.tokens.append(indents[-1]) startline = False - toks_append(tokval) + self.tokens.append(tokval) -def untokenize(iterable): - """Transform tokens back into Python source code. - It returns a bytes object, encoded using the ENCODING - token, which is the first token sequence output by tokenize. +def untokenize(tokens): + """Convert `tokens` (a sequence of tokens) back into Python source + code. Return a bytes object, encoded using the encoding specified by + the last ENCODING token in `tokens`, or UTF-8 if no ENCODING token + is found. 
-    Each element returned by the iterable must be a token sequence
-    with at least two elements, a token number and token value.  If
-    only two tokens are passed, the resulting output is poor.
+    The result satisfies the following roundtrip property for any list
+    of tokens `T`:
 
-    Round-trip invariant for full input:
-        Untokenized source will match input source exactly
+        >>> t = tokenize(io.BytesIO(untokenize(T)).readline)
+        >>> assert all(t1[:2] == t2[:2] for t1, t2 in zip(T, t))
 
-    Round-trip invariant for limited intput:
-        # Output bytes will tokenize the back to the input
-        t1 = [tok[:2] for tok in tokenize(f.readline)]
-        newcode = untokenize(t1)
-        readline = BytesIO(newcode).readline
-        t2 = [tok[:2] for tok in tokenize(readline)]
-        assert t1 == t2
+    `untokenize` has two modes. If the input tokens are sequences of
+    length 2 (`type`, `string`) then spaces are added as necessary to
+    preserve the roundtrip property.
+
+    If the input tokens are sequences of length 4 or more (`type`,
+    `string`, `start`, `end`), as returned by `tokenize`, then spaces
+    are added so that each token appears in the result at the position
+    indicated by `start` and `end`, if possible.
     """
-    ut = Untokenizer()
-    out = ut.untokenize(iterable)
-    if ut.encoding is not None:
-        out = out.encode(ut.encoding)
-    return out
+    return Untokenizer().untokenize(tokens)
 
 
 def _get_normal_name(orig_enc):
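
A small usage sketch (not part of the patch) showing the behaviour the new
tests exercise. It assumes the patched tokenize module is the one being
imported; the expected output on the last line is taken from the doctest
added above.

    import io
    from tokenize import tokenize, untokenize, NAME

    source = b"1 and \\\n not 2\n"
    tokens = list(tokenize(io.BytesIO(source).readline))

    # Full mode: pass the 5-tuples straight from tokenize(); spacing is
    # reconstructed from the start/end positions where possible.
    full = untokenize(tokens)

    # Compatibility mode: pass only (type, string) pairs; spacing is
    # synthesized so that the result still tokenizes to the same tokens.
    compat = untokenize(t[:2] for t in tokens)

    # Both results satisfy the roundtrip property from the docstring.
    for result in (full, compat):
        again = list(tokenize(io.BytesIO(result).readline))
        assert all(t1[:2] == t2[:2] for t1, t2 in zip(tokens, again))

    # With no ENCODING token at all, the result defaults to UTF-8.
    print(untokenize([(NAME, 'hello')]))   # prints b'hello '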