diff -r 74e79b2c114a -r 2f69f3679a41 Doc/library/tokenize.rst
--- a/Doc/library/tokenize.rst	Fri Aug 05 23:05:35 2011 +0200
+++ b/Doc/library/tokenize.rst	Sat Aug 06 01:05:36 2011 +0100
@@ -67,21 +67,25 @@
     write back the modified script.
 
 
-.. function:: untokenize(iterable)
+.. function:: untokenize(tokens)
 
-   Converts tokens back into Python source code.  The *iterable* must return
-   sequences with at least two elements, the token type and the token string.
-   Any additional sequence elements are ignored.
+   Convert *tokens* (an iterable) back into Python source code.  Return
+   a bytes object, encoded using the encoding specified by the last
+   ENCODING token in *tokens*, or UTF-8 if no ENCODING token is found.
 
-   The reconstructed script is returned as a single string.  The result is
-   guaranteed to tokenize back to match the input so that the conversion is
-   lossless and round-trips are assured.  The guarantee applies only to the
-   token type and token string as the spacing between tokens (column
-   positions) may change.
+   The result is guaranteed to tokenize back to match the input so that
+   the conversion is lossless and round-trips are assured.  The
+   guarantee applies only to the token type and token string as the
+   spacing between tokens (column positions) may change.
 
-   It returns bytes, encoded using the ENCODING token, which is the first
-   token sequence output by :func:`tokenize`.
+   :func:`untokenize` has two modes.  If the input tokens are sequences
+   of length 2 (``type``, ``string``) then spaces are added as necessary to
+   preserve the round-trip property.
+   If the input tokens are sequences of length 4 or more (``type``,
+   ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then
+   spaces are added so that each token appears in the result at the
+   position indicated by ``start`` and ``end``, if possible.
 
 
 :func:`tokenize` needs to detect the encoding of source files it tokenizes.  The
 function it uses to do this is available:
diff -r 74e79b2c114a -r 2f69f3679a41 Lib/test/test_tokenize.py
--- a/Lib/test/test_tokenize.py	Fri Aug 05 23:05:35 2011 +0200
+++ b/Lib/test/test_tokenize.py	Sat Aug 06 01:05:36 2011 +0100
@@ -2,7 +2,7 @@
 Tests for the tokenize module.
 
 The tests can be really simple. Given a small fragment of source
-code, print out a table with tokens. The ENDMARK is omitted for
+code, print out a table with tokens. The ENDMARKER is omitted for
 brevity.
 
     >>> dump_tokens("1 + 1")
@@ -560,6 +560,18 @@
     NAME       'grün'        (2, 0) (2, 4)
     OP         '='           (2, 5) (2, 6)
     STRING     "'green'"     (2, 7) (2, 14)
+
+Untokenization of backslash-continued lines
+
+    >>> roundtrip("1 and \\\\\\n not 2\\n")
+    True
+
+Untokenization without an ENCODING token
+
+    >>> untokenize([(NAME, 'hello')])
+    b'hello '
+    >>> untokenize(iter([(NAME, 'hello')]))
+    b'hello '
 """
 
 from test import support
@@ -584,22 +596,34 @@
 def roundtrip(f):
     """
-    Test roundtrip for `untokenize`. `f` is an open file or a string.
-    The source code in f is tokenized, converted back to source code via
-    tokenize.untokenize(), and tokenized again from the latter. The test
-    fails if the second tokenization doesn't match the first.
+    Test the roundtrip property of `untokenize` and return True if it
+    passes in both modes (normal and compatibility).  The first argument
+    `f` is a string, or an object with a `readline` method, such as an
+    open file.  The source code in `f` is tokenized, converted back to
+    source code via `untokenize`, and then re-tokenized from the
+    latter.  The test succeeds if the second tokenization matches the
+    first.
     """
     if isinstance(f, str):
         f = BytesIO(f.encode('utf-8'))
     try:
-        token_list = list(tokenize(f.readline))
+        tokens1 = list(tokenize(f.readline))
     finally:
         f.close()
-    tokens1 = [tok[:2] for tok in token_list]
-    new_bytes = untokenize(tokens1)
-    readline = (line for line in new_bytes.splitlines(1)).__next__
-    tokens2 = [tok[:2] for tok in tokenize(readline)]
-    return tokens1 == tokens2
+    # Full mode.
+    bytes1 = untokenize(tokens1)
+    tokens2 = list(tokenize(iter(bytes1.splitlines(1)).__next__))
+    if any(t1[:2] != t2[:2] for t1, t2 in zip(tokens1, tokens2)):
+        print([(t1, t2) for t1, t2 in zip(tokens1, tokens2) if t1[:2] != t2[:2]][0])
+        return False
+    # Compatibility mode (only pass a 2-tuple of type and string).
+    bytes2 = untokenize(t[:2] for t in tokens1)
+    tokens3 = list(tokenize(iter(bytes2.splitlines(1)).__next__))
+    if any(t1[:2] != t3[:2] for t1, t3 in zip(tokens1, tokens3)):
+        print([(t1, t2) for t1, t2 in zip(tokens1, tokens3) if t1[:2] != t2[:2]][0])
+        return False
+    # Both modes pass.
+    return True
 
 
 # This is an example from the docs, set up as a doctest.
 def decistmt(s):
diff -r 74e79b2c114a -r 2f69f3679a41 Lib/tokenize.py
--- a/Lib/tokenize.py	Fri Aug 05 23:05:35 2011 +0200
+++ b/Lib/tokenize.py	Sat Aug 06 01:05:36 2011 +0100
@@ -158,44 +158,46 @@
         self.tokens = []
         self.prev_row = 1
         self.prev_col = 0
-        self.encoding = None
+        self.encoding = 'utf-8'
 
-    def add_whitespace(self, start):
+    def add_whitespace(self, tok_type, start):
         row, col = start
-        assert row <= self.prev_row
+        assert row >= self.prev_row
         col_offset = col - self.prev_col
-        if col_offset:
+        if col_offset > 0:
             self.tokens.append(" " * col_offset)
+        elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
+            # Line was backslash-continued.
+            self.tokens.append(" ")
 
-    def untokenize(self, iterable):
+    def untokenize(self, tokens):
+        iterable = iter(tokens)
         for t in iterable:
             if len(t) == 2:
                 self.compat(t, iterable)
                 break
-            tok_type, token, start, end, line = t
+            tok_type, token, start, end, *_ = t
             if tok_type == ENCODING:
                 self.encoding = token
                 continue
-            self.add_whitespace(start)
+            self.add_whitespace(tok_type, start)
             self.tokens.append(token)
             self.prev_row, self.prev_col = end
             if tok_type in (NEWLINE, NL):
                 self.prev_row += 1
                 self.prev_col = 0
-        return "".join(self.tokens)
+        return "".join(self.tokens).encode(self.encoding)
 
     def compat(self, token, iterable):
+        # This import is here to avoid problems when the itertools
+        # module is not built yet and tokenize is imported.
+        from itertools import chain
         startline = False
+        prevstring = False
         indents = []
         toks_append = self.tokens.append
-        toknum, tokval = token
-        if toknum in (NAME, NUMBER):
-            tokval += ' '
-        if toknum in (NEWLINE, NL):
-            startline = True
-        prevstring = False
-        for tok in iterable:
+        for tok in chain([token], iterable):
             toknum, tokval = tok[:2]
 
             if toknum == ENCODING:
                 self.encoding = tokval
@@ -226,31 +228,27 @@
             toks_append(tokval)
 
 
-def untokenize(iterable):
-    """Transform tokens back into Python source code.
-    It returns a bytes object, encoded using the ENCODING
-    token, which is the first token sequence output by tokenize.
+def untokenize(tokens):
+    """
+    Convert ``tokens`` (an iterable) back into Python source code.  Return
+    a bytes object, encoded using the encoding specified by the last
+    ENCODING token in ``tokens``, or UTF-8 if no ENCODING token is found.
 
-    Each element returned by the iterable must be a token sequence
-    with at least two elements, a token number and token value.  If
-    only two tokens are passed, the resulting output is poor.
+    The result is guaranteed to tokenize back to match the input so that
+    the conversion is lossless and round-trips are assured.  The
+    guarantee applies only to the token type and token string as the
+    spacing between tokens (column positions) may change.
 
-    Round-trip invariant for full input:
-        Untokenized source will match input source exactly
+    :func:`untokenize` has two modes.  If the input tokens are sequences
+    of length 2 (``type``, ``string``) then spaces are added as necessary to
+    preserve the round-trip property.
 
-    Round-trip invariant for limited intput:
-        # Output bytes will tokenize the back to the input
-        t1 = [tok[:2] for tok in tokenize(f.readline)]
-        newcode = untokenize(t1)
-        readline = BytesIO(newcode).readline
-        t2 = [tok[:2] for tok in tokenize(readline)]
-        assert t1 == t2
+    If the input tokens are sequences of length 4 or more (``type``,
+    ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then
+    spaces are added so that each token appears in the result at the
+    position indicated by ``start`` and ``end``, if possible.
     """
-    ut = Untokenizer()
-    out = ut.untokenize(iterable)
-    if ut.encoding is not None:
-        out = out.encode(ut.encoding)
-    return out
+    return Untokenizer().untokenize(tokens)
 
 
 def _get_normal_name(orig_enc):
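Note (not part of the patch): the sketch below shows how the two untokenize modes documented above can be exercised through the public tokenize API, mirroring the round-trip check that the patched roundtrip() helper in test_tokenize.py performs. It assumes an interpreter whose tokenize module includes fixes along these lines (untokenize always returning bytes and preserving the round-trip property in both modes); the example source fragment and the type_string_pairs helper are made up for illustration, not taken from the patch.

    import io
    import tokenize

    # A small, made-up source fragment to round-trip.
    source = b"if x:\n    y = f(1, 2)  # call\n"
    tokens = list(tokenize.tokenize(io.BytesIO(source).readline))

    def type_string_pairs(data):
        # Re-tokenize a bytes object and keep only (type, string) pairs,
        # which is all the round-trip guarantee covers.
        return [t[:2] for t in tokenize.tokenize(io.BytesIO(data).readline)]

    # Full mode: 5-tuples carry start/end positions, so the original
    # spacing can be reproduced; the result is a bytes object.
    full = tokenize.untokenize(tokens)
    assert isinstance(full, bytes)
    assert type_string_pairs(full) == [t[:2] for t in tokens]

    # Compatibility mode: only (type, string) pairs are supplied, so
    # column positions are not preserved, but the token stream still is.
    compat = tokenize.untokenize(t[:2] for t in tokens)
    assert type_string_pairs(compat) == [t[:2] for t in tokens]

The compatibility-mode output spaces tokens by heuristic (for example, a space after every NAME and NUMBER token), so it generally does not match the original source byte-for-byte; only the sequence of token types and strings is guaranteed to survive the round trip, which is exactly what the asserts above check.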