diff -r 74e79b2c114a -r 2f69f3679a41 Doc/library/tokenize.rst
--- a/Doc/library/tokenize.rst	Fri Aug 05 23:05:35 2011 +0200
+++ b/Doc/library/tokenize.rst	Sat Aug 06 01:05:36 2011 +0100
@@ -67,21 +67,25 @@
     write back the modified script.
 
 
-.. function:: untokenize(iterable)
+.. function:: untokenize(tokens)
 
-   Converts tokens back into Python source code.  The *iterable* must return
-   sequences with at least two elements, the token type and the token string.
-   Any additional sequence elements are ignored.
+   Convert *tokens* (an iterable) back into Python source code.  Return
+   a bytes object, encoded using the encoding specified by the last
+   ENCODING token in *tokens*, or UTF-8 if no ENCODING token is found.
 
-   The reconstructed script is returned as a single string.  The result is
-   guaranteed to tokenize back to match the input so that the conversion is
-   lossless and round-trips are assured.  The guarantee applies only to the
-   token type and token string as the spacing between tokens (column
-   positions) may change.
+   The result is guaranteed to tokenize back to match the input so that
+   the conversion is lossless and round-trips are assured.  The
+   guarantee applies only to the token type and token string as the
+   spacing between tokens (column positions) may change.
 
-   It returns bytes, encoded using the ENCODING token, which is the first
-   token sequence output by :func:`tokenize`.
+   :func:`untokenize` has two modes.  If the input tokens are sequences
+   of length 2 (``type``, ``string``) then spaces are added as necessary to
+   preserve the round-trip property.
+   If the input tokens are sequences of length 4 or more (``type``,
+   ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then
+   spaces are added so that each token appears in the result at the
+   position indicated by ``start`` and ``end``, if possible.
 
 
 :func:`tokenize` needs to detect the encoding of source files it tokenizes.  The
 function it uses to do this is available:
diff -r 74e79b2c114a -r 2f69f3679a41 Lib/test/test_tokenize.py
--- a/Lib/test/test_tokenize.py	Fri Aug 05 23:05:35 2011 +0200
+++ b/Lib/test/test_tokenize.py	Sat Aug 06 01:05:36 2011 +0100
@@ -2,7 +2,7 @@
 Tests for the tokenize module.
 
 The tests can be really simple. Given a small fragment of source
-code, print out a table with tokens. The ENDMARK is omitted for
+code, print out a table with tokens. The ENDMARKER is omitted for
 brevity.
 
     >>> dump_tokens("1 + 1")
@@ -560,6 +560,18 @@
     NAME       'grün'        (2, 0) (2, 4)
     OP         '='           (2, 5) (2, 6)
     STRING     "'green'"     (2, 7) (2, 14)
+
+Untokenization of backslash-continued lines
+
+    >>> roundtrip("1 and \\\\\\n not 2\\n")
+    True
+
+Untokenization without an ENCODING token
+
+    >>> untokenize([(NAME, 'hello')])
+    b'hello '
+    >>> untokenize(iter([(NAME, 'hello')]))
+    b'hello '
 """
 
 from test import support
@@ -584,22 +596,34 @@
 def roundtrip(f):
     """
-    Test roundtrip for `untokenize`. `f` is an open file or a string.
-    The source code in f is tokenized, converted back to source code via
-    tokenize.untokenize(), and tokenized again from the latter. The test
-    fails if the second tokenization doesn't match the first.
+    Test the roundtrip property of `untokenize` and return True if it
+    passes in both modes (normal and compatibility).  The first argument
+    `f` is a string, or an object with a `readline` method, such as an
+    open file.  The source code in `f` is tokenized, converted back to
+    source code via `untokenize`, and then re-tokenized from the
+    latter.  The test succeeds if the second tokenization matches the
+    first.
     """
     if isinstance(f, str):
         f = BytesIO(f.encode('utf-8'))
     try:
-        token_list = list(tokenize(f.readline))
+        tokens1 = list(tokenize(f.readline))
     finally:
         f.close()
-    tokens1 = [tok[:2] for tok in token_list]
-    new_bytes = untokenize(tokens1)
-    readline = (line for line in new_bytes.splitlines(1)).__next__
-    tokens2 = [tok[:2] for tok in tokenize(readline)]
-    return tokens1 == tokens2
+    # Full mode.
+    bytes1 = untokenize(tokens1)
+    tokens2 = list(tokenize(iter(bytes1.splitlines(1)).__next__))
+    if any(t1[:2] != t2[:2] for t1, t2 in zip(tokens1, tokens2)):
+        print([(t1, t2) for t1, t2 in zip(tokens1, tokens2) if t1[:2] != t2[:2]][0])
+        return False
+    # Compatibility mode (only pass a 2-tuple of type and string).
+    bytes2 = untokenize(t[:2] for t in tokens1)
+    tokens3 = list(tokenize(iter(bytes2.splitlines(1)).__next__))
+    if any(t1[:2] != t3[:2] for t1, t3 in zip(tokens1, tokens3)):
+        print([(t1, t2) for t1, t2 in zip(tokens1, tokens3) if t1[:2] != t2[:2]][0])
+        return False
+    # Both modes pass.
+    return True
 
 
 # This is an example from the docs, set up as a doctest.
 def decistmt(s):
diff -r 74e79b2c114a -r 2f69f3679a41 Lib/tokenize.py
--- a/Lib/tokenize.py	Fri Aug 05 23:05:35 2011 +0200
+++ b/Lib/tokenize.py	Sat Aug 06 01:05:36 2011 +0100
@@ -158,44 +158,46 @@
         self.tokens = []
         self.prev_row = 1
         self.prev_col = 0
-        self.encoding = None
+        self.encoding = 'utf-8'
 
-    def add_whitespace(self, start):
+    def add_whitespace(self, tok_type, start):
         row, col = start
-        assert row <= self.prev_row
+        assert row >= self.prev_row
         col_offset = col - self.prev_col
-        if col_offset:
+        if col_offset > 0:
             self.tokens.append(" " * col_offset)
+        elif row > self.prev_row and tok_type not in (NEWLINE, NL, ENDMARKER):
+            # Line was backslash-continued.
+            self.tokens.append(" ")
 
-    def untokenize(self, iterable):
+    def untokenize(self, tokens):
+        iterable = iter(tokens)
         for t in iterable:
             if len(t) == 2:
                 self.compat(t, iterable)
                 break
-            tok_type, token, start, end, line = t
+            tok_type, token, start, end, *_ = t
             if tok_type == ENCODING:
                 self.encoding = token
                 continue
-            self.add_whitespace(start)
+            self.add_whitespace(tok_type, start)
             self.tokens.append(token)
             self.prev_row, self.prev_col = end
             if tok_type in (NEWLINE, NL):
                 self.prev_row += 1
                 self.prev_col = 0
-        return "".join(self.tokens)
+        return "".join(self.tokens).encode(self.encoding)
 
     def compat(self, token, iterable):
+        # This import is here to avoid problems when the itertools
+        # module is not built yet and tokenize is imported.
+        from itertools import chain
         startline = False
+        prevstring = False
         indents = []
         toks_append = self.tokens.append
-        toknum, tokval = token
-        if toknum in (NAME, NUMBER):
-            tokval += ' '
-        if toknum in (NEWLINE, NL):
-            startline = True
-        prevstring = False
-        for tok in iterable:
+        for tok in chain([token], iterable):
             toknum, tokval = tok[:2]
 
             if toknum == ENCODING:
                 self.encoding = tokval
@@ -226,31 +228,27 @@
             toks_append(tokval)
 
 
-def untokenize(iterable):
-    """Transform tokens back into Python source code.
-    It returns a bytes object, encoded using the ENCODING
-    token, which is the first token sequence output by tokenize.
+def untokenize(tokens):
+    """
+    Convert ``tokens`` (an iterable) back into Python source code.  Return
+    a bytes object, encoded using the encoding specified by the last
+    ENCODING token in ``tokens``, or UTF-8 if no ENCODING token is found.
 
-    Each element returned by the iterable must be a token sequence
-    with at least two elements, a token number and token value.  If
-    only two tokens are passed, the resulting output is poor.
+    The result is guaranteed to tokenize back to match the input so that
+    the conversion is lossless and round-trips are assured.  The
+    guarantee applies only to the token type and token string as the
+    spacing between tokens (column positions) may change.
 
-    Round-trip invariant for full input:
-        Untokenized source will match input source exactly
+    :func:`untokenize` has two modes.  If the input tokens are sequences
+    of length 2 (``type``, ``string``) then spaces are added as necessary to
+    preserve the round-trip property.
 
-    Round-trip invariant for limited intput:
-        # Output bytes will tokenize the back to the input
-        t1 = [tok[:2] for tok in tokenize(f.readline)]
-        newcode = untokenize(t1)
-        readline = BytesIO(newcode).readline
-        t2 = [tok[:2] for tok in tokenize(readline)]
-        assert t1 == t2
+    If the input tokens are sequences of length 4 or more (``type``,
+    ``string``, ``start``, ``end``), as returned by :func:`tokenize`, then
+    spaces are added so that each token appears in the result at the
+    position indicated by ``start`` and ``end``, if possible.
     """
-    ut = Untokenizer()
-    out = ut.untokenize(iterable)
-    if ut.encoding is not None:
-        out = out.encode(ut.encoding)
-    return out
+    return Untokenizer().untokenize(tokens)
 
 
 def _get_normal_name(orig_enc):
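Note (not part of the patch): the sketch below shows how the two untokenize modes documented above can be exercised through the public tokenize API, mirroring the round-trip check that the patched roundtrip() helper in test_tokenize.py performs. It assumes an interpreter whose tokenize module includes fixes along these lines (untokenize always returning bytes and preserving the round-trip property in both modes); the example source fragment and the type_string_pairs helper are made up for illustration, not taken from the patch.

    import io
    import tokenize

    # A small, made-up source fragment to round-trip.
    source = b"if x:\n    y = f(1, 2)  # call\n"
    tokens = list(tokenize.tokenize(io.BytesIO(source).readline))

    def type_string_pairs(data):
        # Re-tokenize a bytes object and keep only (type, string) pairs,
        # which is all the round-trip guarantee covers.
        return [t[:2] for t in tokenize.tokenize(io.BytesIO(data).readline)]

    # Full mode: 5-tuples carry start/end positions, so the original
    # spacing can be reproduced; the result is a bytes object.
    full = tokenize.untokenize(tokens)
    assert isinstance(full, bytes)
    assert type_string_pairs(full) == [t[:2] for t in tokens]

    # Compatibility mode: only (type, string) pairs are supplied, so
    # column positions are not preserved, but the token stream still is.
    compat = tokenize.untokenize(t[:2] for t in tokens)
    assert type_string_pairs(compat) == [t[:2] for t in tokens]

The compatibility-mode output spaces tokens by heuristic (for example, a space after every NAME and NUMBER token), so it generally does not match the original source byte-for-byte; only the sequence of token types and strings is guaranteed to survive the round trip, which is exactly what the asserts above check.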