diff -r 1b8ba1346e67 Doc/library/tokenize.rst
--- a/Doc/library/tokenize.rst Tue Feb 04 23:02:36 2014 +1000
+++ b/Doc/library/tokenize.rst Wed Feb 05 10:52:45 2014 +0000
@@ -85,18 +85,29 @@
 
 .. function:: untokenize(iterable)
 
-   Converts tokens back into Python source code. The *iterable* must return
-   sequences with at least two elements, the token type and the token string.
-   Any additional sequence elements are ignored.
+   Convert tokens back into Python source code.
+   The *iterable* must yield sequences of tokens. The reconstructed
+   source code is returned as a bytes object, encoded using the
+   ENCODING token, which is the first token output by :func:`tokenize`.
 
-   The reconstructed script is returned as a single string. The result is
-   guaranteed to tokenize back to match the input so that the conversion is
-   lossless and round-trips are assured. The guarantee applies only to the
-   token type and token string as the spacing between tokens (column
-   positions) may change.
+   :func:`untokenize` has two modes of operation. In the first mode, each
+   input token must be a sequence of length 2, whose members are the
+   token type and the token string. In this mode, the reconstructed
+   source code is guaranteed to tokenize back to match the input, so
+   that the conversion is lossless and round-trips are assured. This
+   guarantee applies only to the token type and token string, as the
+   spacing between tokens (column positions) may change.
 
-   It returns bytes, encoded using the ENCODING token, which is the first
-   token sequence output by :func:`tokenize`.
+   Otherwise, each input token must be a sequence of length 5 with
+   these members: the token type; the token string; a 2-tuple ``(srow,
+   scol)`` of ints specifying the row and column where the token begins
+   in the source; a 2-tuple ``(erow, ecol)`` of ints specifying the row
+   and column where the token ends in the source; and the line on
+   which the token was found, as returned by :func:`tokenize`. In this mode
+   of operation, spaces are added to the result so that each token
+   appears at the given row and column, if possible. The output
+   satisfies the above round-trip property, and in addition, for
+   valid Python source code, ``untokenize(tokenize(source)) == source``.
 
 
 :func:`tokenize` needs to detect the encoding of source files it tokenizes. The
diff -r 1b8ba1346e67 Lib/test/test_tokenize.py
--- a/Lib/test/test_tokenize.py Tue Feb 04 23:02:36 2014 +1000
+++ b/Lib/test/test_tokenize.py Wed Feb 05 10:52:45 2014 +0000
@@ -2,7 +2,7 @@
 Tests for the tokenize module.
 
 The tests can be really simple. Given a small fragment of source
-code, print out a table with tokens. The ENDMARK is omitted for
+code, print out a table with tokens. The ENDMARKER is omitted for
 brevity.
 
     >>> dump_tokens("1 + 1")
@@ -558,18 +558,36 @@
 
 Backslash means line continuation, except for comments
 
-    >>> roundtrip("x=1+\\\\n"
-    ...           "1\\n"
-    ...           "# This is a comment\\\\n"
-    ...           "# This also\\n")
-    True
-    >>> roundtrip("# Comment \\\\nx = 0")
-    True
+    >>> dump_tokens("x=1+\\\\\\n"
+    ...             "1\\n"
+    ...             "# This is a comment\\\\\\n"
+    ...             "# This also\\n")
"# This also\\n") + ENCODING 'utf-8' (0, 0) (0, 0) + NAME 'x' (1, 0) (1, 1) + OP '=' (1, 1) (1, 2) + NUMBER '1' (1, 2) (1, 3) + OP '+' (1, 3) (1, 4) + NUMBER '1' (2, 0) (2, 1) + NEWLINE '\\n' (2, 1) (2, 2) + COMMENT '# This is a (3, 0) (3, 20) + NL '\\n' (3, 20) (3, 21) + COMMENT '# This also' (4, 0) (4, 11) + NL '\\n' (4, 11) (4, 12) + + >>> dump_tokens("# Comment \\\\\\nx = 0") + ENCODING 'utf-8' (0, 0) (0, 0) + COMMENT '# Comment \\\\ (1, 0) (1, 11) + NL '\\n' (1, 11) (1, 12) + NAME 'x' (2, 0) (2, 1) + OP '=' (2, 2) (2, 3) + NUMBER '0' (2, 4) (2, 5) Two string literals on the same line - >>> roundtrip("'' ''") - True + >>> dump_tokens("'' ''") + ENCODING 'utf-8' (0, 0) (0, 0) + STRING "''" (1, 0) (1, 2) + STRING "''" (1, 3) (1, 5) Test roundtrip on random python modules. pass the '-ucpu' option to process the full directory. @@ -648,6 +666,8 @@ """Print out the tokens in s in a table format. The ENDMARKER is omitted. + + Also, check the round-trip property of s. """ f = BytesIO(s.encode('utf-8')) for type, token, start, end, line in tokenize(f.readline): @@ -655,6 +675,7 @@ break type = tok_name[type] print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals()) + assert roundtrip(s) def roundtrip(f): """ @@ -669,11 +690,24 @@ token_list = list(tokenize(f.readline)) finally: f.close() + + # The test is repeated for the two modes of `untokenize`: in + # "compatibility" mode we truncate each token to its first two + # elements (type, token). tokens1 = [tok[:2] for tok in token_list] new_bytes = untokenize(tokens1) - readline = (line for line in new_bytes.splitlines(keepends=True)).__next__ + readline = iter(new_bytes.splitlines(keepends=True)).__next__ tokens2 = [tok[:2] for tok in tokenize(readline)] - return tokens1 == tokens2 + + # In "full" mode we pass the tokens unchanged. + new_bytes = untokenize(token_list) + readline = iter(new_bytes.splitlines(keepends=True)).__next__ + # For the moment we only compare the truncated tokens, leaving + # whitespace differences undetected. TODO: test the full tokens + # instead. + tokens3 = [tok[:2] for tok in tokenize(readline)] + + return tokens1 == tokens2 == tokens3 # This is an example from the docs, set up as a doctest. def decistmt(s): diff -r 1b8ba1346e67 Lib/tokenize.py --- a/Lib/tokenize.py Tue Feb 04 23:02:36 2014 +1000 +++ b/Lib/tokenize.py Wed Feb 05 10:52:45 2014 +0000 @@ -227,14 +227,21 @@ self.prev_col = 0 self.encoding = None - def add_whitespace(self, start): + def add_whitespace(self, start, tok_type, prev_tok_type): row, col = start - assert row <= self.prev_row + assert row >= self.prev_row + if (row > self.prev_row + and tok_type not in (DEDENT, ENDMARKER) + and prev_tok_type not in (NEWLINE, NL)): + # Line must have been backslash-continued. + self.tokens.append(" \\\n") + self.prev_col = 0 col_offset = col - self.prev_col if col_offset: self.tokens.append(" " * col_offset) def untokenize(self, iterable): + prev_tok_type = None for t in iterable: if len(t) == 2: self.compat(t, iterable) @@ -243,12 +250,13 @@ if tok_type == ENCODING: self.encoding = token continue - self.add_whitespace(start) + self.add_whitespace(start, tok_type, prev_tok_type) self.tokens.append(token) self.prev_row, self.prev_col = end if tok_type in (NEWLINE, NL): self.prev_row += 1 self.prev_col = 0 + prev_tok_type = tok_type return "".join(self.tokens) def compat(self, token, iterable): @@ -294,24 +302,30 @@ def untokenize(iterable): - """Transform tokens back into Python source code. 
-    It returns a bytes object, encoded using the ENCODING
-    token, which is the first token sequence output by tokenize.
+    """Convert tokens back into Python source code.
+    The iterable must yield sequences of tokens. The reconstructed
+    source code is returned as a bytes object, encoded using the
+    ENCODING token, which is the first token output by tokenize().
 
-    Each element returned by the iterable must be a token sequence
-    with at least two elements, a token number and token value. If
-    only two tokens are passed, the resulting output is poor.
+    untokenize() has two modes of operation. In the first mode, each
+    input token must be a sequence of length 2, whose members are the
+    token type and the token string. In this mode, the reconstructed
+    source code is guaranteed to tokenize back to match the input, so
+    that the conversion is lossless and round-trips are assured. This
+    guarantee applies only to the token type and token string, as the
+    spacing between tokens (column positions) may change.
 
-    Round-trip invariant for full input:
-        Untokenized source will match input source exactly
+    Otherwise, each input token must be a sequence of length 5 with
+    these members: the token type; the token string; a 2-tuple (srow,
+    scol) of ints specifying the row and column where the token begins
+    in the source; a 2-tuple (erow, ecol) of ints specifying the row
+    and column where the token ends in the source; and the line on
+    which the token was found, as returned by tokenize(). In this mode
+    of operation, spaces are added to the result so that each token
+    appears at the given row and column, if possible. The output
+    satisfies the above round-trip property, and in addition, for
+    valid Python source code, untokenize(tokenize(source)) == source.
 
-    Round-trip invariant for limited intput:
-        # Output bytes will tokenize the back to the input
-        t1 = [tok[:2] for tok in tokenize(f.readline)]
-        newcode = untokenize(t1)
-        readline = BytesIO(newcode).readline
-        t2 = [tok[:2] for tok in tokenize(readline)]
-        assert t1 == t2
     """
     ut = Untokenizer()
     out = ut.untokenize(iterable)
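
Reviewer's sketch (not part of the patch): the snippet below exercises the two
modes of untokenize() that the revised documentation describes. It assumes the
patched tokenize module is the one imported; tokens_of() is a local helper
written for this note, and the example source strings are arbitrary. The
second assertion covers a backslash-continued line and therefore depends on
the new add_whitespace() logic; the unpatched Untokenizer would fail there,
since its old assertion (row <= prev_row) rejects tokens that start on a
later row.

    from io import BytesIO
    from tokenize import tokenize, untokenize

    def tokens_of(source):
        # Helper (not part of tokenize): fully tokenize a bytes object.
        return list(tokenize(BytesIO(source).readline))

    # Full mode: 5-tuples carry start/end positions, so untokenize() can
    # restore the original spacing and, per the revised docstring, the
    # source round-trips exactly.
    plain = b"x = 1\nif x:\n    y = 2  # comment\n"
    assert untokenize(tokens_of(plain)) == plain

    # Backslash-continued line: the exact round trip relies on the patched
    # add_whitespace(), which emits " \\\n" when a token starts on a new
    # row without an intervening NEWLINE or NL token.
    continued = b"x = 1 + \\\n    2\n"
    assert untokenize(tokens_of(continued)) == continued

    # Compatibility mode: 2-tuples of (type, string) only guarantee that
    # re-tokenizing the output reproduces the same (type, string) pairs;
    # column positions may change.
    toks = tokens_of(plain)
    compat_bytes = untokenize([tok[:2] for tok in toks])
    assert [tok[:2] for tok in tokens_of(compat_bytes)] == [tok[:2] for tok in toks]

This mirrors what the updated roundtrip() helper in test_tokenize.py checks,
except that the full-mode output is compared byte-for-byte here rather than
token-by-token.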