Index: Misc/ACKS =================================================================== --- Misc/ACKS (revision 61560) +++ Misc/ACKS (working copy) @@ -752,3 +752,5 @@ Mike Zarnstorff Siebren van der Zee Uwe Zessin +Trent Nelson +Michael Foord Index: Misc/NEWS =================================================================== --- Misc/NEWS (revision 61560) +++ Misc/NEWS (working copy) @@ -41,6 +41,12 @@ - Issue #1202: zlib.crc32 and zlib.adler32 now return an unsigned value. +- Issue #719888: Updated tokenize to use a bytes API. generate_tokens has been + renamed tokenize and now works with bytes rather than strings. A new + detect_encoding function has been added for determining source file encoding + according to PEP-0263. Token sequences returned by tokenize always start + with an ENCODING token which specifies the encoding used to decode the file. + This token is used to encode the output of untokenize back to bytes. What's New in Python 3.0a3? =========================== @@ -175,7 +181,6 @@ - Issue #1578: Problems in win_getpass. - Build ----- Index: Tools/i18n/pygettext.py =================================================================== --- Tools/i18n/pygettext.py (revision 61560) +++ Tools/i18n/pygettext.py (working copy) @@ -631,7 +631,9 @@ try: eater.set_filename(filename) try: - tokenize.tokenize(fp.readline, eater) + tokens = tokenize.generate_tokens(fp.readline) + for _token in tokens: + eater(*_token) except tokenize.TokenError as e: print('%s: %s, line %d, column %d' % ( e.args[0], filename, e.args[1][0], e.args[1][1]), Index: Tools/scripts/checkappend.py =================================================================== --- Tools/scripts/checkappend.py (revision 61560) +++ Tools/scripts/checkappend.py (working copy) @@ -103,7 +103,9 @@ def run(self): try: - tokenize.tokenize(self.file.readline, self.tokeneater) + tokens = tokenize.generate_tokens(self.file.readline) + for _token in tokens: + self.tokeneater(*_token) except tokenize.TokenError as msg: errprint("%r: Token Error: %s" % (self.fname, msg)) self.nerrors = self.nerrors + 1 Index: Tools/scripts/reindent.py =================================================================== --- Tools/scripts/reindent.py (revision 61560) +++ Tools/scripts/reindent.py (working copy) @@ -173,7 +173,9 @@ self.stats = [] def run(self): - tokenize.tokenize(self.getline, self.tokeneater) + tokens = tokenize.generate_tokens(self.getline) + for _token in tokens: + self.tokeneater(*_token) # Remove trailing empty lines. lines = self.lines while lines and lines[-1] == "\n": Index: Doc/ACKS.txt =================================================================== --- Doc/ACKS.txt (revision 61560) +++ Doc/ACKS.txt (working copy) @@ -209,3 +209,5 @@ * Moshe Zadka * Milan Zamazal * Cheng Zhang + * Trent Nelson + * Michael Foord Index: Doc/library/tokenize.rst =================================================================== --- Doc/library/tokenize.rst (revision 61560) +++ Doc/library/tokenize.rst (working copy) @@ -9,50 +9,34 @@ The :mod:`tokenize` module provides a lexical scanner for Python source code, -implemented in Python. The scanner in this module returns comments as tokens as -well, making it useful for implementing "pretty-printers," including colorizers -for on-screen displays. +implemented in Python. The scanner in this module returns comments as tokens +as well, making it useful for implementing "pretty-printers," including +colorizers for on-screen displays. The primary entry point is a :term:`generator`: -.. 
function:: generate_tokens(readline) +.. function:: tokenize(readline) - The :func:`generate_tokens` generator requires one argument, *readline*, which + The :func:`tokenize` generator requires one argument, *readline*, which must be a callable object which provides the same interface as the :meth:`readline` method of built-in file objects (see section - :ref:`bltin-file-objects`). Each call to the function should return one line of - input as a string. + :ref:`bltin-file-objects`). Each call to the function should return one + line of input as bytes. - The generator produces 5-tuples with these members: the token type; the token - string; a 2-tuple ``(srow, scol)`` of ints specifying the row and column where - the token begins in the source; a 2-tuple ``(erow, ecol)`` of ints specifying - the row and column where the token ends in the source; and the line on which the - token was found. The line passed is the *logical* line; continuation lines are - included. + The generator produces 5-tuples with these members: the token type; the + token string; a 2-tuple ``(srow, scol)`` of ints specifying the row and + column where the token begins in the source; a 2-tuple ``(erow, ecol)`` of + ints specifying the row and column where the token ends in the source; and + the line on which the token was found. The line passed is the *logical* + line; continuation lines are included. + + tokenize determines the source encoding of the file by looking for a utf-8 + bom or encoding cookie, according to :pep:`263`. -An older entry point is retained for backward compatibility: - -.. function:: tokenize(readline[, tokeneater]) - - The :func:`tokenize` function accepts two parameters: one representing the input - stream, and one providing an output mechanism for :func:`tokenize`. - - The first parameter, *readline*, must be a callable object which provides the - same interface as the :meth:`readline` method of built-in file objects (see - section :ref:`bltin-file-objects`). Each call to the function should return one - line of input as a string. Alternately, *readline* may be a callable object that - signals completion by raising :exc:`StopIteration`. - - The second parameter, *tokeneater*, must also be a callable object. It is - called once for each token, with five arguments, corresponding to the tuples - generated by :func:`generate_tokens`. - - All constants from the :mod:`token` module are also exported from -:mod:`tokenize`, as are two additional token type values that might be passed to -the *tokeneater* function by :func:`tokenize`: +:mod:`tokenize`, as are three additional token type values: .. data:: COMMENT @@ -62,55 +46,94 @@ .. data:: NL Token value used to indicate a non-terminating newline. The NEWLINE token - indicates the end of a logical line of Python code; NL tokens are generated when - a logical line of code is continued over multiple physical lines. + indicates the end of a logical line of Python code; NL tokens are generated + when a logical line of code is continued over multiple physical lines. -Another function is provided to reverse the tokenization process. This is useful -for creating tools that tokenize a script, modify the token stream, and write -back the modified script. +.. data:: ENCODING + Token value that indicates the encoding used to decode the source bytes + into text. The first token returned by :func:`tokenize` will always be an + ENCODING token. + + +Another function is provided to reverse the tokenization process. 
This is +useful for creating tools that tokenize a script, modify the token stream, and +write back the modified script. + + .. function:: untokenize(iterable) - Converts tokens back into Python source code. The *iterable* must return - sequences with at least two elements, the token type and the token string. Any - additional sequence elements are ignored. + Converts tokens back into Python source code. The *iterable* must return + sequences with at least two elements, the token type and the token string. + Any additional sequence elements are ignored. + + The reconstructed script is returned as a single string. The result is + guaranteed to tokenize back to match the input so that the conversion is + lossless and round-trips are assured. The guarantee applies only to the + token type and token string as the spacing between tokens (column + positions) may change. + + It returns bytes, encoded using the ENCODING token, which is the first + token sequence output by :func:`tokenize`. - The reconstructed script is returned as a single string. The result is - guaranteed to tokenize back to match the input so that the conversion is - lossless and round-trips are assured. The guarantee applies only to the token - type and token string as the spacing between tokens (column positions) may - change. +:func:`tokenize` needs to detect the encoding of source files it tokenizes. The +function it uses to do this is available: +.. function:: detect_encoding(readline) + + The :func:`detect_encoding` function is used to detect the encoding that + should be used to decode a Python source file. It requires one argument, + readline, in the same way as the :func:`tokenize` generator. + + It will call readline a maximum of twice, and return the encoding used + (as a string) and a list of any lines (not decoded from bytes) it has read + in. + + It detects the encoding from the presence of a utf-8 bom or an encoding + cookie as specified in :pep:`263`. If both a bom and a cookie are present, + but disagree, a SyntaxError will be raised. + + If no encoding is specified, then the default of 'utf-8' will be returned. + + Example of a script re-writer that transforms float literals into Decimal objects:: - def decistmt(s): - """Substitute Decimals for floats in a string of statements. + def decistmt(s): + """Substitute Decimals for floats in a string of statements. + + >>> from decimal import Decimal + >>> s = 'print(+21.3e-5*-.1234/81.7)' + >>> decistmt(s) + "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))" + + The format of the exponent is inherited from the platform C library. + Known cases are "e-007" (Windows) and "e-07" (not Windows). Since + we're only showing 12 digits, and the 13th isn't close to 5, the + rest of the output should be platform-independent. + + >>> exec(s) #doctest: +ELLIPSIS + -3.21716034272e-0...7 + + Output from calculations with Decimal should be identical across all + platforms. + + >>> exec(decistmt(s)) + -3.217160342717258261933904529E-7 + """ + result = [] + g = tokenize(BytesIO(s.encode('utf-8')).readline) # tokenize the string + for toknum, tokval, _, _, _ in g: + if toknum == NUMBER and '.' 
in tokval: # replace NUMBER tokens + result.extend([ + (NAME, 'Decimal'), + (OP, '('), + (STRING, repr(tokval)), + (OP, ')') + ]) + else: + result.append((toknum, tokval)) + return untokenize(result).decode('utf-8') - >>> from decimal import Decimal - >>> s = 'print(+21.3e-5*-.1234/81.7)' - >>> decistmt(s) - "print(+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))" - - >>> exec(s) - -3.21716034272e-007 - >>> exec(decistmt(s)) - -3.217160342717258261933904529E-7 - - """ - result = [] - g = generate_tokens(StringIO(s).readline) # tokenize the string - for toknum, tokval, _, _, _ in g: - if toknum == NUMBER and '.' in tokval: # replace NUMBER tokens - result.extend([ - (NAME, 'Decimal'), - (OP, '('), - (STRING, repr(tokval)), - (OP, ')') - ]) - else: - result.append((toknum, tokval)) - return untokenize(result) - Index: Lib/idlelib/EditorWindow.py =================================================================== --- Lib/idlelib/EditorWindow.py (revision 61560) +++ Lib/idlelib/EditorWindow.py (working copy) @@ -1437,7 +1437,9 @@ _tokenize.tabsize = self.tabwidth try: try: - _tokenize.tokenize(self.readline, self.tokeneater) + tokens = _tokenize.generate_tokens(self.readline) + for token in tokens: + self.tokeneater(*token) except _tokenize.TokenError: # since we cut off the tokenizer early, we can trigger # spurious errors Index: Lib/tokenize.py =================================================================== --- Lib/tokenize.py (revision 61560) +++ Lib/tokenize.py (working copy) @@ -1,8 +1,11 @@ """Tokenization help for Python programs. -generate_tokens(readline) is a generator that breaks a stream of -text into Python tokens. It accepts a readline-like method which is called -repeatedly to get the next line of input (or "" for EOF). It generates +tokenize(readline) is a generator that breaks a stream of +bytes into Python tokens. It decodes the bytes according to +PEP-0263 for determining source file encoding. + +It accepts a readline-like method which is called +repeatedly to get the next line of input (or b"" for EOF). It generates 5-tuples with these members: the token type (see token.py) @@ -13,32 +16,32 @@ It is designed to match the working of the Python tokenizer exactly, except that it produces COMMENT tokens for comments and gives type OP for all -operators +operators. 
Additionally, all token lists start with an ENCODING token +which tells you which encoding was used to decode the bytes stream.""" -Older entry points - tokenize_loop(readline, tokeneater) - tokenize(readline, tokeneater=printtoken) -are the same, except instead of generating tokens, tokeneater is a callback -function to which the 5 fields described above are passed as 5 arguments, -each time a new token is found.""" - __author__ = 'Ka-Ping Yee ' -__credits__ = \ - 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro, Raymond Hettinger' +__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' + 'Skip Montanaro, Raymond Hettinger, Trent Nelson, ' + 'Michael Foord') -import string, re +import re, string, sys from token import * +from codecs import lookup +from itertools import chain, repeat +cookie_re = re.compile("coding[:=]\s*([-\w.]+)") import token __all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize", - "generate_tokens", "NL", "untokenize"] + "detect_encoding", "NL", "untokenize", "ENCODING"] del token COMMENT = N_TOKENS tok_name[COMMENT] = 'COMMENT' NL = N_TOKENS + 1 tok_name[NL] = 'NL' -N_TOKENS += 2 +ENCODING = N_TOKENS + 2 +tok_name[ENCODING] = 'ENCODING' +N_TOKENS += 3 def group(*choices): return '(' + '|'.join(choices) + ')' def any(*choices): return group(*choices) + '*' @@ -132,40 +135,14 @@ class StopTokenizing(Exception): pass -def printtoken(type, token, startrowcol, endrowcol, line): # for testing - (srow, scol), (erow, ecol) = startrowcol, endrowcol - print("%d,%d-%d,%d:\t%s\t%s" % \ - (srow, scol, erow, ecol, tok_name[type], repr(token))) -def tokenize(readline, tokeneater=printtoken): - """ - The tokenize() function accepts two parameters: one representing the - input stream, and one providing an output mechanism for tokenize(). - - The first parameter, readline, must be a callable object which provides - the same interface as the readline() method of built-in file objects. - Each call to the function should return one line of input as a string. - - The second parameter, tokeneater, must also be a callable object. It is - called once for each token, with five arguments, corresponding to the - tuples generated by generate_tokens(). - """ - try: - tokenize_loop(readline, tokeneater) - except StopTokenizing: - pass - -# backwards compatible interface -def tokenize_loop(readline, tokeneater): - for token_info in generate_tokens(readline): - tokeneater(*token_info) - class Untokenizer: def __init__(self): self.tokens = [] self.prev_row = 1 self.prev_col = 0 + self.encoding = None def add_whitespace(self, start): row, col = start @@ -180,6 +157,9 @@ self.compat(t, iterable) break tok_type, token, start, end, line = t + if tok_type == ENCODING: + self.encoding = token + continue self.add_whitespace(start) self.tokens.append(token) self.prev_row, self.prev_col = end @@ -193,12 +173,16 @@ indents = [] toks_append = self.tokens.append toknum, tokval = token + if toknum in (NAME, NUMBER): tokval += ' ' if toknum in (NEWLINE, NL): startline = True for tok in iterable: toknum, tokval = tok[:2] + if toknum == ENCODING: + self.encoding = tokval + continue if toknum in (NAME, NUMBER): tokval += ' ' @@ -216,8 +200,11 @@ startline = False toks_append(tokval) + def untokenize(iterable): """Transform tokens back into Python source code. + It returns a bytes object, encoded using the ENCODING + token, which is the first token sequence output by tokenize. 
Each element returned by the iterable must be a token sequence with at least two elements, a token number and token value. If @@ -227,24 +214,89 @@ Untokenized source will match input source exactly Round-trip invariant for limited intput: - # Output text will tokenize the back to the input - t1 = [tok[:2] for tok in generate_tokens(f.readline)] + # Output bytes will tokenize back to the input + t1 = [tok[:2] for tok in tokenize(f.readline)] newcode = untokenize(t1) - readline = iter(newcode.splitlines(1)).__next__ - t2 = [tok[:2] for tokin generate_tokens(readline)] + readline = BytesIO(newcode).readline + t2 = [tok[:2] for tok in tokenize(readline)] assert t1 == t2 """ ut = Untokenizer() - return ut.untokenize(iterable) + out = ut.untokenize(iterable) + if ut.encoding is not None: + out = out.encode(ut.encoding) + return out -def generate_tokens(readline): + +def detect_encoding(readline): """ - The generate_tokens() generator requires one argment, readline, which + The detect_encoding() function is used to detect the encoding that should + be used to decode a Python source file. It requires one argument, readline, + in the same way as the tokenize() generator. + + It will call readline a maximum of twice, and return the encoding used + (as a string) and a list of any lines (left as bytes) it has read + in. + + It detects the encoding from the presence of a utf-8 bom or an encoding + cookie as specified in pep-0263. If both a bom and a cookie are present, + but disagree, a SyntaxError will be raised. + + If no encoding is specified, then the default of 'utf-8' will be returned. + """ + utf8_bom = b'\xef\xbb\xbf' + bom_found = False + encoding = None + def read_or_stop(): + try: + return readline() + except StopIteration: + return b'' + + def find_cookie(line): + try: + line_string = line.decode('ascii') + except UnicodeDecodeError: + pass + else: + matches = cookie_re.findall(line_string) + if matches: + encoding = matches[0] + if bom_found and lookup(encoding).name != 'utf-8': + # This behaviour mimics the Python interpreter + raise SyntaxError('encoding problem: utf-8') + return encoding + + first = read_or_stop() + if first.startswith(utf8_bom): + bom_found = True + first = first[3:] + if not first: + return 'utf-8', [] + + encoding = find_cookie(first) + if encoding: + return encoding, [first] + + second = read_or_stop() + if not second: + return 'utf-8', [first] + + encoding = find_cookie(second) + if encoding: + return encoding, [first, second] + + return 'utf-8', [first, second] + + +def tokenize(readline): + """ + The tokenize() generator requires one argument, readline, which must be a callable object which provides the same interface as the readline() method of built-in file objects. Each call to the function - should return one line of input as a string. Alternately, readline + should return one line of input as bytes. Alternately, readline can be a callable function terminating with StopIteration: - readline = open(myfile).__next__ # Example of alternate readline + readline = open(myfile, 'rb').__next__ # Example of alternate readline The generator produces 5-tuples with these members: the token type; the token string; a 2-tuple (srow, scol) of ints specifying the row and @@ -252,18 +304,38 @@ ints specifying the row and column where the token ends in the source; and the line on which the token was found. The line passed is the logical line; continuation lines are included. 
+ + The first token sequence will always be an ENCODING token + which tells you which encoding was used to decode the bytes stream. """ + encoding, consumed = detect_encoding(readline) + def readline_generator(): + while True: + try: + yield readline() + except StopIteration: + return + chained = chain(consumed, readline_generator()) + return _tokenize(chained.__next__, encoding) + + +def _tokenize(readline, encoding): lnum = parenlev = continued = 0 namechars, numchars = string.ascii_letters + '_', '0123456789' contstr, needcont = '', 0 contline = None indents = [0] - + + if encoding is not None: + yield (ENCODING, encoding, (0, 0), (0, 0), '') while 1: # loop over lines in stream try: line = readline() except StopIteration: - line = '' + line = b'' + + if encoding is not None: + line = line.decode(encoding) lnum = lnum + 1 pos, max = 0, len(line) @@ -385,7 +457,9 @@ yield (DEDENT, '', (lnum, 0), (lnum, 0), '') yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '') -if __name__ == '__main__': # testing - import sys - if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline) - else: tokenize(sys.stdin.readline) + +# An undocumented, backwards compatible, API for all the places in the standard +# library that expect to be able to use tokenize with strings +def generate_tokens(readline): + return _tokenize(readline, None) + \ No newline at end of file Index: Lib/inspect.py =================================================================== --- Lib/inspect.py (revision 61560) +++ Lib/inspect.py (working copy) @@ -657,7 +657,9 @@ """Extract the block of code at the top of the given list of lines.""" blockfinder = BlockFinder() try: - tokenize.tokenize(iter(lines).__next__, blockfinder.tokeneater) + tokens = tokenize.generate_tokens(iter(lines).__next__) + for _token in tokens: + blockfinder.tokeneater(*_token) except (EndOfBlock, IndentationError): pass return lines[:blockfinder.last] Index: Lib/test/test_tokenize.py =================================================================== --- Lib/test/test_tokenize.py (revision 61560) +++ Lib/test/test_tokenize.py (working copy) @@ -1,13 +1,14 @@ +# -*- coding: utf-8 -*- + doctests = """ Tests for the tokenize module. - >>> import glob, random, sys - The tests can be really simple. Given a small fragment of source code, print out a table with tokens. The ENDMARK is omitted for brevity. >>> dump_tokens("1 + 1") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '1' (1, 0) (1, 1) OP '+' (1, 2) (1, 3) NUMBER '1' (1, 4) (1, 5) @@ -15,6 +16,7 @@ >>> dump_tokens("if False:\\n" ... " # NL\\n" ... " True = False # NEWLINE\\n") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'if' (1, 0) (1, 2) NAME 'False' (1, 3) (1, 8) OP ':' (1, 8) (1, 9) @@ -34,27 +36,12 @@ ... x += 2 ... x += 5 ... \""" - >>> for tok in generate_tokens(StringIO(indent_error_file).readline): pass + >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline + >>> for tok in tokenize(readline): pass Traceback (most recent call last): ... IndentationError: unindent does not match any outer indentation level -Test roundtrip for `untokenize`. `f` is an open file or a string. The source -code in f is tokenized, converted back to source code via tokenize.untokenize(), -and tokenized again from the latter. The test fails if the second tokenization -doesn't match the first. - - >>> def roundtrip(f): - ... if isinstance(f, str): f = StringIO(f) - ... token_list = list(generate_tokens(f.readline)) - ... f.close() - ... tokens1 = [tok[:2] for tok in token_list] - ... new_text = untokenize(tokens1) - ... 
readline = iter(new_text.splitlines(1)).__next__ - ... tokens2 = [tok[:2] for tok in generate_tokens(readline)] - ... return tokens1 == tokens2 - ... - There are some standard formattig practises that are easy to get right. >>> roundtrip("if x == 1:\\n" @@ -67,14 +54,14 @@ Some people use different formatting conventions, which makes untokenize a little trickier. Note that this test involves trailing whitespace after the colon. Note that we use hex escapes to make the -two trailing blanks apperant in the expected output. +two trailing blanks apparent in the expected output. >>> roundtrip("if x == 1 : \\n" ... " print(x)\\n") True >>> f = test_support.findfile("tokenize_tests.txt") - >>> roundtrip(open(f)) + >>> roundtrip(open(f, 'rb')) True >>> roundtrip("if x == 1:\\n" @@ -122,27 +109,33 @@ Ordinary integers and binary operators >>> dump_tokens("0xff <= 255") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0xff' (1, 0) (1, 4) OP '<=' (1, 5) (1, 7) NUMBER '255' (1, 8) (1, 11) >>> dump_tokens("0b10 <= 255") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0b10' (1, 0) (1, 4) OP '<=' (1, 5) (1, 7) NUMBER '255' (1, 8) (1, 11) >>> dump_tokens("0o123 <= 0O123") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0o123' (1, 0) (1, 5) OP '<=' (1, 6) (1, 8) NUMBER '0O123' (1, 9) (1, 14) >>> dump_tokens("1234567 > ~0x15") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '1234567' (1, 0) (1, 7) OP '>' (1, 8) (1, 9) OP '~' (1, 10) (1, 11) NUMBER '0x15' (1, 11) (1, 15) >>> dump_tokens("2134568 != 1231515") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '2134568' (1, 0) (1, 7) OP '!=' (1, 8) (1, 10) NUMBER '1231515' (1, 11) (1, 18) >>> dump_tokens("(-124561-1) & 200000000") + ENCODING 'utf-8' (0, 0) (0, 0) OP '(' (1, 0) (1, 1) OP '-' (1, 1) (1, 2) NUMBER '124561' (1, 2) (1, 8) @@ -152,15 +145,18 @@ OP '&' (1, 12) (1, 13) NUMBER '200000000' (1, 14) (1, 23) >>> dump_tokens("0xdeadbeef != -1") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0xdeadbeef' (1, 0) (1, 10) OP '!=' (1, 11) (1, 13) OP '-' (1, 14) (1, 15) NUMBER '1' (1, 15) (1, 16) >>> dump_tokens("0xdeadc0de & 12345") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0xdeadc0de' (1, 0) (1, 10) OP '&' (1, 11) (1, 12) NUMBER '12345' (1, 13) (1, 18) >>> dump_tokens("0xFF & 0x15 | 1234") + ENCODING 'utf-8' (0, 0) (0, 0) NUMBER '0xFF' (1, 0) (1, 4) OP '&' (1, 5) (1, 6) NUMBER '0x15' (1, 7) (1, 11) @@ -170,18 +166,22 @@ Long integers >>> dump_tokens("x = 0") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '0' (1, 4) (1, 5) >>> dump_tokens("x = 0xfffffffffff") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '0xffffffffff (1, 4) (1, 17) >>> dump_tokens("x = 123141242151251616110") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '123141242151 (1, 4) (1, 25) >>> dump_tokens("x = -15921590215012591") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) OP '-' (1, 4) (1, 5) @@ -190,32 +190,39 @@ Floating point numbers >>> dump_tokens("x = 3.14159") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '3.14159' (1, 4) (1, 11) >>> dump_tokens("x = 314159.") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '314159.' 
(1, 4) (1, 11) >>> dump_tokens("x = .314159") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '.314159' (1, 4) (1, 11) >>> dump_tokens("x = 3e14159") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '3e14159' (1, 4) (1, 11) >>> dump_tokens("x = 3E123") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '3E123' (1, 4) (1, 9) >>> dump_tokens("x+y = 3e-1230") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '+' (1, 1) (1, 2) NAME 'y' (1, 2) (1, 3) OP '=' (1, 4) (1, 5) NUMBER '3e-1230' (1, 6) (1, 13) >>> dump_tokens("x = 3.14e159") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '3.14e159' (1, 4) (1, 12) @@ -223,6 +230,7 @@ String literals >>> dump_tokens("x = ''; y = \\\"\\\"") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) STRING "''" (1, 4) (1, 6) @@ -231,6 +239,7 @@ OP '=' (1, 10) (1, 11) STRING '""' (1, 12) (1, 14) >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) STRING '\\'"\\'' (1, 4) (1, 7) @@ -239,24 +248,28 @@ OP '=' (1, 11) (1, 12) STRING '"\\'"' (1, 13) (1, 16) >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) STRING '"doesn\\'t "' (1, 4) (1, 14) NAME 'shrink' (1, 14) (1, 20) STRING '", does it"' (1, 20) (1, 31) >>> dump_tokens("x = 'abc' + 'ABC'") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) STRING "'abc'" (1, 4) (1, 9) OP '+' (1, 10) (1, 11) STRING "'ABC'" (1, 12) (1, 17) >>> dump_tokens('y = "ABC" + "ABC"') + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'y' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) STRING '"ABC"' (1, 4) (1, 9) OP '+' (1, 10) (1, 11) STRING '"ABC"' (1, 12) (1, 17) >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) STRING "r'abc'" (1, 4) (1, 10) @@ -267,6 +280,7 @@ OP '+' (1, 29) (1, 30) STRING "R'ABC'" (1, 31) (1, 37) >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"') + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'y' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) STRING 'r"abc"' (1, 4) (1, 10) @@ -280,6 +294,7 @@ Operators >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'def' (1, 0) (1, 3) NAME 'd22' (1, 4) (1, 7) OP '(' (1, 7) (1, 8) @@ -301,6 +316,7 @@ OP ':' (1, 27) (1, 28) NAME 'pass' (1, 29) (1, 33) >>> dump_tokens("def d01v_(a=1, *k, **w): pass") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'def' (1, 0) (1, 3) NAME 'd01v_' (1, 4) (1, 9) OP '(' (1, 9) (1, 10) @@ -321,6 +337,7 @@ >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " + ... 
"1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'if' (1, 0) (1, 2) NUMBER '1' (1, 3) (1, 4) OP '<' (1, 5) (1, 6) @@ -357,6 +374,7 @@ Shift >>> dump_tokens("x = 1 << 1 >> 5") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '1' (1, 4) (1, 5) @@ -368,6 +386,7 @@ Additive >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '1' (1, 4) (1, 5) @@ -390,6 +409,7 @@ Multiplicative >>> dump_tokens("x = 1//1*1/5*12%0x12") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '1' (1, 4) (1, 5) @@ -407,6 +427,7 @@ Unary >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1") + ENCODING 'utf-8' (0, 0) (0, 0) OP '~' (1, 0) (1, 1) NUMBER '1' (1, 1) (1, 2) OP '^' (1, 3) (1, 4) @@ -419,6 +440,7 @@ OP '-' (1, 16) (1, 17) NUMBER '1' (1, 17) (1, 18) >>> dump_tokens("-1*1/1+1*1//1 - ---1**1") + ENCODING 'utf-8' (0, 0) (0, 0) OP '-' (1, 0) (1, 1) NUMBER '1' (1, 1) (1, 2) OP '*' (1, 2) (1, 3) @@ -442,6 +464,7 @@ Selector >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()") + ENCODING 'utf-8' (0, 0) (0, 0) NAME 'import' (1, 0) (1, 6) NAME 'sys' (1, 7) (1, 10) OP ',' (1, 10) (1, 11) @@ -463,6 +486,7 @@ Methods >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass") + ENCODING 'utf-8' (0, 0) (0, 0) OP '@' (1, 0) (1, 1) NAME 'staticmethod (1, 1) (1, 13) NEWLINE '\\n' (1, 13) (1, 14) @@ -485,42 +509,43 @@ True >>> roundtrip("# Comment \\\\nx = 0") True - - >>> - >>> tempdir = os.path.dirname(f) or os.curdir - >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py")) - >>> if not test_support.is_resource_enabled("compiler"): - ... testfiles = random.sample(testfiles, 10) - ... - >>> for testfile in testfiles: - ... if not roundtrip(open(testfile)): break - ... else: True - True """ - from test import test_support -from tokenize import (tokenize, untokenize, generate_tokens, NUMBER, NAME, OP, - STRING, ENDMARKER, tok_name) -from io import StringIO -import os +from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP, + STRING, ENDMARKER, tok_name, detect_encoding) +from io import BytesIO +from unittest import TestCase +import os, sys, glob def dump_tokens(s): """Print out the tokens in s in a table format. The ENDMARKER is omitted. """ - f = StringIO(s) - for type, token, start, end, line in generate_tokens(f.readline): + f = BytesIO(s.encode('utf-8')) + for type, token, start, end, line in tokenize(f.readline): if type == ENDMARKER: break type = tok_name[type] print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals()) -def roundtrip(s): - f = StringIO(s) - source = untokenize(generate_tokens(f.readline)) - print(source, end="") +def roundtrip(f): + """ + Test roundtrip for `untokenize`. `f` is an open file or a string. + The source code in f is tokenized, converted back to source code via + tokenize.untokenize(), and tokenized again from the latter. The test + fails if the second tokenization doesn't match the first. + """ + if isinstance(f, str): + f = BytesIO(f.encode('utf-8')) + token_list = list(tokenize(f.readline)) + f.close() + tokens1 = [tok[:2] for tok in token_list] + new_bytes = untokenize(tokens1) + readline = (line for line in new_bytes.splitlines(1)).__next__ + tokens2 = [tok[:2] for tok in tokenize(readline)] + return tokens1 == tokens2 # This is an example from the docs, set up as a doctest. 
def decistmt(s): @@ -545,9 +570,8 @@ >>> exec(decistmt(s)) -3.217160342717258261933904529E-7 """ - result = [] - g = generate_tokens(StringIO(s).readline) # tokenize the string + g = tokenize(BytesIO(s.encode('utf-8')).readline) # tokenize the string for toknum, tokval, _, _, _ in g: if toknum == NUMBER and '.' in tokval: # replace NUMBER tokens result.extend([ @@ -558,14 +582,249 @@ ]) else: result.append((toknum, tokval)) - return untokenize(result) + return untokenize(result).decode('utf-8') +class TestTokenizerAdheresToPep0263(TestCase): + """ + Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263. + """ + + def _testFile(self, filename): + path = os.path.join(os.path.dirname(__file__), filename) + return roundtrip(open(path, 'rb')) + + def test_utf8_coding_cookie_and_no_utf8_bom(self): + f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt' + self.assertTrue(self._testFile(f)) + + def test_latin1_coding_cookie_and_utf8_bom(self): + """ + As per PEP 0263, if a file starts with a utf-8 BOM signature, the only + allowed encoding for the comment is 'utf-8'. The text file used in + this test starts with a BOM signature, but specifies latin1 as the + coding, so verify that a SyntaxError is raised, which matches the + behaviour of the interpreter when it encounters a similar condition. + """ + f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt' + self.failUnlessRaises(SyntaxError, self._testFile, f) + + def test_no_coding_cookie_and_utf8_bom(self): + f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt' + self.assertTrue(self._testFile(f)) + + def test_utf8_coding_cookie_and_utf8_bom(self): + f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt' + self.assertTrue(self._testFile(f)) + + +class Test_Tokenize(TestCase): + + def test__tokenize_decodes_with_specified_encoding(self): + literal = '"ЉЊЈЁЂ"' + line = literal.encode('utf-8') + first = False + def readline(): + nonlocal first + if not first: + first = True + return line + else: + return b'' + + # skip the initial encoding token and the end token + tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1] + expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] + self.assertEquals(tokens, expected_tokens, + "bytes not decoded with encoding") + + def test__tokenize_does_not_decode_with_encoding_none(self): + literal = '"ЉЊЈЁЂ"' + first = False + def readline(): + nonlocal first + if not first: + first = True + return literal + else: + return b'' + + # skip the end token + tokens = list(_tokenize(readline, encoding=None))[:-1] + expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] + self.assertEquals(tokens, expected_tokens, + "string not tokenized when encoding is None") + + +class TestDetectEncoding(TestCase): + + def get_readline(self, lines): + index = 0 + def readline(): + nonlocal index + if index == len(lines): + raise StopIteration + line = lines[index] + index += 1 + return line + return readline + + def test_no_bom_no_encoding_cookie(self): + lines = ( + b'# something\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEquals(encoding, 'utf-8') + self.assertEquals(consumed_lines, list(lines[:2])) + + def test_bom_no_cookie(self): + lines = ( + b'\xef\xbb\xbf# something\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEquals(encoding, 'utf-8') + 
self.assertEquals(consumed_lines, + [b'# something\n', b'print(something)\n']) + + def test_cookie_first_line_no_bom(self): + lines = ( + b'# -*- coding: latin-1 -*-\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEquals(encoding, 'latin-1') + self.assertEquals(consumed_lines, [b'# -*- coding: latin-1 -*-\n']) + + def test_matched_bom_and_cookie_first_line(self): + lines = ( + b'\xef\xbb\xbf# coding=utf-8\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEquals(encoding, 'utf-8') + self.assertEquals(consumed_lines, [b'# coding=utf-8\n']) + + def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self): + lines = ( + b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n', + b'print(something)\n', + b'do_something(else)\n' + ) + readline = self.get_readline(lines) + self.assertRaises(SyntaxError, detect_encoding, readline) + + def test_cookie_second_line_no_bom(self): + lines = ( + b'#! something\n', + b'# vim: set fileencoding=ascii :\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEquals(encoding, 'ascii') + expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n'] + self.assertEquals(consumed_lines, expected) + + def test_matched_bom_and_cookie_second_line(self): + lines = ( + b'\xef\xbb\xbf#! something\n', + b'f# coding=utf-8\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEquals(encoding, 'utf-8') + self.assertEquals(consumed_lines, + [b'#! something\n', b'f# coding=utf-8\n']) + + def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self): + lines = ( + b'\xef\xbb\xbf#! 
something\n', + b'# vim: set fileencoding=ascii :\n', + b'print(something)\n', + b'do_something(else)\n' + ) + readline = self.get_readline(lines) + self.assertRaises(SyntaxError, detect_encoding, readline) + + def test_short_files(self): + readline = self.get_readline((b'print(something)\n',)) + encoding, consumed_lines = detect_encoding(readline) + self.assertEquals(encoding, 'utf-8') + self.assertEquals(consumed_lines, [b'print(something)\n']) + + encoding, consumed_lines = detect_encoding(self.get_readline(())) + self.assertEquals(encoding, 'utf-8') + self.assertEquals(consumed_lines, []) + + readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',)) + encoding, consumed_lines = detect_encoding(readline) + self.assertEquals(encoding, 'utf-8') + self.assertEquals(consumed_lines, [b'print(something)\n']) + + readline = self.get_readline((b'\xef\xbb\xbf',)) + encoding, consumed_lines = detect_encoding(readline) + self.assertEquals(encoding, 'utf-8') + self.assertEquals(consumed_lines, []) + + +class TestTokenize(TestCase): + + def test_tokenize(self): + import tokenize as tokenize_module + encoding = object() + encoding_used = None + def mock_detect_encoding(readline): + return encoding, ['first', 'second'] + + def mock__tokenize(readline, encoding): + nonlocal encoding_used + encoding_used = encoding + out = [] + while True: + next_line = readline() + if next_line: + out.append(next_line) + continue + return out + + counter = 0 + def mock_readline(): + nonlocal counter + counter += 1 + if counter == 5: + return b'' + return counter + + orig_detect_encoding = tokenize_module.detect_encoding + orig__tokenize = tokenize_module._tokenize + tokenize_module.detect_encoding = mock_detect_encoding + tokenize_module._tokenize = mock__tokenize + try: + results = tokenize(mock_readline) + self.assertEquals(list(results), ['first', 'second', 1, 2, 3, 4]) + finally: + tokenize_module.detect_encoding = orig_detect_encoding + tokenize_module._tokenize = orig__tokenize + + self.assertTrue(encoding_used, encoding) + + __test__ = {"doctests" : doctests, 'decistmt': decistmt} def test_main(): from test import test_tokenize test_support.run_doctest(test_tokenize, True) + test_support.run_unittest(TestTokenizerAdheresToPep0263) + test_support.run_unittest(Test_Tokenize) + test_support.run_unittest(TestDetectEncoding) + test_support.run_unittest(TestTokenize) if __name__ == "__main__": test_main()
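
For reference, a minimal sketch of how the pieces added by this patch fit together: detect_encoding() reports the encoding and the raw lines it consumed, tokenize() emits an ENCODING token first, and untokenize() uses that token to encode its result back to bytes. The sample source below is made up purely for illustration, and the exact spacing of the untokenize() output may differ from the input:

    from io import BytesIO
    from tokenize import tokenize, untokenize, detect_encoding, ENCODING

    # Source as bytes, with an explicit coding cookie on the first line.
    source = b"# -*- coding: latin-1 -*-\nx = 3.14159\n"

    # detect_encoding() reads at most two lines and returns the encoding
    # plus the (still undecoded) lines it consumed.
    encoding, consumed = detect_encoding(BytesIO(source).readline)
    assert encoding == 'latin-1'
    assert consumed == [b"# -*- coding: latin-1 -*-\n"]

    # tokenize() takes a readline that yields bytes; its first token is
    # always the ENCODING token carrying the detected encoding.
    tokens = list(tokenize(BytesIO(source).readline))
    assert tokens[0][:2] == (ENCODING, 'latin-1')

    # untokenize() picks the encoding up from that token, so full token
    # tuples round-trip back to bytes rather than str.
    assert isinstance(untokenize(tokens), bytes)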