Index: cgi.py
===================================================================
--- cgi.py	(revision 87986)
+++ cgi.py	(working copy)
@@ -31,13 +31,14 @@
 # Imports
 # =======
 
-from io import StringIO
+from io import StringIO, BytesIO, TextIOBase, TextIOWrapper
 import sys
 import os
 import urllib.parse
-import email.parser
+import email.parser,email.feedparser
 from warnings import warn
 import html
+import locale
 
 __all__ = ["MiniFieldStorage", "FieldStorage",
            "parse", "parse_qs", "parse_qsl", "parse_multipart",
@@ -109,7 +110,7 @@
 
         Arguments, all optional:
 
-        fp              : file pointer; default: sys.stdin
+        fp              : file pointer; default: sys.stdin.buffer
 
         environ         : environment dictionary; default: os.environ
 
@@ -126,6 +127,28 @@
     """
     if fp is None:
         fp = sys.stdin
+    if fp is sys.stdin:
+        # Windows needs stdio set for binary mode. This will be useless
+        # for Python 3.2+ : binary mode will be the default
+        try: 
+            import msvcrt
+            msvcrt.setmode (0, os.O_BINARY) # stdin  = 0
+            msvcrt.setmode (1, os.O_BINARY) # stdout = 1
+            msvcrt.setmode (2, os.O_BINARY) # stderr = 2
+        except ImportError:
+            pass
+
+    # field keys and values (except for files) are returned as strings;
+    # an encoding is required to decode the bytes read from fp
+    if hasattr(fp,'encoding'):
+        encoding = fp.encoding
+    else:
+        encoding = 'latin-1' # ?
+
+    # fp.read() must return bytes
+    if isinstance(fp,TextIOBase):
+        fp = fp.buffer
+    
     if not 'REQUEST_METHOD' in environ:
         environ['REQUEST_METHOD'] = 'GET'       # For testing stand-alone
     if environ['REQUEST_METHOD'] == 'POST':
@@ -136,7 +159,7 @@
             clength = int(environ['CONTENT_LENGTH'])
             if maxlen and clength > maxlen:
                 raise ValueError('Maximum content length exceeded')
-            qs = fp.read(clength)
+            qs = fp.read(clength).decode(encoding) # str
         else:
             qs = ''                     # Unknown content-type
         if 'QUERY_STRING' in environ:
@@ -305,6 +328,36 @@
     return key, pdict
 
 
+class IOMix():
+
+    def __init__( self, fh, encoding="UTF-8"):
+        if hasattr( fh, 'buffer'):
+            self._bio = fh.buffer
+            fh.flush()
+            self._last = 'b'
+            self._txt = TextIOWrapper( self._bio, encoding, None, '\r\n')
+            self._encoding = encoding
+        else:
+            raise ValueError("not a buffered stream")
+
+    def write( self, param ):
+        if isinstance( param, str ):
+            self._last = 't'
+            self._txt.write( param )
+        else:
+            if self._last == 't':
+                self._txt.flush()
+            self._last = 'b'
+            self._bio.write( param )
+
+    def flush( self ):
+        self._txt.flush()
+
+    def close( self ):
+        self.flush()
+        self._txt.close()
+        self._bio.close()
+
 # Classes for field storage
 # =========================
 
@@ -352,9 +405,10 @@
 
     value: the value as a *string*; for file uploads, this
         transparently reads the file every time you request the value
+        and returns *bytes*
 
-    file: the file(-like) object from which you can read the data;
-        None if the data is stored a simple string
+    file: the file(-like) object from which you can read the data *as
+        bytes*; None if the data is stored as a simple string
 
     type: the content-type, or None if not specified
 
@@ -375,15 +429,22 @@
     directory and unlinking them as soon as they have been opened.
 
     """
-
     def __init__(self, fp=None, headers=None, outerboundary="",
-                 environ=os.environ, keep_blank_values=0, strict_parsing=0):
+                 environ=os.environ, keep_blank_values=0, strict_parsing=0,
+                 limit=None, stream_encoding = 'utf-8',
+                 charset = None):
         """Constructor.  Read multipart/* until last part.
 
         Arguments, all optional:
 
         fp              : file pointer; default: sys.stdin
             (not used when the request method is GET)
+            Can be :
+            1. an instance of (a subclass of) TextIOBase; in this case it
+            must provide an attribute "buffer" = the binary layer that returns
+            bytes from its read() method, and preferably an attribute
+            "encoding" (defaults to latin-1)
+            2. an object whose read() and readline() methods return bytes
 
         headers         : header dictionary-like object; default:
             taken from environ as per CGI spec
@@ -404,6 +465,18 @@
             If false (the default), errors are silently ignored.
             If true, errors raise a ValueError exception.
 
+        limit : used internally to read parts of multipart/form-data forms, 
+            to exit from the reading loop when reached. It is the difference 
+            between the form content-length and the number of bytes already
+            read
+        
+        stream_encoding : the encoding used to decode the binary stream to
+            strings. Must be the same as the charset defined for the page
+            sending the form (content-type : meta http-equiv or header)
+        
+        charset : the encoding used to print the values in the CGI script.
+            If not None, sys.stdout and sys.stderr are replaced by IOMix
+            wrappers using it; must match this script's content-type header
         """
         method = 'GET'
         self.keep_blank_values = keep_blank_values
@@ -418,7 +491,7 @@
                 qs = sys.argv[1]
             else:
                 qs = ""
-            fp = StringIO(qs)
+            fp = BytesIO(qs.encode(locale.getpreferredencoding(), 'surrogateescape')) # bytes
             if headers is None:
                 headers = {'content-type':
                            "application/x-www-form-urlencoded"}
@@ -433,9 +506,35 @@
                 self.qs_on_post = environ['QUERY_STRING']
             if 'CONTENT_LENGTH' in environ:
                 headers['content-length'] = environ['CONTENT_LENGTH']
-        self.fp = fp or sys.stdin
+        if fp is None:
+            fp = sys.stdin
+        if fp is sys.stdin:
+            # Windows needs stdio set for binary mode. This will be useless
+            # for Python 3.2+ : binary mode will be the default
+            try: 
+                import msvcrt
+                msvcrt.setmode (0, os.O_BINARY) # stdin  = 0
+                msvcrt.setmode (1, os.O_BINARY) # stdout = 1
+                msvcrt.setmode (2, os.O_BINARY) # stderr = 2
+            except ImportError:
+                pass
+        # self.fp.read() must return bytes
+        if isinstance(fp,TextIOBase):
+            self.fp = fp.buffer
+        else:
+            self.fp = fp
+
+        self.stream_encoding = stream_encoding
+
+        if charset is not None:
+            sys.stdout = IOMix( sys.stdout, charset )
+            sys.stderr = IOMix( sys.stderr, charset )
+
         self.headers = headers
         self.outerboundary = outerboundary
+        
+        self.bytes_read = 0
+        self.limit = limit
 
         # Process content-disposition header
         cdisp, pdict = "", {}
@@ -482,6 +581,8 @@
             if maxlen and clen > maxlen:
                 raise ValueError('Maximum content length exceeded')
         self.length = clen
+        if self.limit is None and clen:
+            self.limit = clen
 
         self.list = self.file = None
         self.done = 0
@@ -531,7 +632,7 @@
         """Dictionary style get() method, including 'value' lookup."""
         if key in self:
             value = self[key]
-            if type(value) is type([]):
+            if isinstance(value,list):
                 return [x.value for x in value]
             else:
                 return value.value
@@ -542,7 +643,7 @@
         """ Return the first value received."""
         if key in self:
             value = self[key]
-            if type(value) is type([]):
+            if isinstance(value,list):
                 return value[0].value
             else:
                 return value.value
@@ -553,7 +654,7 @@
         """ Return list of received values."""
         if key in self:
             value = self[key]
-            if type(value) is type([]):
+            if isinstance(value,list):
                 return [x.value for x in value]
             else:
                 return [value.value]
@@ -581,13 +682,13 @@
 
     def read_urlencoded(self):
         """Internal: read data in query string format."""
-        qs = self.fp.read(self.length)
+        qs = self.fp.read(self.length).decode(self.stream_encoding) # str
         if self.qs_on_post:
             qs += '&' + self.qs_on_post
-        self.list = list = []
+        self.list = []
         for key, value in urllib.parse.parse_qsl(qs, self.keep_blank_values,
                                 self.strict_parsing):
-            list.append(MiniFieldStorage(key, value))
+            self.list.append(MiniFieldStorage(key, value))
         self.skip_lines()
 
     FieldStorageClass = None
@@ -600,23 +701,36 @@
         self.list = []
         if self.qs_on_post:
             for key, value in urllib.parse.parse_qsl(self.qs_on_post,
-                                    self.keep_blank_values, self.strict_parsing):
+                                self.keep_blank_values, self.strict_parsing):
                 self.list.append(MiniFieldStorage(key, value))
             FieldStorageClass = None
 
         klass = self.FieldStorageClass or self.__class__
-        parser = email.parser.FeedParser()
-        # Create bogus content-type header for proper multipart parsing
-        parser.feed('Content-Type: %s; boundary=%s\r\n\r\n' % (self.type, ib))
-        parser.feed(self.fp.read())
-        full_msg = parser.close()
-        # Get subparts
-        msgs = full_msg.get_payload()
-        for msg in msgs:
-            fp = StringIO(msg.get_payload())
-            part = klass(fp, msg, ib, environ, keep_blank_values,
-                         strict_parsing)
+        first_line = self.fp.readline() # bytes
+        self.bytes_read += len(first_line)
+        # first line holds boundary ; ignore it, or check that
+        # "--"+ib == first_line.decode('ascii').strip() ?
+        while True:
+            parser = email.parser.FeedParser()
+            hdr_text = b""
+            while True:
+                data = self.fp.readline()
+                hdr_text += data
+                if not data.strip():
+                    break
+            if not hdr_text:
+                break
+            # parser takes strings, not bytes
+            self.bytes_read += len(hdr_text)
+            parser.feed(hdr_text.decode(self.stream_encoding))
+            headers = parser.close()
+            part = klass(self.fp, headers, ib, environ, keep_blank_values,
+                         strict_parsing,self.limit-self.bytes_read,
+                         self.stream_encoding)
+            self.bytes_read += part.bytes_read
             self.list.append(part)
+            if self.bytes_read >= self.length:
+                break
         self.skip_lines()
 
     def read_single(self):
@@ -636,7 +750,8 @@
         todo = self.length
         if todo >= 0:
             while todo > 0:
-                data = self.fp.read(min(todo, self.bufsize))
+                data = self.fp.read(min(todo, self.bufsize)) # bytes
+                self.bytes_read += len(data)
                 if not data:
                     self.done = -1
                     break
@@ -645,42 +760,57 @@
 
     def read_lines(self):
         """Internal: read lines until EOF or outerboundary."""
-        self.file = self.__file = StringIO()
+        if self.filename is not None:
+            self.file = self.__file = BytesIO() # store data as bytes for files
+        else:
+            self.file = self.__file = StringIO() # as strings for other fields
         if self.outerboundary:
             self.read_lines_to_outerboundary()
         else:
             self.read_lines_to_eof()
 
     def __write(self, line):
+        """line is always bytes, not string"""
         if self.__file is not None:
             if self.__file.tell() + len(line) > 1000:
                 self.file = self.make_file()
                 data = self.__file.getvalue()
                 self.file.write(data)
                 self.__file = None
-        self.file.write(line)
-
+        if self.filename is not None:
+            self.file.write(line) # keep bytes
+        else:
+            self.file.write(line.decode(self.stream_encoding)) # decode to string
+       
     def read_lines_to_eof(self):
         """Internal: read lines until EOF."""
-        while 1:
-            line = self.fp.readline(1<<16)
+        while True:
+            line = self.fp.readline(1<<16) # bytes
+            self.bytes_read += len(line)
             if not line:
                 self.done = -1
                 break
             self.__write(line)
 
     def read_lines_to_outerboundary(self):
-        """Internal: read lines until outerboundary."""
-        next = "--" + self.outerboundary
-        last = next + "--"
-        delim = ""
+        """Internal: read lines until outerboundary.
+        Data is read as bytes : boundaries and line ends must be converted
+        to bytes for comparisons"""
+        next = b"--" + self.outerboundary.encode(self.stream_encoding)
+        last = next + b"--"
+        delim = b""
         last_line_lfend = True
-        while 1:
-            line = self.fp.readline(1<<16)
+        _read = 0
+        while True:
+            if _read >= self.limit:
+                break
+            line = self.fp.readline(1<<16) # bytes
+            self.bytes_read += len(line)
+            _read += len(line)
             if not line:
                 self.done = -1
                 break
-            if line[:2] == "--" and last_line_lfend:
+            if line[:2] == b"--" and last_line_lfend:
                 strippedline = line.strip()
                 if strippedline == next:
                     break
@@ -688,16 +818,16 @@
                     self.done = 1
                     break
             odelim = delim
-            if line[-2:] == "\r\n":
-                delim = "\r\n"
+            if line.endswith(b"\r\n"):
+                delim = b"\r\n"
                 line = line[:-2]
                 last_line_lfend = True
-            elif line[-1] == "\n":
-                delim = "\n"
+            elif line.endswith(b"\n"):
+                delim = b"\n"
                 line = line[:-1]
                 last_line_lfend = True
             else:
-                delim = ""
+                delim = b""
                 last_line_lfend = False
             self.__write(odelim + line)
 
@@ -705,22 +835,23 @@
         """Internal: skip lines until outer boundary if defined."""
         if not self.outerboundary or self.done:
             return
-        next = "--" + self.outerboundary
-        last = next + "--"
+        next = b"--" + self.outerboundary.encode(self.stream_encoding)
+        last = next + b"--"
         last_line_lfend = True
-        while 1:
+        while True:
             line = self.fp.readline(1<<16)
+            self.bytes_read += len(line)
             if not line:
                 self.done = -1
                 break
-            if line[:2] == "--" and last_line_lfend:
+            if line.startswith(b"--") and last_line_lfend:
                 strippedline = line.strip()
                 if strippedline == next:
                     break
                 if strippedline == last:
                     self.done = 1
                     break
-            last_line_lfend = line.endswith('\n')
+            last_line_lfend = line.endswith(b'\n')
 
     def make_file(self):
         """Overridable: return a readable & writable file.
@@ -730,7 +861,8 @@
         - seek(0)
         - data is read from it
 
-        The file is always opened in text mode.
+        The file is opened in binary mode for files, in text mode
+        for other fields
 
         This version opens a temporary file for reading and writing,
         and immediately deletes (unlinks) it.  The trick (on Unix!) is
@@ -746,7 +878,12 @@
 
         """
         import tempfile
-        return tempfile.TemporaryFile("w+", encoding="utf-8", newline="\n")
+        if self.filename is not None:
+            return tempfile.TemporaryFile("wb+")
+        else:
+            return tempfile.TemporaryFile("w+",
+                encoding=self.stream_encoding,
+                newline = '\n')
 
 
 # Test/debug code
Index: test/test_cgi.py
===================================================================
--- test/test_cgi.py	(revision 87986)
+++ test/test_cgi.py	(working copy)
@@ -4,7 +4,7 @@
 import sys
 import tempfile
 import unittest
-from io import StringIO
+from io import StringIO, BytesIO
 
 class HackedSysModule:
     # The regression test will have real values in sys.argv, which
@@ -14,7 +14,6 @@
 
 cgi.sys = HackedSysModule()
 
-
 class ComparableException:
     def __init__(self, err):
         self.err = err
@@ -38,7 +37,7 @@
         env['REQUEST_METHOD'] = 'GET'
         env['QUERY_STRING'] = buf
     elif method == "POST":
-        fp = StringIO(buf)
+        fp = BytesIO(buf.encode('latin-1')) # FieldStorage expects bytes
         env['REQUEST_METHOD'] = 'POST'
         env['CONTENT_TYPE'] = 'application/x-www-form-urlencoded'
         env['CONTENT_LENGTH'] = str(len(buf))
@@ -106,9 +105,10 @@
     return [(p[0], p[1][0]) for p in list]
 
 def gen_result(data, environ):
-    fake_stdin = StringIO(data)
+    fake_stdin = BytesIO(data.encode('latin-1')) # FieldStorage expects bytes
     fake_stdin.seek(0)
-    form = cgi.FieldStorage(fp=fake_stdin, environ=environ)
+    form = cgi.FieldStorage(fp=fake_stdin, environ=environ,
+        stream_encoding="latin-1")
 
     result = {}
     for k, v in dict(form).items():
@@ -122,9 +122,9 @@
         for orig, expect in parse_strict_test_cases:
             # Test basic parsing
             d = do_test(orig, "GET")
-            self.assertEqual(d, expect, "Error parsing %s" % repr(orig))
+            self.assertEqual(d, expect, "Error parsing %s method GET" % repr(orig))
             d = do_test(orig, "POST")
-            self.assertEqual(d, expect, "Error parsing %s" % repr(orig))
+            self.assertEqual(d, expect, "Error parsing %s method POST" % repr(orig))
 
             env = {'QUERY_STRING': orig}
             fs = cgi.FieldStorage(environ=env)
@@ -181,9 +181,9 @@
                     setattr(self, name, a)
                 return a
 
-        f = TestReadlineFile(tempfile.TemporaryFile("w+"))
+        f = TestReadlineFile(tempfile.TemporaryFile("wb+"))
         self.addCleanup(f.close)
-        f.write('x' * 256 * 1024)
+        f.write(b'x' * 256 * 1024)
         f.seek(0)
         env = {'REQUEST_METHOD':'PUT'}
         fs = cgi.FieldStorage(fp=f, environ=env)
@@ -216,11 +216,12 @@
  Add\x20
 -----------------------------721837373350705526688164684--
 """
-        fs = cgi.FieldStorage(fp=StringIO(postdata), environ=env)
+        fp = BytesIO(postdata.encode('latin-1'))
+        fs = cgi.FieldStorage(fp, environ=env)
         self.assertEqual(len(fs.list), 4)
         expect = [{'name':'id', 'filename':None, 'value':'1234'},
                   {'name':'title', 'filename':None, 'value':''},
-                  {'name':'file', 'filename':'test.txt', 'value':'Testing 123.'},
+                  {'name':'file', 'filename':'test.txt', 'value':b'Testing 123.\n'},
                   {'name':'submit', 'filename':None, 'value':' Add '}]
         for x in range(len(fs.list)):
             for k, exp in expect[x].items():
@@ -245,8 +246,7 @@
         self.assertEqual(self._qs_result, v)
 
     def testQSAndFormData(self):
-        data = """
----123
+        data = """---123
 Content-Disposition: form-data; name="key2"
 
 value2y
@@ -270,8 +270,7 @@
         self.assertEqual(self._qs_result, v)
 
     def testQSAndFormDataFile(self):
-        data = """
----123
+        data = """---123
 Content-Disposition: form-data; name="key2"
 
 value2y
@@ -299,7 +298,7 @@
         }
         result = self._qs_result.copy()
         result.update({
-            'upload': 'this is the content of the fake file'
+            'upload': b'this is the content of the fake file\n' # bytes
         })
         v = gen_result(data, environ)
         self.assertEqual(result, v)