diff -r ec3077e23b7d Lib/cgi.py
--- a/Lib/cgi.py	Sun Oct 05 21:20:51 2014 -0400
+++ b/Lib/cgi.py	Tue Oct 14 17:06:00 2014 -0700
@@ -31,7 +31,7 @@
 # Imports
 # =======
 
-from io import StringIO, BytesIO, TextIOWrapper
+from io import StringIO, BytesIO, TextIOWrapper, BufferedReader
 from collections import Mapping
 import sys
 import os
@@ -42,6 +42,8 @@
 import html
 import locale
 import tempfile
+import enum
+import re
 
 __all__ = ["MiniFieldStorage", "FieldStorage",
            "parse", "parse_qs", "parse_qsl", "parse_multipart",
@@ -359,6 +361,211 @@
         return "MiniFieldStorage(%r, %r)" % (self.name, self.value)
 
 
+class _CgiFp:
+    """ Support class for FieldStorage. This class should not be used directly.
+    fp -> instance of BufferedReader
+    boundary -> multipart boundary *including* the leading '--' delimiter
+
+    All methods advance the file-pointer only if they have determined that
+    the current payload in the buffer (of BufferedReader) does not run over
+    the next boundary in a multipart.
+    There are cases, where multi_read seeks the file pointer beyond the data
+    it has just returned. It does this knowing that the seek never crosses
+    the next boundary and it maintains the difference in an overlapped buffer.
+    This overlapped buffer is part of the next payload. This is done to
+    resolve cases where a boundary may be between two buffers.
+    """
+
+    def __init__(self, fp, boundary):
+        """
+        self._buffer = self._overlapped + self._peek
+        self._peek = peeked data
+        self._eof = Indicates a read has returned an EOF
+        """
+        self._fp = fp
+        self._overlapped = b''
+        self._boundary = boundary
+        self._eof = False
+        # create self._buffer and self._peek
+        self._remove_x_keep_buffer(0)
+
+        self._max_search = len(b'\r\n' + boundary + b'--\r\n')
+        self._initial = True
+
+        # RFC 2046 sec. 5.1.1: boundary <= 70 chars, plus the leading '--'
+        if len(boundary) > 72:
+            raise ValueError("multipart boundary too large")
+
+        self._initial = True
+
+        # Any buffer before EOF: boundary is preceded by LF and ends a line
+        pattern_noeof = re.escape(b'\n'+boundary+b'\r\n') \
+            + b'|' + re.escape(b'\n'+boundary+b'\n') \
+            + b'|' + re.escape(b'\n'+boundary+b'--\n') \
+            + b'|' + re.escape(b'\n'+boundary+b'--\r\n')
+        self._re_noeof = re.compile(pattern_noeof)
+
+        # Last buffer (EOF seen): boundary may also end the data
+        pattern_eof = pattern_noeof \
+            + b'|' + re.escape(b'\n'+boundary) + b'$' \
+            + b'|' + re.escape(b'\n'+boundary) + b'$' \
+            + b'|' + re.escape(b'\n'+boundary+b'--') + b'$' \
+            + b'|' + re.escape(b'\n'+boundary+b'--') + b'$'
+        self._re_eof = re.compile(pattern_eof)
+
+        # First buffer: boundary may also start the data (no preceding LF)
+        pattern_first_noeof = pattern_noeof \
+            + b'|^' + re.escape(boundary+b'\r\n') \
+            + b'|^' + re.escape(boundary+b'\n') \
+            + b'|^' + re.escape(boundary+b'--\n') \
+            + b'|^' + re.escape(boundary+b'--\r\n')
+        self._re_first_noeof = re.compile(pattern_first_noeof)
+
+    def _keep_x_forward_buffer(self, x):
+        """ Keep the last x bytes from _buffer and drain the read data.
+        This function is used when we have determined that self._buffer
+        does not have the next_boundary.
+        """
+        lenbuffer = len(self._buffer)
+        if x > lenbuffer:
+            x = lenbuffer
+        start = lenbuffer - x
+        self._overlapped = self._buffer[start:lenbuffer]
+        if not self._eof:
+            drain = self._fp.read(len(self._peek))
+            if not drain:
+                self._eof = True
+                self._peek = b''
+            else:
+                self._peek = self._fp.peek()
+        self._buffer = self._overlapped + self._peek
+
+
+    def _remove_x_keep_buffer(self, x):
+        """ Remove the first x bytes of the buffer. If x is smaller than
+        len(self._overlapped), the read pointer is not forwarded.
+        This is also called by the constructor
+        """
+        len_of_overlapped = len(self._overlapped)
+        if x < len_of_overlapped:
+            self._overlapped = self._overlapped[x:]
+            self._buffer = self._overlapped + self._peek
+        else:
+            self._overlapped = b''
+            data_to_drain = x - len_of_overlapped
+            if not self._eof and data_to_drain:
+                drain = self._fp.read(data_to_drain)
+                if not drain:
+                    self._eof = True
+            self._peek = self._fp.peek()
+            self._buffer = self._overlapped + self._peek
+
+
+    class MultiType(enum.Enum):
+        """ Enumeration for defining the type of data returned by multi_read.
+        next_boundary = a boundary is found
+        last_boundary = last boundary is found
+        """
+
+        next_boundary = 1
+        last_boundary = 2
+
+
+    def _check_boundary_head(self, hindex):
+        ''' Boundary has been located. If boundary is of pattern,\nboundary,
+        claim the leading \r if associated with \n as part of the boundary.
+        hindex = index of start of boundary in buffer. This could be either
+                 boundary when found just after initialization or could be
+                 \nboundary.
+        returns index of the boundary including the leading CR.
+        '''
+        if hindex >= 1 and self._buffer[hindex-1 : hindex+1] == b'\r\n':
+            return hindex - 1
+        return hindex
+
+    def _check_boundary_trail(self, tindex):
+        ''' tindex = last element of the boundary.
+
+        returns the type of boundary.
+
+        A boundary could end with CRLF or LF or in case of EOF just the boundary.
+        Test if the boundary ends with --[CRLF|LF] to indicate last boundary.
+        '''
+        if self._buffer[tindex] == ord('\n'):
+            if tindex > 0:
+                if self._buffer[tindex - 1] == ord('\r'):
+                    if (tindex > 2 and
+                        self._buffer[tindex - 2] == ord(b'-') and
+                        self._buffer[tindex - 3] == ord(b'-')):
+                        #Ends as Boundary--CRLF
+                        return self.MultiType.last_boundary
+                    #Ends as BoundaryCRLF
+                    return self.MultiType.next_boundary
+                if (tindex > 1 and
+                    self._buffer[tindex - 1] == ord(b'-') and
+                    self._buffer[tindex - 2] == ord(b'-')):
+                    #Ends as Boundary--LF
+                    return self.MultiType.last_boundary
+            #Ends as BoundaryLF
+            return self.MultiType.next_boundary
+        if (tindex > 0 and
+            self._buffer[tindex] == ord(b'-') and
+            self._buffer[tindex - 1] == ord(b'-')):
+            #Ends as Boundary--
+            return self.MultiType.last_boundary
+        #Ends as Boundary
+        return self.MultiType.next_boundary
+
+    def multi_read(self):
+        ''' returns chunk, (boundary_type, boundary)
+        boundary or chunk can be empty. When both are empty EOF
+        is reached.
+        type = next_boundary indicates the buffer is next boundary.
+        type = last_boundary indicates the buffer is last boundary.
+        '''
+
+        rsearch = None
+        if self._initial:
+            self._initial = False
+            rsearch = self._re_first_noeof.search(self._buffer)
+        else:
+            if self._eof:
+                rsearch = self._re_eof.search(self._buffer)
+            else:
+                rsearch = self._re_noeof.search(self._buffer)
+
+        if rsearch:
+            # boundary is found
+            start_index = self._check_boundary_head(rsearch.start())
+            mtype = self._check_boundary_trail(rsearch.end()-1)
+            rdata = ( \
+                self._buffer[:start_index],\
+                (mtype, self._buffer[start_index:rsearch.end()]))
+            self._remove_x_keep_buffer(rsearch.end())
+            return rdata
+        else:
+            if self._eof:
+                rdata = self._buffer, (None, b'')
+                #clear the buffer
+                self._keep_x_forward_buffer(0)
+                return rdata
+            else:
+                chunk = self._buffer[:-1 * self._max_search]
+                self._keep_x_forward_buffer(self._max_search)
+                if chunk:
+                    return chunk, (None, b'')
+                else:
+                    # we have to recurse.
+                    # we recurse because theoretically peek() does not make
+                    # a promise on the length of the buffer except it cannot
+                    # be zero. This recursion will happen in theory if
+                    # after we read the entire data using read,a peek returns
+                    # a buffer of length less than max_search_len.
+                    # The maximum depth of recursion is max_search_len (when
+                    # peek keeps returning us a buffer of length 1)
+                    # max_search_len < 78
+                    return self.multi_read()
+
 class FieldStorage:
 
     """Store a sequence of fields, reading multipart/form-data.
@@ -789,47 +996,67 @@
         """
         next_boundary = b"--" + self.outerboundary
         last_boundary = next_boundary + b"--"
-        delim = b""
-        last_line_lfend = True
         _read = 0
-        while 1:
-            if _read >= self.limit:
-                break
-            line = self.fp.readline(1<<16) # bytes
-            self.bytes_read += len(line)
-            _read += len(line)
-            if not line:
-                self.done = -1
-                break
-            if delim == b"\r":
-                line = delim + line
-                delim = b""
-            if line.startswith(b"--") and last_line_lfend:
-                strippedline = line.rstrip()
-                if strippedline == next_boundary:
+        if isinstance(self.fp, BufferedReader):
+            cgi_fp = _CgiFp(self.fp, next_boundary)
+            while 1:
+                if _read >= self.limit:
                     break
-                if strippedline == last_boundary:
-                    self.done = 1
+                line, (btype, boundary) = cgi_fp.multi_read()
+                self.bytes_read += len(line)
+                self.bytes_read += len(boundary)
+                _read += len(line)
+                _read += len(boundary)
+                if not line and not boundary:
+                    self.done = -1
                     break
-            odelim = delim
-            if line.endswith(b"\r\n"):
-                delim = b"\r\n"
-                line = line[:-2]
-                last_line_lfend = True
-            elif line.endswith(b"\n"):
-                delim = b"\n"
-                line = line[:-1]
-                last_line_lfend = True
-            elif line.endswith(b"\r"):
-                # We may interrupt \r\n sequences if they span the 2**16
-                # byte boundary
-                delim = b"\r"
-                line = line[:-1]
-                last_line_lfend = False
-            else:
-                delim = b""
-                last_line_lfend = False
-            self.__write(odelim + line)
+                if line:
+                    self.__write(line)
+                if boundary:
+                    if btype == cgi_fp.MultiType.last_boundary:
+                        self.done = 1
+                    break
+        else:
+            delim = b""
+            last_line_lfend = True
+            while 1:
+                if _read >= self.limit:
+                    break
+                line = self.fp.readline(1<<16) # bytes
+                self.bytes_read += len(line)
+                _read += len(line)
+                if not line:
+                    self.done = -1
+                    break
+                if delim == b"\r":
+                    line = delim + line
+                    delim = b""
+                if line.startswith(b"--") and last_line_lfend:
+                    strippedline = line.rstrip()
+                    if strippedline == next_boundary:
+                        break
+                    if strippedline == last_boundary:
+                        self.done = 1
+                        break
+                odelim = delim
+                if line.endswith(b"\r\n"):
+                    delim = b"\r\n"
+                    line = line[:-2]
+                    last_line_lfend = True
+                elif line.endswith(b"\n"):
+                    delim = b"\n"
+                    line = line[:-1]
+                    last_line_lfend = True
+                elif line.endswith(b"\r"):
+                    # We may interrupt \r\n sequences if they span the 2**16
+                    # byte boundary
+                    delim = b"\r"
+                    line = line[:-1]
+                    last_line_lfend = False
+                else:
+                    delim = b""
+                    last_line_lfend = False
+                self.__write(odelim + line)
 
     def skip_lines(self):
         """Internal: skip lines until outer boundary if defined."""
diff -r ec3077e23b7d Lib/test/test_cgi.py
--- a/Lib/test/test_cgi.py	Sun Oct 05 21:20:51 2014 -0400
+++ b/Lib/test/test_cgi.py	Tue Oct 14 17:06:00 2014 -0700
@@ -6,7 +6,7 @@
 import unittest
 import warnings
 from collections import namedtuple
-from io import StringIO, BytesIO
+from io import StringIO, BytesIO, BufferedReader
 
 class HackedSysModule:
     # The regression test will have real values in sys.argv, which
@@ -16,6 +16,10 @@
 
 cgi.sys = HackedSysModule()
 
+def fake_stdins(b, encoding='latin-1'):
+    yield BytesIO(b.encode(encoding))
+    yield BufferedReader(BytesIO(b.encode(encoding)))
+
 class ComparableException:
     def __init__(self, err):
         self.err = err
@@ -106,11 +110,8 @@
 def first_second_elts(list):
     return [(p[0], p[1][0]) for p in list]
 
-def gen_result(data, environ):
-    encoding = 'latin-1'
-    fake_stdin = BytesIO(data.encode(encoding))
-    fake_stdin.seek(0)
-    form = cgi.FieldStorage(fp=fake_stdin, environ=environ, encoding=encoding)
+def gen_result(fake_fp, environ, encoding='latin-1'):
+    form = cgi.FieldStorage(fp=fake_fp, environ=environ, encoding=encoding)
 
     result = {}
     for k, v in dict(form).items():
@@ -280,8 +281,9 @@
                 'CONTENT_TYPE': 'multipart/form-data; boundary=-123',
                 'REQUEST_METHOD': 'POST',
             }
-            self.assertEqual(gen_result(data, environ),
-                             {'upload': content.encode('latin1')})
+            for fake_stdin in fake_stdins(data):
+                self.assertEqual(gen_result(fake_stdin, environ),
+                                 {'upload': content.encode('latin1')})
         check('x' * (maxline - 1))
         check('x' * (maxline - 1) + '\r')
         check('x' * (maxline - 1) + '\r' + 'y' * (maxline - 1))
@@ -321,8 +323,9 @@
             'QUERY_STRING': 'key1=value1&key2=value2y',
             'REQUEST_METHOD': 'POST',
         }
-        v = gen_result(data, environ)
-        self.assertEqual(self._qs_result, v)
+        for fake_stdin in fake_stdins(data):
+            v = gen_result(fake_stdin, environ)
+            self.assertEqual(self._qs_result, v)
 
     def testQSAndFormData(self):
         data = """---123
@@ -345,8 +348,9 @@
             'QUERY_STRING': 'key1=value1&key2=value2x',
             'REQUEST_METHOD': 'POST',
         }
-        v = gen_result(data, environ)
-        self.assertEqual(self._qs_result, v)
+        for fake_stdin in fake_stdins(data):
+            v = gen_result(fake_stdin, environ)
+            self.assertEqual(self._qs_result, v)
 
     def testQSAndFormDataFile(self):
         data = """---123
@@ -379,8 +383,9 @@
         result.update({
             'upload': b'this is the content of the fake file\n'
         })
-        v = gen_result(data, environ)
-        self.assertEqual(result, v)
+        for fake_stdin in fake_stdins(data):
+            v = gen_result(fake_stdin, environ)
+            self.assertEqual(result, v)
 
     def test_deprecated_parse_qs(self):
         # this func is moved to urllib.parse, this is just a sanity check