diff -r 64a54f0c87d7 Lib/cgi.py --- a/Lib/cgi.py Sat Nov 01 22:48:24 2014 -0500 +++ b/Lib/cgi.py Tue Nov 04 05:49:42 2014 -0800 @@ -31,7 +31,7 @@ # Imports # ======= -from io import StringIO, BytesIO, TextIOWrapper +from io import StringIO, BytesIO, TextIOWrapper, BufferedReader from collections import Mapping import sys import os @@ -42,6 +42,7 @@ import html import locale import tempfile +import re __all__ = ["MiniFieldStorage", "FieldStorage", "parse", "parse_qs", "parse_qsl", "parse_multipart", @@ -359,6 +360,187 @@ return "MiniFieldStorage(%r, %r)" % (self.name, self.value) +class _CgiFp: + """ Support class for FieldStorage. This class should not be used directly. + fp -> instance of BufferedReader + boundary -> multipart boundary *including* the leading '--' delimiter + + All methods advance the file-pointer only if they have determined that + the current payload in the buffer (of BufferedReader) does not run over + the next boundary in a multipart. + There are cases, where multi_read seeks the file pointer beyond the + data it has just returned. It does this knowing that the seek never + crossesthe next boundary and it mantains the difference in an + overlapped buffer. This overlapped buffer is part of the next payload. + This is done to resolve cases where a boundary may be between two + buffers. + """ + + def __init__(self, fp, boundary): + self._fp = fp + self._overlapped = b'' + self._boundary = boundary + # self._eof = Indicates a read has returned an EOF + self._eof = False + # self._buffer = self._overlapped + self._peek + # self._peek = peeked data + # create self._bufer and self._peek using _remove_x_keep_buffer + self._remove_x_keep_buffer(0) + + self._max_search = len(b'\r\n' + boundary + b'--\r\n') + self._initial = True + + # RFC 2616 + if len(boundary) > 72: + raise ValueError("multipart boundary too large") + + self._initial = True + + #Patter for all buffers + pattern_noeof = b'\n' + re.escape(boundary) + b'(?:--)?\r?\n' + self._re_noeof = re.compile(pattern_noeof) + + #First Buffer + #We have a special pattern for the first buffer. The boundary at the + #start of the payload will begin without the CRLF. + #We should use this pattern only for the first buffer, as the + #subsequent buffer's "start of buffer" could be part of the payload, + #because we dont test for the leading CRLF/LF, when breaking a buffer. + pattern_first_noeof = pattern_noeof \ + + b'|^' + re.escape(boundary) + b'(?:--)?\r?\n' + self._re_first_noeof = re.compile(pattern_first_noeof, re.M) + + #Last Buffer + #Similary last buffer can end without a CRLF and is valid only for the + #last buffer. Also take care of the corner case where the payload is + #just the boundary without a trailing CRLF/LF + pattern_eof = pattern_noeof \ + + b'|\n' + re.escape(boundary) + b'(?:--)?$' \ + + b'|^' + re.escape(boundary) + b'(?:--)?$' + self._re_eof = re.compile(pattern_eof, re.M) + + + def _keep_x_forward_buffer(self, x): + """ Keep the last x bytes from _buffer and drain the read data. + This fuction is used when we have determined that self._buffer + does not have the next_boundary. + """ + lenbuffer = len(self._buffer) + if x > lenbuffer: + x = lenbuffer + start = lenbuffer - x + self._overlapped = self._buffer[start:lenbuffer] + if not self._eof: + drain = self._fp.read(len(self._peek)) + if not drain: + self._eof = True + self._peek = b'' + else: + self._peek = self._fp.peek() + self._buffer = self._overlapped + self._peek + + def _remove_x_keep_buffer(self, x): + """ Remove the fist x bytes of the buffer. + x >= len_of_overlapped + This is also called by the constructor + """ + len_of_overlapped = len(self._overlapped) + self._overlapped = b'' + # Note that x >= len_of_overlapped + # overlapped is only created when boundary is not found + # remove_x is called when boundary is found and x is at + # the end of the boundary. Hence x > len_of_overlapped. + # x = len_of_overlapped when x = 0 + data_to_drain = x - len_of_overlapped + if not self._eof and data_to_drain: + drain = self._fp.read(data_to_drain) + self._peek = self._fp.peek() + self._buffer = self._overlapped + self._peek + + def _check_boundary_head(self, hindex): + ''' Boundary has been located. If boundary is of pattern,\nboundary, + claim the leading \r if associated with \n as part of the boundary. + hindex = index of start of boundary in buffer. This could be either + boundary when found just after initialization or could be + \nbuffer. + returns index of the boundary including the leading CR. + ''' + if hindex >= 1 and self._buffer[hindex-1 : hindex+1] == b'\r\n': + return hindex - 1 + return hindex + + def _check_boundary_trail(self, tindex): + ''' tindex = last element of the boundary. + + returns True if the boundary is a last boundary, false otherwise. + + A boundary could end with CRLF or LF or in case of EOF just the boundary. + Test if the boundary ends with --[CRLF|LF] to indicate last boundary. + ''' + if self._buffer[tindex] == b'\n'[0]: + if tindex > 0: + if self._buffer[tindex - 1] == b'\r'[0]: + if (tindex > 2 and + self._buffer[tindex - 3:tindex - 1] == b'--'): + #Ends as Boundary--CRLF + return True + elif tindex > 1 and self._buffer[tindex - 2:tindex] == b'--': + #Ends as Boundary--LF + return True + elif (tindex > 0 and self._buffer[tindex-1:tindex + 1] == b'--'): + #Ends as Boundary-- + return True + #Ends as Boundary + return False + + def multi_read(self): + ''' returns chunk, (boundary, islastboundary) + boundary or chunk can be empty. When both are empty EOF + is reached. + ''' + + rsearch = None + if self._initial: + self._initial = False + rsearch = self._re_first_noeof.search(self._buffer) + else: + if self._eof: + rsearch = self._re_eof.search(self._buffer) + else: + rsearch = self._re_noeof.search(self._buffer) + + if rsearch: + # boundary is found + start_index = self._check_boundary_head(rsearch.start()) + islastboundary = self._check_boundary_trail(rsearch.end()-1) + rdata = ( \ + self._buffer[:start_index],\ + (self._buffer[start_index:rsearch.end()], islastboundary)) + self._remove_x_keep_buffer(rsearch.end()) + return rdata + else: + if self._eof: + rdata = self._buffer, (b'', None) + #clear the buffer + self._keep_x_forward_buffer(0) + return rdata + else: + chunk = self._buffer[:-self._max_search] + self._keep_x_forward_buffer(self._max_search) + if chunk: + return chunk, (b'', None) + else: + # we have to recurse. + # we recurse because theoretically peek() does not make + # a promise on the length of the buffer except it cannot + # be zero. This recursion will happen in theory if + # after we read the entire data using read,a peek returns + # a buffer of length less than max_search_len. + # The maximum depth of recursion is max_search_len (when + # peek keeps returning us a buffer of length 1) + # max_search_len < 78 + return self.multi_read() + class FieldStorage: """Store a sequence of fields, reading multipart/form-data. @@ -789,47 +971,67 @@ """ next_boundary = b"--" + self.outerboundary last_boundary = next_boundary + b"--" - delim = b"" - last_line_lfend = True _read = 0 - while 1: - if _read >= self.limit: - break - line = self.fp.readline(1<<16) # bytes - self.bytes_read += len(line) - _read += len(line) - if not line: - self.done = -1 - break - if delim == b"\r": - line = delim + line - delim = b"" - if line.startswith(b"--") and last_line_lfend: - strippedline = line.rstrip() - if strippedline == next_boundary: + if isinstance(self.fp, BufferedReader): + cgi_fp = _CgiFp(self.fp, next_boundary) + while 1: + if _read >= self.limit: break - if strippedline == last_boundary: - self.done = 1 + line, (boundary, islastboundary) = cgi_fp.multi_read() + self.bytes_read += len(line) + self.bytes_read += len(boundary) + _read += len(line) + _read += len(boundary) + if not line and not boundary: + self._done = -1 break - odelim = delim - if line.endswith(b"\r\n"): - delim = b"\r\n" - line = line[:-2] - last_line_lfend = True - elif line.endswith(b"\n"): - delim = b"\n" - line = line[:-1] - last_line_lfend = True - elif line.endswith(b"\r"): - # We may interrupt \r\n sequences if they span the 2**16 - # byte boundary - delim = b"\r" - line = line[:-1] - last_line_lfend = False - else: - delim = b"" - last_line_lfend = False - self.__write(odelim + line) + if line: + self.__write(line) + if boundary: + if islastboundary: + self.done = 1 + break + else: + delim = b"" + last_line_lfend = True + while 1: + if _read >= self.limit: + break + line = self.fp.readline(1<<16) # bytes + self.bytes_read += len(line) + _read += len(line) + if not line: + self.done = -1 + break + if delim == b"\r": + line = delim + line + delim = b"" + if line.startswith(b"--") and last_line_lfend: + strippedline = line.rstrip() + if strippedline == next_boundary: + break + if strippedline == last_boundary: + self.done = 1 + break + odelim = delim + if line.endswith(b"\r\n"): + delim = b"\r\n" + line = line[:-2] + last_line_lfend = True + elif line.endswith(b"\n"): + delim = b"\n" + line = line[:-1] + last_line_lfend = True + elif line.endswith(b"\r"): + # We may interrupt \r\n sequences if they span the 2**16 + # byte boundary + delim = b"\r" + line = line[:-1] + last_line_lfend = False + else: + delim = b"" + last_line_lfend = False + self.__write(odelim + line) def skip_lines(self): """Internal: skip lines until outer boundary if defined.""" diff -r 64a54f0c87d7 Lib/test/test_cgi.py --- a/Lib/test/test_cgi.py Sat Nov 01 22:48:24 2014 -0500 +++ b/Lib/test/test_cgi.py Tue Nov 04 05:49:42 2014 -0800 @@ -3,10 +3,11 @@ import os import sys import tempfile +import random import unittest import warnings from collections import namedtuple -from io import StringIO, BytesIO +from io import StringIO, BytesIO, BufferedReader class HackedSysModule: # The regression test will have real values in sys.argv, which @@ -16,6 +17,10 @@ cgi.sys = HackedSysModule() +def fake_stdins(b, encoding='latin-1'): + yield BytesIO(b.encode(encoding)) + yield BufferedReader(BytesIO(b.encode(encoding))) + class ComparableException: def __init__(self, err): self.err = err @@ -106,11 +111,8 @@ def first_second_elts(list): return [(p[0], p[1][0]) for p in list] -def gen_result(data, environ): - encoding = 'latin-1' - fake_stdin = BytesIO(data.encode(encoding)) - fake_stdin.seek(0) - form = cgi.FieldStorage(fp=fake_stdin, environ=environ, encoding=encoding) +def gen_result(fake_fp, environ, encoding='latin-1'): + form = cgi.FieldStorage(fp=fake_fp, environ=environ, encoding=encoding) result = {} for k, v in dict(form).items(): @@ -118,8 +120,107 @@ return result +def make_str_with_lfs(total_size, plf): + # generates a random string + # '\n' is filled with a probability of plf + nonlf_bytes = [i for i in range(256) if i != b'\n'[0]] + ba = bytearray(b'\x00' * total_size) + random_size = int(255 / ( 1 - plf)) - 1 + num_lf = 0 + for i in range(total_size): + r = random.randint(0, random_size) + if (r <= 254): + ba[i] = nonlf_bytes[r] + else: + ba[i] = ord('\n') + num_lf += 1 + return ba, num_lf + +def write_file_multi(temp_file_name, prologue, epilogue, input_array): + with open(temp_file_name, 'wb') as outfile: + outfile.write(prologue) + outfile.write(input_array) + outfile.write(epilogue) + +def run_bad_outerboundary_test(testcase, in_byte): + prologue = b'''------WebKitFormBoundaryeHCgrOGACrcYuuB5 +Content-Disposition: form-data; name="textline" + +form +------WebKitFormBoundaryeHCgrOGACrcYuuB5 +Content-Disposition: form-data; name="datafile"; filename="somefile.bin" +Content-Type: application/octet-stream + +''' + epilogue = b'\r\n' + + write_file_multi(testcase.tempfile, prologue, epilogue, in_byte) + size = os.stat(testcase.tempfile).st_size + environ = {} + environ["CONTENT_TYPE"] = """\ +multipart/form-data; boundary=----WebKitFormBoundaryeHCgrOGACrcYuuB5""" + environ["REQUEST_METHOD"] = "POST" + environ["CONTENT_LENGTH"] = str(size) + with open(testcase.tempfile,'rb') as fp: + fs = cgi.FieldStorage(fp, None, environ=environ) + testcase.assertTrue(len(fs['datafile'].value) > len(in_byte)) + +def run_multi_test(testcase, in_byte, buffer_size, + outer_boundary_head, outer_boundary_tail, + outer_boundary_delimiter): + + prologue = b'''------WebKitFormBoundaryeHCgrOGACrcYuuB5 +Content-Disposition: form-data; name="textline" + +form +------WebKitFormBoundaryeHCgrOGACrcYuuB5 +Content-Disposition: form-data; name="datafile"; filename="somefile.bin" +Content-Type: application/octet-stream + +''' + epilogue = outer_boundary_head + \ + b'------WebKitFormBoundaryeHCgrOGACrcYuuB5' + \ + outer_boundary_delimiter + outer_boundary_tail + + write_file_multi(testcase.tempfile, prologue, epilogue, in_byte) + size = os.stat(testcase.tempfile).st_size + environ = {} + environ["CONTENT_TYPE"] = """\ +multipart/form-data; boundary=----WebKitFormBoundaryeHCgrOGACrcYuuB5""" + environ["REQUEST_METHOD"] = "POST" + environ["CONTENT_LENGTH"] = str(size) + testcase.assertEqual(size, len(epilogue) + len(prologue) + len(in_byte)) + with open(testcase.tempfile,'rb', buffer_size) as fp: + fs = cgi.FieldStorage(fp, None, environ=environ) + testcase.assertEqual(fs['datafile'].value, in_byte) + +def multi_test(testcase, in_byte, buffer_size, + outer_boundary_head=b'\r\n', outer_boundary_tail=b'\r\n', + outer_boundary_delimiter=b'--'): + try: + run_multi_test(testcase, in_byte, buffer_size, outer_boundary_head, + outer_boundary_tail, outer_boundary_delimiter) + except: + print(repr(in_byte)) + raise + +def bad_outerboundary_test(testcase, in_byte): + try: + run_bad_outerboundary_test(testcase, in_byte) + except: + print(repr(in_byte)) + raise + class CgiTests(unittest.TestCase): + def setUp(self): + fd, self.tempfile = tempfile.mkstemp() + os.close(fd) + self.addCleanup(os.unlink, self.tempfile) + + def tearDown(self): + pass + def test_parse_multipart(self): fp = BytesIO(POSTDATA.encode('latin1')) env = {'boundary': BOUNDARY.encode('latin1'), @@ -280,8 +381,9 @@ 'CONTENT_TYPE': 'multipart/form-data; boundary=-123', 'REQUEST_METHOD': 'POST', } - self.assertEqual(gen_result(data, environ), - {'upload': content.encode('latin1')}) + for fake_stdin in fake_stdins(data): + self.assertEqual(gen_result(fake_stdin, environ), + {'upload': content.encode('latin1')}) check('x' * (maxline - 1)) check('x' * (maxline - 1) + '\r') check('x' * (maxline - 1) + '\r' + 'y' * (maxline - 1)) @@ -321,8 +423,9 @@ 'QUERY_STRING': 'key1=value1&key2=value2y', 'REQUEST_METHOD': 'POST', } - v = gen_result(data, environ) - self.assertEqual(self._qs_result, v) + for fake_stdin in fake_stdins(data): + v = gen_result(fake_stdin, environ) + self.assertEqual(self._qs_result, v) def testQSAndFormData(self): data = """---123 @@ -345,8 +448,9 @@ 'QUERY_STRING': 'key1=value1&key2=value2x', 'REQUEST_METHOD': 'POST', } - v = gen_result(data, environ) - self.assertEqual(self._qs_result, v) + for fake_stdin in fake_stdins(data): + v = gen_result(fake_stdin, environ) + self.assertEqual(self._qs_result, v) def testQSAndFormDataFile(self): data = """---123 @@ -379,8 +483,9 @@ result.update({ 'upload': b'this is the content of the fake file\n' }) - v = gen_result(data, environ) - self.assertEqual(result, v) + for fake_stdin in fake_stdins(data): + v = gen_result(fake_stdin, environ) + self.assertEqual(result, v) def test_deprecated_parse_qs(self): # this func is moved to urllib.parse, this is just a sanity check @@ -425,6 +530,56 @@ cgi.parse_header('form-data; name="files"; filename="fo\\"o;bar"'), ("form-data", {"name": "files", "filename": 'fo"o;bar'})) + def test_end_boundary_overlaps_buffer(self): + buffer_size = 1 << 9 + in_byte_len = buffer_size - 3 + in_byte, _ = make_str_with_lfs(in_byte_len, 0.3) + multi_test(self, in_byte, buffer_size) + + def test_payload_less_than_buffer(self): + buffer_size = 1 << 10 + in_byte_len = 1 << 9 + in_byte, _ = make_str_with_lfs(in_byte_len, 0.4) + multi_test(self, in_byte, buffer_size) + + def test_payload_less_than_buffer_lf(self): + buffer_size = 1 << 10 + in_byte_len = 1 << 9 + in_byte, _ = make_str_with_lfs(in_byte_len, 0.3) + multi_test(self, in_byte, buffer_size, + outer_boundary_head=b'\n', outer_boundary_tail=b'\n') + + def test_payload_bigger_than_buffer(self): + buffer_size = 1 << 7 + in_byte_len = 1 << 9 + in_byte, _ = make_str_with_lfs(in_byte_len, 0.3) + multi_test(self, in_byte, buffer_size) + + def test_no_crlf_after_boundary(self): + buffer_size = 1 << 5 + in_byte_len = 1 << 9 + in_byte, _ = make_str_with_lfs(in_byte_len, 0.3) + multi_test(self, in_byte, buffer_size, + outer_boundary_head=b'\n', outer_boundary_tail=b'', + outer_boundary_delimiter=b'') + + def test_no_crlf_after_outerboundary(self): + buffer_size = 1 << 5 + in_byte_len = 1 << 9 + in_byte, _ = make_str_with_lfs(in_byte_len, 0.3) + multi_test(self, in_byte, buffer_size, + outer_boundary_head=b'\n', outer_boundary_tail=b'') + + def test_very_small_buffer(self): + buffer_size = 2 + in_byte_len = 1 << 5 + in_byte, _ = make_str_with_lfs(in_byte_len, 0.3) + multi_test(self, in_byte, buffer_size) + + def test_badlyformed_multipart(self): + in_byte_len = 1 << 9 + in_byte, _ = make_str_with_lfs(in_byte_len, 0.3) + bad_outerboundary_test(self, in_byte) BOUNDARY = "---------------------------721837373350705526688164684"