diff -r 64a54f0c87d7 Lib/cgi.py
--- a/Lib/cgi.py	Sat Nov 01 22:48:24 2014 -0500
+++ b/Lib/cgi.py	Sat Nov 08 06:43:10 2014 -0800
@@ -31,7 +31,7 @@
 # Imports
 # =======
 
-from io import StringIO, BytesIO, TextIOWrapper
+from io import StringIO, BytesIO, TextIOWrapper, BufferedReader
 from collections import Mapping
 import sys
 import os
@@ -42,6 +42,7 @@
 import html
 import locale
 import tempfile
+import re
 
 __all__ = ["MiniFieldStorage", "FieldStorage",
            "parse", "parse_qs", "parse_qsl", "parse_multipart",
@@ -359,6 +360,164 @@
         return "MiniFieldStorage(%r, %r)" % (self.name, self.value)
 
 
+class _CgiFp:
+    """ Support class for FieldStorage. This class should not be used directly.
+    fp -> instance of BufferedReader
+    boundary -> multipart boundary *including* the leading '--' delimiter
+
+    All methods advance the file-pointer only if they have determined that
+    the current payload in the buffer (of BufferedReader) does not run over
+    the next boundary in a multipart.
+    There are cases, where multi_read seeks the file pointer beyond the
+    data it has just returned. It does this knowing that the seek never
+    crosses the next boundary and it maintains the difference in an
+    overlapped buffer. This overlapped buffer is part of the next payload.
+    """
+
+    def __init__(self, fp, boundary):
+        self._fp = fp
+        self._overlapped = b''
+        self._initial = True
+
+        #Pattern for all buffers
+        pattern_noeof = b'\n' + re.escape(boundary) + b'(?:--)?\r?\n'
+        self._re_noeof = re.compile(pattern_noeof)
+
+        #First Buffer
+        #We have a special pattern for the first buffer. The boundary at the
+        #start of the payload will begin without the CRLF.
+        #We should use this pattern only for the first buffer, as the
+        #subsequent buffer's "start of buffer" could be part of the payload,
+        #because we don't test for the leading CRLF/LF, when breaking a buffer.
+        pattern_first_noeof = pattern_noeof \
+                + b'|^' + re.escape(boundary) + b'(?:--)?\r?\n'
+        self._re_first_noeof = re.compile(pattern_first_noeof, re.M)
+
+        #Last Buffer
+        #Similarly last buffer can end without a CRLF and is valid only for the
+        #last buffer. Also take care of the corner case where the payload is
+        #just the boundary without a trailing CRLF/LF
+        pattern_eof = pattern_noeof \
+                + b'|\n' + re.escape(boundary) + b'(?:--)?$' \
+                + b'|^' + re.escape(boundary) + b'(?:--)?$'
+        self._re_eof = re.compile(pattern_eof, re.M)
+
+    def _check_boundary_head(self, buffer, hindex):
+        r''' Boundary has been located. If boundary is of pattern \nboundary,
+        claim the leading \r if associated with \n as part of the boundary.
+        hindex = index of start of boundary in buffer. This could be either
+                 boundary when found just after initialization or could be
+                 \nboundary.
+        returns index of the boundary including the leading CR.
+        '''
+        if hindex >= 1 and buffer[hindex-1 : hindex+1] == b'\r\n':
+            return hindex - 1
+        return hindex
+
+    def _check_boundary_trail(self, buffer, tindex):
+        ''' tindex = last element of the boundary.
+
+        returns True if the boundary is a last boundary, false otherwise.
+
+        A boundary could end with CRLF or LF or in case of EOF just the boundary.
+        Test if the boundary ends with --[CRLF|LF] to indicate last boundary.
+        '''
+        if buffer[tindex] == b'\n'[0]:
+            if tindex > 0:
+                if buffer[tindex - 1] == b'\r'[0]:
+                    if (tindex > 2 and
+                            buffer[tindex - 3:tindex - 1] == b'--'):
+                        #Ends as Boundary--CRLF
+                        return True
+                elif tindex > 1 and buffer[tindex - 2:tindex] == b'--':
+                    #Ends as Boundary--LF
+                    return True
+        elif (tindex > 0 and buffer[tindex-1:tindex + 1] == b'--'):
+            #Ends as Boundary--
+            return True
+        #Ends as Boundary
+        return False
+
+    def multi_read(self):
+        ''' returns chunk, (boundary, islastboundary)
+            boundary or chunk can be empty. When both are empty EOF
+            is reached.
+        '''
+
+        eof = False
+        #Pass 1 Search for boundary in just the buffer
+        peek_buffer = self._fp.peek()
+        if not peek_buffer:
+            eof = True
+        buffer = self._overlapped + peek_buffer
+        rsearch = None
+        if self._initial:
+            self._initial = False
+            rsearch = self._re_first_noeof.search(buffer)
+        else:
+            if eof:
+                rsearch = self._re_eof.search(buffer)
+            else:
+                rsearch = self._re_noeof.search(buffer)
+
+        if rsearch:
+            # boundary is found
+            start_index = self._check_boundary_head(buffer, rsearch.start())
+            islastboundary = self._check_boundary_trail(buffer, rsearch.end()-1)
+            rdata = ( \
+                buffer[:start_index],\
+                (buffer[start_index:rsearch.end()], islastboundary))
+            # Drain
+            self._fp.read(rsearch.end() - len(self._overlapped))
+            self._overlapped = b''
+            return rdata
+        else:
+            if eof:
+                rdata = buffer, (b'', None)
+                #clear the overlapped
+                self._overlapped = b''
+                return rdata
+
+        # Boundary not found in the peek buffer.
+        # Pass 2 Drain the peek buffer and read up to the next line
+        self._fp.read(len(peek_buffer))
+        next_line = self._fp.readline(1<<16)
+        buffer += next_line
+        if not next_line or \
+            (len(next_line) < (1<<16) and not next_line.endswith(b"\n")):
+            eof = True
+        if eof:
+            rsearch = self._re_eof.search(buffer)
+        else:
+            rsearch = self._re_noeof.search(buffer)
+
+        if rsearch:
+            # boundary is found
+            start_index = self._check_boundary_head(buffer, rsearch.start())
+            islastboundary = self._check_boundary_trail(buffer, rsearch.end()-1)
+            rdata = ( \
+                buffer[:start_index],\
+                (buffer[start_index:rsearch.end()], islastboundary))
+            self._overlapped = b''
+            return rdata
+        else:
+            if eof:
+                rdata = buffer, (b'', None)
+                #clear the overlapped
+                self._overlapped = b''
+                return rdata
+            else:
+                if buffer.endswith(b"\r\n"):
+                    self._overlapped = b"\r\n"
+                    return buffer[:-2], (b'', None)
+                if buffer.endswith(b"\n"):
+                    self._overlapped = b"\n"
+                    return buffer[:-1], (b'', None)
+                if buffer.endswith(b"\r"):
+                    self._overlapped = b"\r"
+                    return buffer[:-1], (b'', None)
+                return buffer, (b'',None)
+
 class FieldStorage:
 
     """Store a sequence of fields, reading multipart/form-data.
@@ -789,47 +948,67 @@
         """
         next_boundary = b"--" + self.outerboundary
         last_boundary = next_boundary + b"--"
-        delim = b""
-        last_line_lfend = True
         _read = 0
-        while 1:
-            if _read >= self.limit:
-                break
-            line = self.fp.readline(1<<16) # bytes
-            self.bytes_read += len(line)
-            _read += len(line)
-            if not line:
-                self.done = -1
-                break
-            if delim == b"\r":
-                line = delim + line
-                delim = b""
-            if line.startswith(b"--") and last_line_lfend:
-                strippedline = line.rstrip()
-                if strippedline == next_boundary:
+        if isinstance(self.fp, BufferedReader):
+            cgi_fp = _CgiFp(self.fp, next_boundary)
+            while 1:
+                if _read >= self.limit:
                     break
-                if strippedline == last_boundary:
-                    self.done = 1
+                line, (boundary, islastboundary) = cgi_fp.multi_read()
+                self.bytes_read += len(line)
+                self.bytes_read += len(boundary)
+                _read += len(line)
+                _read += len(boundary)
+                if not line and not boundary:
+                    self.done = -1
                     break
-            odelim = delim
-            if line.endswith(b"\r\n"):
-                delim = b"\r\n"
-                line = line[:-2]
-                last_line_lfend = True
-            elif line.endswith(b"\n"):
-                delim = b"\n"
-                line = line[:-1]
-                last_line_lfend = True
-            elif line.endswith(b"\r"):
-                # We may interrupt \r\n sequences if they span the 2**16
-                # byte boundary
-                delim = b"\r"
-                line = line[:-1]
-                last_line_lfend = False
-            else:
-                delim = b""
-                last_line_lfend = False
-            self.__write(odelim + line)
+                if line:
+                    self.__write(line)
+                if boundary:
+                    if islastboundary:
+                        self.done = 1
+                    break
+        else:
+            delim = b""
+            last_line_lfend = True
+            while 1:
+                if _read >= self.limit:
+                    break
+                line = self.fp.readline(1<<16) # bytes
+                self.bytes_read += len(line)
+                _read += len(line)
+                if not line:
+                    self.done = -1
+                    break
+                if delim == b"\r":
+                    line = delim + line
+                    delim = b""
+                if line.startswith(b"--") and last_line_lfend:
+                    strippedline = line.rstrip()
+                    if strippedline == next_boundary:
+                        break
+                    if strippedline == last_boundary:
+                        self.done = 1
+                        break
+                odelim = delim
+                if line.endswith(b"\r\n"):
+                    delim = b"\r\n"
+                    line = line[:-2]
+                    last_line_lfend = True
+                elif line.endswith(b"\n"):
+                    delim = b"\n"
+                    line = line[:-1]
+                    last_line_lfend = True
+                elif line.endswith(b"\r"):
+                    # We may interrupt \r\n sequences if they span the 2**16
+                    # byte boundary
+                    delim = b"\r"
+                    line = line[:-1]
+                    last_line_lfend = False
+                else:
+                    delim = b""
+                    last_line_lfend = False
+                self.__write(odelim + line)
 
     def skip_lines(self):
         """Internal: skip lines until outer boundary if defined."""
diff -r 64a54f0c87d7 Lib/test/test_cgi.py
--- a/Lib/test/test_cgi.py	Sat Nov 01 22:48:24 2014 -0500
+++ b/Lib/test/test_cgi.py	Sat Nov 08 06:43:10 2014 -0800
@@ -3,10 +3,11 @@
 import os
 import sys
 import tempfile
+import random
 import unittest
 import warnings
 from collections import namedtuple
-from io import StringIO, BytesIO
+from io import StringIO, BytesIO, BufferedReader
 
 class HackedSysModule:
     # The regression test will have real values in sys.argv, which
@@ -16,6 +17,10 @@
 
 cgi.sys = HackedSysModule()
 
+def fake_stdins(b, encoding='latin-1'):
+    yield BytesIO(b.encode(encoding))
+    yield BufferedReader(BytesIO(b.encode(encoding)))
+
 class ComparableException:
     def __init__(self, err):
         self.err = err
@@ -106,11 +111,8 @@
 def first_second_elts(list):
     return [(p[0], p[1][0]) for p in list]
 
-def gen_result(data, environ):
-    encoding = 'latin-1'
-    fake_stdin = BytesIO(data.encode(encoding))
-    fake_stdin.seek(0)
-    form = cgi.FieldStorage(fp=fake_stdin, environ=environ, encoding=encoding)
+def gen_result(fake_fp, environ, encoding='latin-1'):
+    form = cgi.FieldStorage(fp=fake_fp, environ=environ, encoding=encoding)
 
     result = {}
     for k, v in dict(form).items():
@@ -118,8 +120,107 @@
 
     return result
 
+def make_str_with_lfs(total_size, plf):
+    # generates a random string
+    # '\n' is filled with a probability of plf
+    nonlf_bytes = [i for i in range(256) if i != b'\n'[0]]
+    ba = bytearray(b'\x00' * total_size)
+    random_size = int(255 / ( 1 - plf)) - 1
+    num_lf = 0
+    for i in range(total_size):
+        r = random.randint(0, random_size)
+        if (r <= 254):
+            ba[i] = nonlf_bytes[r]
+        else:
+            ba[i] = ord('\n')
+            num_lf += 1
+    return ba, num_lf
+
+def write_file_multi(temp_file_name, prologue, epilogue, input_array):
+    with open(temp_file_name, 'wb') as outfile:
+        outfile.write(prologue)
+        outfile.write(input_array)
+        outfile.write(epilogue)
+
+def run_bad_outerboundary_test(testcase, in_byte):
+    prologue = b'''------WebKitFormBoundaryeHCgrOGACrcYuuB5
+Content-Disposition: form-data; name="textline"
+
+form
+------WebKitFormBoundaryeHCgrOGACrcYuuB5
+Content-Disposition: form-data; name="datafile"; filename="somefile.bin"
+Content-Type: application/octet-stream
+
+'''
+    epilogue = b'\r\n'
+
+    write_file_multi(testcase.tempfile, prologue, epilogue, in_byte)
+    size = os.stat(testcase.tempfile).st_size
+    environ = {}
+    environ["CONTENT_TYPE"] = """\
+multipart/form-data; boundary=----WebKitFormBoundaryeHCgrOGACrcYuuB5"""
+    environ["REQUEST_METHOD"] = "POST"
+    environ["CONTENT_LENGTH"] = str(size)
+    with open(testcase.tempfile,'rb') as fp:
+        fs = cgi.FieldStorage(fp, None, environ=environ)
+        testcase.assertTrue(len(fs['datafile'].value) > len(in_byte))
+
+def run_multi_test(testcase, in_byte, buffer_size,
+                   outer_boundary_head, outer_boundary_tail,
+                   outer_boundary_delimiter):
+
+    prologue = b'''------WebKitFormBoundaryeHCgrOGACrcYuuB5
+Content-Disposition: form-data; name="textline"
+
+form
+------WebKitFormBoundaryeHCgrOGACrcYuuB5
+Content-Disposition: form-data; name="datafile"; filename="somefile.bin"
+Content-Type: application/octet-stream
+
+'''
+    epilogue = outer_boundary_head + \
+               b'------WebKitFormBoundaryeHCgrOGACrcYuuB5' + \
+               outer_boundary_delimiter + outer_boundary_tail
+
+    write_file_multi(testcase.tempfile, prologue, epilogue, in_byte)
+    size = os.stat(testcase.tempfile).st_size
+    environ = {}
+    environ["CONTENT_TYPE"] = """\
+multipart/form-data; boundary=----WebKitFormBoundaryeHCgrOGACrcYuuB5"""
+    environ["REQUEST_METHOD"] = "POST"
+    environ["CONTENT_LENGTH"] = str(size)
+    testcase.assertEqual(size, len(epilogue) + len(prologue) + len(in_byte))
+    with open(testcase.tempfile,'rb', buffer_size) as fp:
+        fs = cgi.FieldStorage(fp, None, environ=environ)
+        testcase.assertEqual(fs['datafile'].value, in_byte)
+
+def multi_test(testcase, in_byte, buffer_size,
+               outer_boundary_head=b'\r\n', outer_boundary_tail=b'\r\n',
+               outer_boundary_delimiter=b'--'):
+    try:
+        run_multi_test(testcase, in_byte, buffer_size, outer_boundary_head,
+                       outer_boundary_tail, outer_boundary_delimiter)
+    except:
+        print(repr(in_byte))
+        raise
+
+def bad_outerboundary_test(testcase, in_byte):
+    try:
+        run_bad_outerboundary_test(testcase, in_byte)
+    except:
+        print(repr(in_byte))
+        raise
+
 class CgiTests(unittest.TestCase):
 
+    def setUp(self):
+        fd, self.tempfile = tempfile.mkstemp()
+        os.close(fd)
+        self.addCleanup(os.unlink, self.tempfile)
+
+    def tearDown(self):
+        pass
+
     def test_parse_multipart(self):
         fp = BytesIO(POSTDATA.encode('latin1'))
         env = {'boundary': BOUNDARY.encode('latin1'),
@@ -280,8 +381,9 @@
                 'CONTENT_TYPE':     'multipart/form-data; boundary=-123',
                 'REQUEST_METHOD':   'POST',
             }
-            self.assertEqual(gen_result(data, environ),
-                             {'upload': content.encode('latin1')})
+            for fake_stdin in fake_stdins(data):
+                self.assertEqual(gen_result(fake_stdin, environ),
+                                 {'upload': content.encode('latin1')})
         check('x' * (maxline - 1))
         check('x' * (maxline - 1) + '\r')
         check('x' * (maxline - 1) + '\r' + 'y' * (maxline - 1))
@@ -321,8 +423,9 @@
             'QUERY_STRING':     'key1=value1&key2=value2y',
             'REQUEST_METHOD':   'POST',
         }
-        v = gen_result(data, environ)
-        self.assertEqual(self._qs_result, v)
+        for fake_stdin in fake_stdins(data):
+            v = gen_result(fake_stdin, environ)
+            self.assertEqual(self._qs_result, v)
 
     def testQSAndFormData(self):
         data = """---123
@@ -345,8 +448,9 @@
             'QUERY_STRING':     'key1=value1&key2=value2x',
             'REQUEST_METHOD':   'POST',
         }
-        v = gen_result(data, environ)
-        self.assertEqual(self._qs_result, v)
+        for fake_stdin in fake_stdins(data):
+            v = gen_result(fake_stdin, environ)
+            self.assertEqual(self._qs_result, v)
 
     def testQSAndFormDataFile(self):
         data = """---123
@@ -379,8 +483,9 @@
         result.update({
             'upload': b'this is the content of the fake file\n'
         })
-        v = gen_result(data, environ)
-        self.assertEqual(result, v)
+        for fake_stdin in fake_stdins(data):
+            v = gen_result(fake_stdin, environ)
+            self.assertEqual(result, v)
 
     def test_deprecated_parse_qs(self):
         # this func is moved to urllib.parse, this is just a sanity check
@@ -425,6 +530,56 @@
             cgi.parse_header('form-data; name="files"; filename="fo\\"o;bar"'),
             ("form-data", {"name": "files", "filename": 'fo"o;bar'}))
 
+    def test_end_boundary_overlaps_buffer(self):
+        buffer_size = 1 << 9
+        in_byte_len = buffer_size - 3
+        in_byte, _ = make_str_with_lfs(in_byte_len, 0.3)
+        multi_test(self, in_byte, buffer_size)
+
+    def test_payload_less_than_buffer(self):
+        buffer_size = 1 << 10
+        in_byte_len = 1 << 9
+        in_byte, _ = make_str_with_lfs(in_byte_len, 0.4)
+        multi_test(self, in_byte, buffer_size)
+
+    def test_payload_less_than_buffer_lf(self):
+        buffer_size = 1 << 10
+        in_byte_len = 1 << 9
+        in_byte, _ = make_str_with_lfs(in_byte_len, 0.3)
+        multi_test(self, in_byte, buffer_size,
+                   outer_boundary_head=b'\n', outer_boundary_tail=b'\n')
+
+    def test_payload_bigger_than_buffer(self):
+        buffer_size = 1 << 7
+        in_byte_len = 1 << 9
+        in_byte, _ = make_str_with_lfs(in_byte_len, 0.3)
+        multi_test(self, in_byte, buffer_size)
+
+    def test_no_crlf_after_boundary(self):
+        buffer_size = 1 << 5
+        in_byte_len = 1 << 9
+        in_byte, _ = make_str_with_lfs(in_byte_len, 0.3)
+        multi_test(self, in_byte, buffer_size,
+                   outer_boundary_head=b'\n', outer_boundary_tail=b'',
+                   outer_boundary_delimiter=b'')
+
+    def test_no_crlf_after_outerboundary(self):
+        buffer_size = 1 << 5
+        in_byte_len = 1 << 9
+        in_byte, _ = make_str_with_lfs(in_byte_len, 0.3)
+        multi_test(self, in_byte, buffer_size,
+                   outer_boundary_head=b'\n', outer_boundary_tail=b'')
+
+    def test_very_small_buffer(self):
+        buffer_size = 2
+        in_byte_len = 1 << 5
+        in_byte, _ = make_str_with_lfs(in_byte_len, 0.3)
+        multi_test(self, in_byte, buffer_size)
+
+    def test_badlyformed_multipart(self):
+        in_byte_len = 1 << 9
+        in_byte, _ = make_str_with_lfs(in_byte_len, 0.3)
+        bad_outerboundary_test(self, in_byte)
 
 BOUNDARY = "---------------------------721837373350705526688164684"
 