diff -r e7e8a218737a Lib/cgi.py
--- a/Lib/cgi.py	Sun Oct 05 11:47:01 2014 -0400
+++ b/Lib/cgi.py	Mon Oct 13 01:48:39 2014 -0700
@@ -42,6 +42,8 @@
 import html
 import locale
 import tempfile
+import enum
+import re
 
 __all__ = ["MiniFieldStorage", "FieldStorage",
            "parse", "parse_qs", "parse_qsl", "parse_multipart",
@@ -359,6 +361,345 @@
         return "MiniFieldStorage(%r, %r)" % (self.name, self.value)
 
 
+class _CgiFp:
+    """ Support class for FieldStorage. This class should not be used directly.
+
+    All methods that return data first check for data in the internal
+    buffer and then call the underlying file pointer's read and readline,
+    if necessary.
+
+    fp - any object that implements readline and read.
+    read_block_size is used to modify the maximum data loaded in buffer.
+    """
+
+    read_block_size = 1 << 17
+
+    def __init__(self,fp):
+        # self._eof indicates that the underlying fp has reached EOF.
+        # However, there might still be data available in the internal buffer.
+        # self._eof is set to True when fp.read(x) returns b"" or less than x
+        #
+        # self._last_lf indicates that the last returned data ended with lf.
+        # This indicates we should test for pattern ^Boundary in the next buffer.
+        #
+        self._fp = fp
+        self._buffer = b''
+        self._eof = False
+        self._last_lf = False
+        self._boundary = None
+        self._init_called = False
+
+    class MultiType(enum.Enum):
+        """ Enumeration for defining the type of data returned by multi_read.
+        chunk = encapsulation per RFC 1341. It is typically multi-line.
+                chunk is not guaranteed to be the complete encapsulation.
+                eg. chunk can be less than the encapsulation when the
+                internal buffer is smaller than encapsulation.
+        next_boundary = a boundary is found
+        last_boundary = last boundary is found
+        """
+
+        chunk = 1
+        next_boundary = 2
+        last_boundary = 3
+
+    def _clear_buffer(self):
+        self._buffer = b''
+
+    def _cut_buffer(self, x):
+        self._buffer = self._buffer[x:]
+
+    def _fill_buffer(self, x):
+        """ fill buffer up to x bytes.
+        if buffer already has data, x-len(data) will be read and added.
+        """
+
+        if self._eof:
+            return
+        rlen = x - len(self._buffer)
+        if rlen > 0:
+            rdata = self._fp.read(rlen)
+            if not rdata:
+                self._eof = True
+                return
+            if len(rdata) < rlen:
+                self._eof = True
+            self._buffer += rdata
+
+
+    def read(self, x):
+        """ x - amount of data to read
+
+        if x is 0, returns b"". This does not necessarily mean EOF.
+
+        if x is positive - returns data of length x. If returned data is
+        less than x it indicates an EOF. Return value of b"" indicates
+        empty string.
+
+        if x is -1, return all data up to EOF.
+
+        if x <= -2, error.
+        """
+
+        if x is not None and x == 0:
+            return b''
+
+        if -1 == x:
+            data = b''
+            while True:
+                rdata = self.read(self.read_block_size)
+                if not rdata:
+                    self._eof = True
+                    return data
+                if len(rdata) < (self.read_block_size):
+                    self._eof = True
+                data += rdata
+
+        if 0 > x:
+            raise ValueError('Invalid size '+str(x))
+
+        if self._eof:
+            if not self._buffer:
+                return b''
+            if x > len(self._buffer):
+                x = len(self._buffer)
+
+        if len(self._buffer) >= x:
+            data = self._buffer[0:x]
+            self._cut_buffer(x)
+            return data
+        else:
+            rlen = x - len(self._buffer)
+            rdata = self._fp.read(rlen)
+            if not rdata or len(rdata) < rlen:
+                self._eof = True
+            data = self._buffer + rdata
+            self._clear_buffer()
+            return data
+
+    def readline(self, x=None):
+        """ x - optional amount of data to read
+
+        Returns a line beginning at the current offset.
+
+        if x is not present, returns the buffer up to next LF.
+        Buffer without LF indicates EOF.
+
+        if x is 0, returns b"". This does not necessarily mean EOF.
+
+        if x is positive - returns the buffer up to next LF, if found in
+        the next x bytes.
+        Buffer of length less than x, which does not end with LF indicates EOF.
+
+        if x < 0, same as x is not present.
+        """
+
+        if x is not None and x == 0:
+            return b''
+        if x and 0 > x:
+            x = None
+        data_from_buffer = b''
+
+        if self._buffer:
+            if x:
+                lf = self._buffer.find(b'\n', 0, x)
+                if -1 != lf:
+                    data = self._buffer[0 : lf+1]
+                    self._cut_buffer(lf + 1)
+                    return data
+                if x <= len(self._buffer):
+                    data = self._buffer[0:x]
+                    self._cut_buffer(x)
+                    return data
+                data_from_buffer = self._buffer
+                self._clear_buffer()
+            else:
+                lf = self._buffer.find(b'\n')
+                if -1 != lf:
+                    data = self._buffer[0 : lf+1]
+                    self._cut_buffer(lf + 1)
+                    return data
+                data_from_buffer = self._buffer
+                self._clear_buffer()
+
+        #we have read as much as we can from buffer
+        if self._eof:
+            return data_from_buffer
+
+        if x:
+            rlen = x - len(data_from_buffer)
+            r_data = self._fp.readline(rlen)
+        else:
+            r_data = self._fp.readline()
+
+        if not r_data:
+            self._eof = True
+
+        return data_from_buffer + r_data
+
+    def _check_boundary_head(self, hindex):
+        ''' Boundary has been located. If boundary is of pattern,\nboundary,
+        claim the leading \r if associated with \n as part of the boundary.
+        hindex = index of start of boundary in buffer. This could be either
+                 boundary when found just after initialization or could be
+                 \nboundary.
+        returns index of the boundary including the leading CR.
+        '''
+        if hindex >= 1 and self._buffer[hindex-1 : hindex+1] == b'\r\n':
+            return hindex - 1
+        return hindex
+
+    def _check_boundary_trail(self, tindex):
+        ''' tindex = last element of the boundary.
+
+        returns the type of boundary.
+
+        A boundary could end with CRLF or LF or in case of EOF just the boundary.
+        Test if the boundary ends with --[CRLF|LF] to indicate last boundary.
+        '''
+        if self._buffer[tindex] == ord('\n'):
+            if tindex > 0:
+                if self._buffer[tindex - 1] == ord('\r'):
+                    if (tindex > 2 and
+                        self._buffer[tindex - 2] == ord(b'-') and
+                        self._buffer[tindex - 3] == ord(b'-')):
+                        #Ends as Boundary--CRLF
+                        return self.MultiType.last_boundary
+                    #Ends as BoundaryCRLF
+                    return self.MultiType.next_boundary
+                if (tindex > 1 and
+                    self._buffer[tindex - 1] == ord(b'-') and
+                    self._buffer[tindex - 2] == ord(b'-')):
+                    #Ends as Boundary--LF
+                    return self.MultiType.last_boundary
+            #Ends as BoundaryLF
+            return self.MultiType.next_boundary
+        if (tindex > 0 and
+            self._buffer[tindex] == ord(b'-') and
+            self._buffer[tindex - 1] == ord(b'-')):
+            #Ends as Boundary--
+            return self.MultiType.last_boundary
+        #Ends as Boundary
+        return self.MultiType.next_boundary
+
+    def init_multi_read(self,boundary):
+        ''' boundary = boundary including the prefix delimiter '--'
+
+        This function is required to be called to indicate start
+        of multi-part reading.
+        This indicates that the boundary can be legally present at the
+        start of the current offset.
+
+        The function also sets up compiled regexes.
+        None of the regex use expansion or wild characters.
+
+        Raise a value error if the boundary is too large.
+        '''
+
+        if (len(boundary) >= 1024 or
+            (len(boundary) + 6) >= self.read_block_size):
+            raise ValueError("multipart boundary too large")
+
+        self._init_called = True
+
+        if boundary == self._boundary:
+            return
+        self._boundary = boundary
+
+        pattern_noeof = re.escape(b'\n'+boundary+b'\r\n') \
+                    + b'|' + re.escape(b'\n'+boundary+b'\n') \
+                    + b'|' + re.escape(b'\n'+boundary+b'--\n') \
+                    + b'|' + re.escape(b'\n'+boundary+b'--\r\n')
+        self._re_noeof = re.compile(pattern_noeof)
+
+        pattern_eof = pattern_noeof \
+            + b'|' + re.escape(b'\n'+boundary) + b'$' \
+            + b'|' + re.escape(b'\n'+boundary) + b'$' \
+            + b'|' + re.escape(b'\n'+boundary+b'--') + b'$' \
+            + b'|' + re.escape(b'\n'+boundary+b'--') + b'$'
+        self._re_eof = re.compile(pattern_eof)
+
+        pattern_first_noeof = pattern_noeof \
+            + b'|^' + re.escape(boundary+b'\r\n') \
+            + b'|^' + re.escape(boundary+b'\n') \
+            + b'|^' + re.escape(boundary+b'--\n') \
+            + b'|^' + re.escape(boundary+b'--\r\n')
+        self._re_first_noeof = re.compile(pattern_first_noeof)
+
+        pattern_first_eof = pattern_first_noeof \
+            + b'|' + re.escape(b'\n'+boundary) + b'$' \
+            + b'|' + re.escape(b'\n'+boundary) + b'$' \
+            + b'|' + re.escape(b'\n'+boundary+b'--') + b'$' \
+            + b'|' + re.escape(b'\n'+boundary+b'--') + b'$' \
+            + b'|^' + re.escape(boundary) + b'$' \
+            + b'|^' + re.escape(boundary) + b'$' \
+            + b'|^' + re.escape(boundary+b'--') + b'$' \
+            + b'|^' + re.escape(boundary+b'--') + b'$'
+        self._re_first_eof = re.compile(pattern_first_eof)
+
+    def multi_read(self):
+        ''' returns a tuple of type, buffer
+        type = chunk indicates encapsulation buffer
+        type = next_boundary indicates the buffer is next boundary.
+        type = last_boundary indicates the buffer is last boundary.
+
+        boundary buffer leading (CR)LF could have been
+        consumed by a direct call to readline before init_multi
+        call(reading headers of empty body part).
+        The first returned boundary could be without leading CRLF.
+        Subsequent buffers are of the form (CR)LFBoundary(CR)LF or without
+        any trailing (CR)LF if EOF is reached.
+
+        This function runs a regex on the entire buffer.
+
+        When a boundary is not found in the non EOF block, the last
+        few elements of the boundary are retained in the buffer.
+        This ensures that the boundary does not overlap two big
+        blocks. These elements include leading/trailing CRLFS,
+        the end delimiter -- and the boundary itself.
+        '''
+
+        self._fill_buffer(self.read_block_size)
+        if not self._buffer:
+            return self.MultiType.chunk, b''
+        rsearch = None
+        if self._init_called:
+            self._init_called = False
+            if self._eof:
+                rsearch = self._re_first_eof.search(self._buffer)
+            else:
+                rsearch = self._re_first_noeof.search(self._buffer)
+        else:
+            if self._eof:
+                rsearch = self._re_eof.search(self._buffer)
+            else:
+                rsearch = self._re_noeof.search(self._buffer)
+
+        if rsearch:
+            # boundary is found
+            start_index = self._check_boundary_head(rsearch.start())
+            if start_index:
+                # return the pre boundary data first
+                rdata = self._buffer[:start_index]
+                self._cut_buffer(start_index)
+                return self.MultiType.chunk, rdata
+            else:
+                mtype = self._check_boundary_trail(rsearch.end()-1)
+                rdata = self._buffer[:rsearch.end()]
+                self._cut_buffer(rsearch.end())
+                return mtype, rdata
+        else:
+            if self._eof:
+                rdata = self._buffer
+                self._clear_buffer()
+                return self.MultiType.chunk, rdata
+            else:
+                # remove space for len(boundary)+4(CRLF)+2(--)
+                len_to_slice = len(self._buffer) - len(self._boundary) - 6
+                rdata = self._buffer[0:len_to_slice]
+                self._cut_buffer(len_to_slice)
+                return self.MultiType.chunk, rdata
+
 class FieldStorage:
 
     """Store a sequence of fields, reading multipart/form-data.
@@ -480,14 +821,16 @@
                                 "email.message.Message")
         self.headers = headers
         if fp is None:
-            self.fp = sys.stdin.buffer
+            self.fp = _CgiFp(sys.stdin.buffer)
         # self.fp.read() must return bytes
         elif isinstance(fp, TextIOWrapper):
-            self.fp = fp.buffer
+            self.fp = _CgiFp(fp.buffer)
+        elif isinstance(fp, _CgiFp):
+            self.fp = fp
         else:
             if not (hasattr(fp, 'read') and hasattr(fp, 'readline')):
                 raise TypeError("fp must be file pointer")
-            self.fp = fp
+            self.fp = _CgiFp(fp)
 
         self.encoding = encoding
         self.errors = errors
@@ -788,48 +1131,24 @@
         to bytes for comparisons.
         """
         next_boundary = b"--" + self.outerboundary
-        last_boundary = next_boundary + b"--"
-        delim = b""
-        last_line_lfend = True
+        self.fp.init_multi_read(next_boundary)
         _read = 0
         while 1:
             if _read >= self.limit:
                 break
-            line = self.fp.readline(1<<16) # bytes
+            buffertype, line = self.fp.multi_read()
+            _read += len(line)
             self.bytes_read += len(line)
-            _read += len(line)
             if not line:
                 self.done = -1
                 break
-            if delim == b"\r":
-                line = delim + line
-                delim = b""
-            if line.startswith(b"--") and last_line_lfend:
-                strippedline = line.rstrip()
-                if strippedline == next_boundary:
-                    break
-                if strippedline == last_boundary:
-                    self.done = 1
-                    break
-            odelim = delim
-            if line.endswith(b"\r\n"):
-                delim = b"\r\n"
-                line = line[:-2]
-                last_line_lfend = True
-            elif line.endswith(b"\n"):
-                delim = b"\n"
-                line = line[:-1]
-                last_line_lfend = True
-            elif line.endswith(b"\r"):
-                # We may interrupt \r\n sequences if they span the 2**16
-                # byte boundary
-                delim = b"\r"
-                line = line[:-1]
-                last_line_lfend = False
-            else:
-                delim = b""
-                last_line_lfend = False
-            self.__write(odelim + line)
+            if self.fp.MultiType.chunk == buffertype:
+                self.__write(line)
+            elif self.fp.MultiType.next_boundary == buffertype:
+                break
+            elif self.fp.MultiType.last_boundary == buffertype:
+                self.done = 1
+                break
 
     def skip_lines(self):
         """Internal: skip lines until outer boundary if defined."""
diff -r e7e8a218737a Lib/test/test_cgi.py
--- a/Lib/test/test_cgi.py	Sun Oct 05 11:47:01 2014 -0400
+++ b/Lib/test/test_cgi.py	Mon Oct 13 01:48:39 2014 -0700
@@ -5,6 +5,7 @@
 import tempfile
 import unittest
 import warnings
+import random
 from collections import namedtuple
 from io import StringIO, BytesIO
 
@@ -118,8 +119,198 @@
 
     return result
 
+class CgiMultiPartTestMethods:
+
+    nonlf_bytes = [i for i in range(256) if i != ord(b'\n')]
+    bytes_list = [i for i in range(256) if i != 10]
+
+    @classmethod
+    def make_rand_str(cls, target_len):
+        return bytearray([random.choice(cls.bytes_list)
+                          for i in range(target_len)])
+
+    @classmethod
+    def make_str_with_lfs(cls, total_size, plf):
+        # generates a random string
+        # '\n' is filled with a probability of plf
+        ba = bytearray(b'\x00' * total_size)
+        random_size = int(255 / ( 1 - plf)) - 1
+        num_lf = 0
+        for i in range(total_size):
+            r = random.randint(0, random_size)
+            if (r <= 254):
+                ba[i] = cls.nonlf_bytes[r]
+            else:
+                ba[i] = ord('\n')
+                num_lf += 1
+        return bytearray(ba), num_lf
+
+    @classmethod
+    def write_file(cls, testcase, prologue, epilogue, input_array):
+        with open(testcase.tempfile, 'wb') as outfile:
+            outfile.write(prologue)
+            outfile.write(input_array)
+            outfile.write(epilogue)
+
+    @classmethod
+    def run_bad_outerboundary_test(cls, testcase, in_byte):
+        prologue = b'''------WebKitFormBoundaryeHCgrOGACrcYuuB5
+Content-Disposition: form-data; name="textline"
+
+form
+------WebKitFormBoundaryeHCgrOGACrcYuuB5
+Content-Disposition: form-data; name="datafile"; filename="somefile.bin"
+Content-Type: application/octet-stream
+
+'''
+
+        epilogue = b'''
+---'''
+
+        cls.write_file(testcase, prologue, epilogue, in_byte)
+        size = os.stat(testcase.tempfile).st_size
+        environ = {}
+        environ["CONTENT_TYPE"] = """\
+multipart/form-data; boundary=----WebKitFormBoundaryeHCgrOGACrcYuuB5"""
+        environ["REQUEST_METHOD"] = "POST"
+        environ["CONTENT_LENGTH"] = str(size)
+        testcase.assertEqual(size, len(epilogue) + len(prologue) + len(in_byte))
+
+        try:
+            with open(testcase.tempfile,'rb') as fp:
+                fs = cgi.FieldStorage(fp, None, environ=environ)
+                testcase.assertTrue(len(fs['datafile'].value) > len(in_byte))
+        except:
+            with open(testcase.tempfile, 'rb') as f:
+                while 1:
+                    buffer = f.readline()
+                    if not buffer:
+                        break
+                    print(repr(buffer))
+            raise
+
+    @classmethod
+    def run_multi_test(cls, testcase, in_byte, use_lf=False,
+                       outer_boundary_no_trail=False,
+                       no_boundary_delimiter=False):
+
+        trail = b'\r\n'
+        outer_boundary_trail = b'\r\n'
+        outer_boundary_delimiter = b'--'
+
+        if use_lf:
+            trail = b'\n'
+            outer_boundary_trail = b'\n'
+
+        if outer_boundary_no_trail:
+            outer_boundary_trail = b''
+
+        if no_boundary_delimiter:
+            outer_boundary_delimiter = b''
+
+        prologue = b'------WebKitFormBoundaryeHCgrOGACrcYuuB5' + trail
+        prologue += b'Content-Disposition: form-data; name="textline"' + trail
+        prologue += trail
+        prologue += b'form_file'+trail
+        prologue += b'------WebKitFormBoundaryeHCgrOGACrcYuuB5' + trail
+        prologue += b'Content-Disposition: form-data;' + \
+                    b'name="datafile"; filename="somefile.bin"'
+        prologue += trail
+        prologue += b'Content-Type: application/octet-stream' + trail
+        prologue += trail
+
+        epilogue = trail + \
+                   b'------WebKitFormBoundaryeHCgrOGACrcYuuB5' + \
+                   outer_boundary_delimiter+outer_boundary_trail
+
+        cls.write_file(testcase, prologue,epilogue,in_byte)
+        size = os.stat(testcase.tempfile).st_size
+        environ = {}
+        environ["CONTENT_TYPE"] = """\
+multipart/form-data; boundary=----WebKitFormBoundaryeHCgrOGACrcYuuB5"""
+        environ["REQUEST_METHOD"] = "POST"
+        environ["CONTENT_LENGTH"] = str(size)
+        testcase.assertEqual(size, len(epilogue) + len(prologue) + len(in_byte))
+        with open(testcase.tempfile,'rb') as fp:
+            fs = cgi.FieldStorage(fp, None, environ=environ)
+            testcase.assertEqual(fs['datafile'].value, in_byte)
+
+    @classmethod
+    def multi_test(cls, testcase, in_byte, use_lf=False,
+                   outer_boundary_no_trail=False,
+                   no_boundary_delimiter=False):
+        try:
+            cls.run_multi_test(
+                testcase, in_byte, use_lf,
+                outer_boundary_no_trail, no_boundary_delimiter)
+        except:
+            print(repr(in_byte))
+            raise
+
+    @classmethod
+    def bad_outerboundary_test(cls, testcase, in_byte):
+        try:
+            cls.run_bad_outerboundary_test(testcase, in_byte)
+        except:
+            print(repr(in_byte))
+            raise
+
+    @classmethod
+    def cgifp_testfile(cls, testcase, x=None, useRead=False):
+        testcase.assertTrue(useRead == False or (useRead == True and x))
+        tsize = os.stat(testcase.tempfile).st_size
+        size_buffered = random.randint(0,tsize)
+        values_of_x = []
+        try:
+            with open(testcase.tempfile, 'rb') as fp1,\
+                 open(testcase.tempfile,'rb') as fp2:
+                cgi_fp = cgi._CgiFp(fp1)
+                cgi_fp._buffer = fp1.read(tsize)
+                while True:
+                    lx = x
+                    if x and x >= 0:
+                        lx = random.randint(0,x)
+                    values_of_x.append(lx)
+                    cgi_buffer = fp2_buffer = None
+                    if (useRead):
+                        cgi_buffer = cgi_fp.read(lx)
+                        fp2_buffer = fp2.read(lx)
+                    else:
+                        cgi_buffer = cgi_fp.readline(lx)
+                        fp2_buffer = fp2.readline(lx)
+                    testcase.assertEqual(cgi_buffer, fp2_buffer)
+                    if not cgi_buffer:
+                        break
+        except:
+            print(locals())
+            with open(testcase.tempfile, 'rb') as f:
+                while 1:
+                    buffer = f.readline()
+                    if not buffer:
+                        break
+                    print(repr(buffer))
+            raise
+
+    @classmethod
+    def cgifp_testfile_readline(cls, testcase, x=None):
+        CgiMultiPartTestMethods.cgifp_testfile(testcase, x, False)
+
+    @classmethod
+    def cgifp_testfile_read(cls, testcase, x):
+        CgiMultiPartTestMethods.cgifp_testfile(testcase, x, True)
+
 class CgiTests(unittest.TestCase):
 
+    def setUp(self):
+        fd, self.tempfile = tempfile.mkstemp()
+        os.close(fd)
+        self.addCleanup(os.unlink,self.tempfile)
+        self._cgifp_read_block_size = cgi._CgiFp.read_block_size
+
+    def tearDown(self):
+        cgi._CgiFp.read_block_size = self._cgifp_read_block_size
+        pass
+
     def test_parse_multipart(self):
         fp = BytesIO(POSTDATA.encode('latin1'))
         env = {'boundary': BOUNDARY.encode('latin1'),
@@ -425,6 +616,83 @@
             cgi.parse_header('form-data; name="files"; filename="fo\\"o;bar"'),
             ("form-data", {"name": "files", "filename": 'fo"o;bar'}))
 
+    def test_boundary_overlaps_buffer(self):
+        cgi._CgiFp.read_block_size = 1<<9
+        in_byte_len = (1 << 9) - 3
+        in_byte, lfs = CgiMultiPartTestMethods.make_str_with_lfs(
+            in_byte_len, 0.3)
+        CgiMultiPartTestMethods.multi_test(self, in_byte)
+
+    def test_payload_less_than_buffer(self):
+        in_byte_len = (1 << 9)
+        in_byte, lfs = CgiMultiPartTestMethods.make_str_with_lfs(
+            in_byte_len, 0.4)
+        CgiMultiPartTestMethods.multi_test(self, in_byte)
+
+    def test_payload_less_than_buffer_lf(self):
+        in_byte_len = (1 << 9)
+        in_byte, lfs = CgiMultiPartTestMethods.make_str_with_lfs(
+            in_byte_len, 0.3)
+        CgiMultiPartTestMethods.multi_test(self, in_byte,use_lf=True)
+
+    def test_no_outer_boundary(self):
+        in_byte_len = (1 << 9)
+        in_byte, lfs = CgiMultiPartTestMethods.make_str_with_lfs(
+            in_byte_len, 0.3)
+        CgiMultiPartTestMethods.multi_test(self, in_byte,use_lf=True,
+                                           outer_boundary_no_trail=True,
+                                           no_boundary_delimiter=True)
+
+    def test_nocrlf_at_outerboundary(self):
+        in_byte_len = (1 << 9)
+        in_byte, lfs = CgiMultiPartTestMethods.make_str_with_lfs(
+            in_byte_len, 0.3)
+        CgiMultiPartTestMethods.multi_test(self, in_byte,
+                                           use_lf=True,
+                                           outer_boundary_no_trail=True)
+
+    def test_file_biffer_than_buffer(self):
+        cgi._CgiFp.read_block_size = 1 << 7
+        in_byte_len = (1 << 9)
+        in_byte, lfs = CgiMultiPartTestMethods.make_str_with_lfs(
+            in_byte_len, 0.3)
+        CgiMultiPartTestMethods.multi_test(self, in_byte)
+
+    def test_badlyformed_multipart(self):
+        in_byte_len = (1 << 9)
+        in_byte, lfs = CgiMultiPartTestMethods.make_str_with_lfs(
+            in_byte_len, 0.3)
+        CgiMultiPartTestMethods.bad_outerboundary_test(self, in_byte)
+
+    def test_cgifp(self):
+        num_lines = [20, 256]
+        for numline in num_lines:
+            line_max_len = random.randint(1,200)
+            max_buffer_size = random.randint(1,300)
+            with open(self.tempfile, 'wb') as temp_f:
+                for i in range(numline):
+                    temp_f.write(
+                        CgiMultiPartTestMethods.make_rand_str(
+                            random.randint(
+                                1, line_max_len)))
+                    temp_f.write(b'\n')
+            with self.subTest('readline(None, ' + str(numline) + ')'):
+                CgiMultiPartTestMethods.cgifp_testfile_readline(self)
+            with self.subTest('readline(' + str(max_buffer_size) + \
+                              ', ' + str (numline) + ')'):
+                CgiMultiPartTestMethods.cgifp_testfile_readline(
+                    self, max_buffer_size)
+            with self.subTest('read(' + str(max_buffer_size) + \
+                              ', ' + str(numline) + ')'):
+                CgiMultiPartTestMethods.cgifp_testfile_read(
+                    self, max_buffer_size)
+            with self.subTest('readline(-1, ' + str(numline) + ')'):
+                CgiMultiPartTestMethods.cgifp_testfile_readline(self, -1)
+            with self.subTest('read(-1, ' + str(numline) + ')'):
+                CgiMultiPartTestMethods.cgifp_testfile_read(self, -1)
+            with self.subTest('readline(-2, ' + str(numline) + ')'):
+                CgiMultiPartTestMethods.cgifp_testfile_readline(self, -2)
+
 
 BOUNDARY = "---------------------------721837373350705526688164684"