diff -r 7ed567ad8b4c Doc/library/http.client.rst --- a/Doc/library/http.client.rst Tue Mar 31 22:03:59 2015 +0200 +++ b/Doc/library/http.client.rst Wed Apr 01 16:54:54 2015 -0700 @@ -174,6 +174,15 @@ A subclass of :exc:`HTTPException`. Raised if an excessively long line is received in the HTTP protocol from the server. +.. exception:: HTTPEncodingError + + A subclass of :exc:`ValueError`. Raised if HTTP-specific encoding errors + are encountered, such as multiple "chunked" entries in a "Transfer-Encoding" + header. + +.. versionadded:: 3.5 + :class:`HTTPEncodingError` + The constants defined in this module are: @@ -204,39 +213,52 @@ :class:`HTTPConnection` instances have the following methods: -.. method:: HTTPConnection.request(method, url, body=None, headers={}) +.. method:: HTTPConnection.request(method, url, body=None, headers=None) This will send a request to the server using the HTTP request method *method* and the selector *url*. - If *body* is specified, the specified data is sent after the headers are - finished. It may be a string, a :term:`bytes-like object`, an open - :term:`file object`, or an iterable of :term:`bytes-like object`\s. If - *body* is a string, it is encoded as ISO-8851-1, the default for HTTP. If - it is a bytes-like object the bytes are sent as is. If it is a :term:`file - object`, the contents of the file is sent; this file object should support - at least the ``read()`` method. If the file object has a ``mode`` - attribute, the data returned by the ``read()`` method will be encoded as - ISO-8851-1 unless the ``mode`` attribute contains the substring ``b``, - otherwise the data returned by ``read()`` is sent as is. If *body* is an - iterable, the elements of the iterable are sent as is until the iterable is - exhausted. 
+ If the *body* argument is present, it can be any of the following types: + *string*, *bytes*, iterables comprised of either *string* or *bytes* (note + the special handling of iterators detailed below), :term:`file object` + (the object should support ``fileno()`` and ``read()`` methods) or objects + implementing the :ref:`buffer interface <bufferobjects>` such as + :class:`array.array`. - The *headers* argument should be a mapping of extra HTTP - headers to send with the request. + Unencoded *string* objects are encoded as ISO-8859-1 (latin-1), the default + charset for HTTP. To use other encodings, *bytes* objects must be used. - If *headers* does not contain a Content-Length item, one is added - automatically if possible. If *body* is ``None``, the Content-Length header - is set to ``0`` for methods that expect a body (``PUT``, ``POST``, and - ``PATCH``). If *body* is a string or bytes object, the Content-Length - header is set to its length. If *body* is a :term:`file object` and it - works to call :func:`~os.fstat` on the result of its ``fileno()`` method, - then the Content-Length header is set to the ``st_size`` reported by the - ``fstat`` call. Otherwise no Content-Length header is added. + The *headers* argument should be a mapping of extra HTTP headers to send + with the request. If *headers* does not contain Content-Length, + one is added automatically if possible. If *body* is ``None``, the + Content-Length header is set to ``0`` for methods that expect a body + (``PUT``, ``POST``, and ``PATCH``). If *body* is a string or bytes object, + the Content-Length header is set to its length. If *body* is a :term:`file + object` supporting :func:`~os.fstat` or :meth:`~io.IOBase.seek`, + Content-Length will be set to the ``st_size`` reported by ``fstat`` or + the end-of-file offset returned by ``seek``, respectively. Otherwise, the Content-Length header is + not added automatically. 
+ + In cases where body implements the :ref:`iterator protocol <typeiter>` + and determining the Content-Length up front is not possible, the body will + be chunk encoded and the Transfer-Encoding header will automatically be set. + If Content-Length is supplied in *headers*, chunk encoding will not be + applied. + + If Transfer-Encoding is specified in *headers*, the client assumes that all + encoding is handled by the calling code. In this case, Content-Length will + not be automatically added to *headers*. .. versionadded:: 3.2 *body* can now be an iterable. + .. versionadded:: 3.5 + If neither Content-Length nor Transfer-Encoding is set in *headers* + and Content-Length cannot be determined (e.g. *body* is a generator), + *body* will now be automatically chunk encoded. + *headers* now defaults to None to prevent unintended side effects when + :meth:`~request` is called repeatedly with user supplied headers. + .. method:: HTTPConnection.getresponse() Should be called after a request is sent to get the response from the server. @@ -314,7 +336,7 @@ an argument. -.. method:: HTTPConnection.endheaders(message_body=None) +.. method:: HTTPConnection.endheaders(message_body=None, encode_chunked=False) Send a blank line to the server, signalling the end of the headers. The optional *message_body* argument can be used to pass a message body @@ -322,12 +344,37 @@ packet as the message headers if it is string, otherwise it is sent in a separate packet. -.. method:: HTTPConnection.send(data) + The *encode_chunked* flag is passed directly into :meth:`send` (see + :meth:`send` documentation for details). + + .. versionadded:: 3.5 + The *encode_chunked* parameter was added. + + +.. method:: HTTPConnection.send(data, encode_chunked=False) Send data to the server. This should be used directly only after the :meth:`endheaders` method has been called and before :meth:`getresponse` is called. 
+ If *encode_chunked* is ``True``, the result of each iteration of *data* will + be chunk encoded as specified in :rfc:`7230`, Section 3.3.1. How the data is + encoded is dependent on the type of *data*. If *data* implements the + :ref:`buffer interface <bufferobjects>`, or is a :class:`str`, the encoding + will result in a single chunk. If *data* is a :class:`collections.Iterable`, + each iteration of *data* will result in a chunk. If *data* is a + :term:`file object`, each call to ``.read()`` will result in a chunk. + :meth:`send` automatically signals the end of the chunk encoded data + immediately after *data*. + + .. note:: Due to the chunked encoding spec, empty chunks yielded by an + iterator body will be ignored by the chunk encoder. This is to avoid + premature termination of the read of the request by the target server due + to malformed encoding. + + .. versionadded:: 3.5 + Chunked encoding support. + .. _httpresponse-objects: diff -r 7ed567ad8b4c Doc/whatsnew/3.5.rst --- a/Doc/whatsnew/3.5.rst Tue Mar 31 22:03:59 2015 +0200 +++ b/Doc/whatsnew/3.5.rst Wed Apr 01 16:54:54 2015 -0700 @@ -298,6 +298,14 @@ subdirectories using the "``**``" pattern. (Contributed by Serhiy Storchaka in :issue:`13968`.) +http.client +----------- + +* :meth:`~http.client.HTTPConnection.request` and + :meth:`~http.client.HTTPConnection.send` both now support chunked encoding + request bodies. + (Contributed by Demian Brecht in :issue:`12319`.) 
+ imaplib ------- diff -r 7ed567ad8b4c Lib/http/client.py --- a/Lib/http/client.py Tue Mar 31 22:03:59 2015 +0200 +++ b/Lib/http/client.py Wed Apr 01 16:54:54 2015 -0700 @@ -74,6 +74,7 @@ import re import socket import collections +import types from urllib.parse import urlsplit # HTTPMessage, parse_headers(), and the HTTP status code constants are @@ -83,7 +84,8 @@ "UnknownTransferEncoding", "UnimplementedFileMode", "IncompleteRead", "InvalidURL", "ImproperConnectionState", "CannotSendRequest", "CannotSendHeader", "ResponseNotReady", - "BadStatusLine", "LineTooLong", "error", "responses"] + "BadStatusLine", "LineTooLong", "error", "responses", + "HTTPEncodingError"] HTTP_PORT = 80 HTTPS_PORT = 443 @@ -95,6 +97,7 @@ _CS_REQ_STARTED = 'Request-started' _CS_REQ_SENT = 'Request-sent' +_DEFAULT_ENCODING = 'latin-1' # hack to maintain backwards compatibility globals().update(http.HTTPStatus.__members__) @@ -142,6 +145,9 @@ # servers will otherwise respond with a 411 _METHODS_EXPECTING_BODY = {'PATCH', 'POST', 'PUT'} +class HTTPEncodingError(ValueError): + pass + class HTTPMessage(email.message.Message): # XXX The only usage of this method is in @@ -707,6 +713,49 @@ def getcode(self): return self.status + +def _get_content_length(body, method): + # Get the content-length based on the body. If the body is "empty", we + # set Content-Length: 0 for methods that expect a body (RFC 7230, + # Section 3.3.2). If the body is set for other methods, we set the + # header provided we can figure out what the length is. + if not body: + # do an explicit check for not None here to distinguish between unset + # and set but empty + if method.upper() in _METHODS_EXPECTING_BODY or body is not None: + return 0 + return + + if hasattr(body, 'read'): + try: + return os.fstat(body.fileno()).st_size + except AttributeError: + # is the object seekable? 
+ try: + curpos = body.tell() + sz = body.seek(0, io.SEEK_END) + except (TypeError, AttributeError): + if self.debuglevel > 0: + print('Unable to determine size of %r' % body) + return + else: + body.seek(curpos) + return sz + + if hasattr(body, '__iter__'): + try: + # is body a string or bytes type? + ord(body[0]) + except TypeError: + # are we looking at an iterable of ints? + if isinstance(body[0], int): + return len(body) + # nope, this is likely an iterable of iterables + return sum(len(line) for line in body) + else: + return len(body) + + class HTTPConnection: _http_vsn = 11 @@ -833,7 +882,37 @@ self.__response = None self.__state = _CS_IDLE - def send(self, data): + def _read_readable(self, readable): + blocksize = 8192 + if self.debuglevel > 0: + print("sendIng a read()able") + encode = False + try: + mode = readable.mode + except AttributeError: + # io.BytesIO and other file-like objects don't have a `mode` + # attribute. + pass + else: + if "b" not in mode: + encode = True + if self.debuglevel > 0: + print("encoding file using iso-8859-1") + while True: + datablock = readable.read(blocksize) + if not datablock: + break + if encode: + datablock = datablock.encode(_DEFAULT_ENCODING) + yield datablock + + def _read_iterable(self, iterable): + for line in iterable: + if isinstance(line, str): + line = line.encode(_DEFAULT_ENCODING) + yield line + + def send(self, data, encode_chunked=False): """Send `data' to the server. ``data`` can be a string object, a bytes object, an array object, a file-like object that supports a .read() method, or an iterable object. @@ -847,39 +926,45 @@ if self.debuglevel > 0: print("send:", repr(data)) - blocksize = 8192 - if hasattr(data, "read") : - if self.debuglevel > 0: - print("sendIng a read()able") - encode = False - try: - mode = data.mode - except AttributeError: - # io.BytesIO and other file-like objects don't have a `mode` - # attribute. 
- pass - else: - if "b" not in mode: - encode = True - if self.debuglevel > 0: - print("encoding file using iso-8859-1") - while 1: - datablock = data.read(blocksize) - if not datablock: - break - if encode: - datablock = datablock.encode("iso-8859-1") - self.sock.sendall(datablock) - return + + # create a consistent interface to the data try: - self.sock.sendall(data) + # this is solely to check to see if data implements the buffer API. + # it /would/ be easier to capture if PyObject_CheckBuffer was + # exposed to Python + memoryview(data) except TypeError: - if isinstance(data, collections.Iterable): - for d in data: - self.sock.sendall(d) + if isinstance(data, str): + read = lambda data: (data.encode(_DEFAULT_ENCODING),) + elif hasattr(data, 'read'): + read = self._read_readable + elif isinstance(data, collections.Iterable): + read = self._read_iterable else: raise TypeError("data should be a bytes-like object " "or an iterable, got %r" % type(data)) + else: + # the object implements the buffer interface and can be passed + # directly into socket methods + read = lambda data: (data,) + + for line in read(data): + if not line: + if self.debuglevel > 0: + print('Zero length line ignored') + continue + + if encode_chunked and self._http_vsn == 11: + # chunked encoding + line = b'\r\n'.join(( + format(len(line), 'X').encode('ascii'), + line, + b'')) + self.sock.sendall(line) + + if encode_chunked and self._http_vsn == 11: + # end chunked transfer + self.sock.sendall(b'0\r\n\r\n') def _output(self, s): """Add a line of output to the current request buffer. @@ -888,7 +973,7 @@ """ self._buffer.append(s) - def _send_output(self, message_body=None): + def _send_output(self, message_body=None, encode_chunked=False): """Send the currently buffered request and clear the buffer. Appends an extra \\r\\n to the buffer. 
@@ -900,7 +985,7 @@ self.send(msg) if message_body is not None: - self.send(message_body) + self.send(message_body, encode_chunked=encode_chunked) def putrequest(self, method, url, skip_host=0, skip_accept_encoding=0): """Send a request to the server. @@ -1053,7 +1138,7 @@ header = header + b': ' + value self._output(header) - def endheaders(self, message_body=None): + def endheaders(self, message_body=None, encode_chunked=False): """Indicate that the last header line has been sent to the server. This method sends the request to the server. The optional message_body @@ -1066,39 +1151,15 @@ self.__state = _CS_REQ_SENT else: raise CannotSendHeader() - self._send_output(message_body) + self._send_output(message_body, encode_chunked=encode_chunked) - def request(self, method, url, body=None, headers={}): + def request(self, method, url, body=None, headers=None): """Send a complete request to the server.""" - self._send_request(method, url, body, headers) - - def _set_content_length(self, body, method): - # Set the content-length based on the body. If the body is "empty", we - # set Content-Length: 0 for methods that expect a body (RFC 7230, - # Section 3.3.2). If the body is set for other methods, we set the - # header provided we can figure out what the length is. - thelen = None - method_expects_body = method.upper() in _METHODS_EXPECTING_BODY - if body is None and method_expects_body: - thelen = '0' - elif body is not None: - try: - thelen = str(len(body)) - except TypeError: - # If this is a file-like object, try to - # fstat its file descriptor - try: - thelen = str(os.fstat(body.fileno()).st_size) - except (AttributeError, OSError): - # Don't send a length if this failed - if self.debuglevel > 0: print("Cannot stat!!") - - if thelen is not None: - self.putheader('Content-Length', thelen) + self._send_request(method, url, body, headers or {}) def _send_request(self, method, url, body, headers): # Honor explicitly requested Host: and Accept-Encoding: headers. 
- header_names = dict.fromkeys([k.lower() for k in headers]) + header_names = {k.lower(): k for k in headers.keys()} skips = {} if 'host' in header_names: skips['skip_host'] = 1 @@ -1107,15 +1168,59 @@ self.putrequest(method, url, **skips) + # chunked encoding will happen under the following conditions: + # 1. content-length has not been explicitly set + # 2. body is a generator + # 3. HTTP/1.1 is used + # 4. Transfer-Encoding has NOT been explicitly set by the caller + + encode_chunked = False if 'content-length' not in header_names: - self._set_content_length(body, method) + # only chunk body if not explicitly set for backwards + # compatibility, assuming the client code is already handling the + # chunking + if 'transfer-encoding' not in header_names: + # if content-length cannot be automatically determined, fall + # back to chunked encoding + try: + content_length = _get_content_length(body, method) + if content_length is None: + raise TypeError + except TypeError: + if body: + # content length is applied to requests for which the + # method expects a body (i.e. PUT, POST). such requests + # should not have chunked encoding applied. + encode_chunked = True + self.putheader('Transfer-Encoding', 'chunked') + else: + self.putheader('Content-Length', str(content_length)) + else: + # transfer-encoding is specified, do some validation + + # RFC 7230, Section 3.3.1 + # A sender MUST NOT apply chunked more than once to a + # message body (i.e., chunking an already chunked message + # is not allowed). + enc = headers[header_names['transfer-encoding']].split(',') + if len([e for e in enc if e == 'chunked']) > 1: + raise HTTPEncodingError( + 'Multiple chunked encodings found. Expected 1.') + + # RFC 7230, Section 3.3.1 + # If any transfer coding other than + # chunked is applied to a request payload body, the sender + # MUST apply chunked as the final transfer coding to ensure + # that the message is properly framed. 
+ if enc[-1] != 'chunked': + raise HTTPEncodingError( + 'Chunked encoding expected as the final ' + 'Transfer-Encoding.') + + for hdr, value in headers.items(): self.putheader(hdr, value) - if isinstance(body, str): - # RFC 2616 Section 3.7.1 says that text default has a - # default charset of iso-8859-1. - body = body.encode('iso-8859-1') - self.endheaders(body) + self.endheaders(body, encode_chunked) def getresponse(self): """Get the response from the server. diff -r 7ed567ad8b4c Lib/test/test_httplib.py --- a/Lib/test/test_httplib.py Tue Mar 31 22:03:59 2015 +0200 +++ b/Lib/test/test_httplib.py Wed Apr 01 16:54:54 2015 -0700 @@ -297,6 +297,125 @@ conn.putheader(name, value) +class TransferEncodingTest(TestCase): + expected_body = b"It's just a flesh wound" + + def test_chunked(self): + conn = client.HTTPConnection('example.com') + conn.sock = FakeSocket(None) + conn.send(self._make_body(), encode_chunked=True) + + body = self._parse_chunked(conn.sock.data) + self.assertEqual(body, self.expected_body) + + def test_explicit_headers(self): + # explicit chunked + conn = client.HTTPConnection('example.com') + conn.sock = FakeSocket(None) + # this shouldn't actually be automatically chunk encoded because the + # calling code has explicitly stated that it's taking care of it + conn.request( + 'POST', '/', self._make_body(), {'Transfer-Encoding': 'chunked'}) + + _, headers, body = self._parse_request(conn.sock.data) + self.assertNotIn('content-length', [k.lower() for k in headers.keys()]) + self.assertEqual(headers['Transfer-Encoding'], 'chunked') + self.assertEqual(body, self.expected_body) + + # explicit chunked, string body + conn = client.HTTPConnection('example.com') + conn.sock = FakeSocket(None) + conn.request( + 'POST', '/', self.expected_body.decode('latin-1'), + {'Transfer-Encoding': 'chunked'}) + + _, headers, body = self._parse_request(conn.sock.data) + self.assertNotIn('content-length', [k.lower() for k in headers.keys()]) + 
self.assertEqual(headers['Transfer-Encoding'], 'chunked') + self.assertEqual(body, self.expected_body) + + # invalid ordering + conn = client.HTTPConnection('example.com') + conn.sock = FakeSocket(None) + with self.assertRaises(client.HTTPEncodingError): + conn.request( + 'POST', '/', self._make_body(), + {'Transfer-Encoding': 'chunked,gzip'}) + + # multiple chunk encodings found + conn = client.HTTPConnection('example.com') + conn.sock = FakeSocket(None) + with self.assertRaises(client.HTTPEncodingError): + conn.request( + 'POST', '/', self._make_body(), + {'Transfer-Encoding': 'chunked,gzip,chunked'}) + + def test_request(self): + for val in (False, True,): + conn = client.HTTPConnection('example.com') + conn.sock = FakeSocket(None) + conn.request( + 'POST', '/', self._make_body(empty_lines=val)) + + _, headers, body = self._parse_request(conn.sock.data) + body = self._parse_chunked(body) + self.assertEqual(body, self.expected_body) + + # Content-Length and Transfer-Encoding SHOULD not be sent in the + # same request + self.assertNotIn( + b'content-length', [h.lower() for h in headers.keys()]) + + def _make_body(self, empty_lines=False): + lines = self.expected_body.split(b' ') + for idx, line in enumerate(lines): + # for testing handling empty lines + if empty_lines and idx % 2: + yield b'' + if idx < len(lines) - 1: + yield line + b' ' + else: + yield line + + def _parse_request(self, data): + lines = data.split(b'\r\n') + request = lines[0] + headers = {} + n = 1 + while n < len(lines) and len(lines[n]) > 0: + key, val = lines[n].split(b':') + headers[key.decode('latin-1')] = val.decode('latin-1').strip() + n += 1 + + return request, headers, b'\r\n'.join(lines[n + 1:]) + + def _parse_chunked(self, data): + body = [] + trailers = {} + n = 0 + lines = data.split(b'\r\n') + # parse body + while True: + size, chunk = lines[n:n+2] + size = int(size, 16) + + if size == 0: + n += 1 + break + + self.assertEqual(size, len(chunk)) + body.append(chunk) + + n += 2 + # we 
/should/ hit the end chunk, but check against the size of + # lines so we're not stuck in an infinite loop should we get + # malformed data + if n > len(lines): + break + + return b''.join(body) + + class BasicTest(TestCase): def test_status_lines(self): # Test HTTP status lines @@ -1025,7 +1144,7 @@ # intentionally omitted for simplicity blacklist = {"HTTPMessage", "parse_headers"} for name in dir(client): - if name in blacklist: + if name in blacklist or name.startswith('_'): continue module_object = getattr(client, name) if getattr(module_object, "__module__", None) == "http.client": @@ -1344,6 +1463,25 @@ message = client.parse_headers(f) return message, f + def test_list_body(self): + cases = ( + ([b'foo', b'bar'], b'foobar'), + ((b'foo', b'bar'), b'foobar'), + ((b'foo', 'bar'), b'foobar'), + ([b'foo', 'bar'], b'foobar'), + ) + for body, expected in cases: + with self.subTest(body): + self.conn = client.HTTPConnection('example.com') + self.conn.sock = self.sock = FakeSocket('') + + self.conn.request('PUT', '/url', body) + msg, f = self.get_headers_and_fp() + self.assertNotIn('Content-Type', msg) + self.assertIsNone(msg.get_charset()) + self.assertEqual(len(expected), int(msg.get('content-length'))) + self.assertEqual(expected, f.read()) + def test_manual_content_length(self): # Set an incorrect content-length so that we can verify that # it will not be over-ridden by the library. diff -r 7ed567ad8b4c Misc/NEWS --- a/Misc/NEWS Tue Mar 31 22:03:59 2015 +0200 +++ b/Misc/NEWS Wed Apr 01 16:54:54 2015 -0700 @@ -222,6 +222,9 @@ The usage of os.scandir() reduces the number of calls to os.stat(). Initial patch written by Ben Hoyt. +- Issue #12319: Chunked transfer encoding support added to + http.client.HTTPConnection requests + Build -----