Index: Lib/urllib/request.py =================================================================== --- Lib/urllib/request.py (revision 70664) +++ Lib/urllib/request.py (working copy) @@ -1,6 +1,3 @@ -# Issues in merging urllib and urllib2: -# 1. They both define a function named urlopen() - """An extensible library for opening URLs using a variety of protocols The simplest way to use this module is to call the urlopen function, @@ -83,6 +80,7 @@ # abstract factory for opener import base64 +import bisect import email import hashlib import http.client @@ -94,7 +92,6 @@ import socket import sys import time -import bisect from urllib.error import URLError, HTTPError, ContentTooShortError from urllib.parse import ( @@ -149,7 +146,7 @@ comparison. """ - url = request.get_full_url() + url = request.full_url host = urlparse(url)[1] if host == "": host = request.get_header("Host", "") @@ -163,11 +160,7 @@ def __init__(self, url, data=None, headers={}, origin_req_host=None, unverifiable=False): # unwrap('') --> 'type://host/path' - self.__original = unwrap(url) - self.type = None - # self.__r_type is what's left after doing the splittype - self.host = None - self.port = None + self.full_url = unwrap(url) self.data = data self.headers = {} for key, value in headers.items(): @@ -177,26 +170,23 @@ origin_req_host = request_host(self) self.origin_req_host = origin_req_host self.unverifiable = unverifiable + self._parse() - def __getattr__(self, attr): - # XXX this is a fallback mechanism to guard against these - # methods getting called in a non-standard order. this may be - # too complicated and/or unnecessary. - # XXX should the __r_XXX attributes be public? 
- if attr[:12] == '_Request__r_': - name = attr[12:] - if hasattr(Request, 'get_' + name): - getattr(self, 'get_' + name)() - return getattr(self, attr) - raise AttributeError(attr) + def _parse(self): + self.type, rest = splittype(self.full_url) + if self.type is None: + raise ValueError("unknown url type: %s" % self.full_url) + self.host, self.selector = splithost(rest) + if self.host: + self.host = unquote(self.host) def get_method(self): - if self.has_data(): + if self.data is not None: return "POST" else: return "GET" - # XXX these helper methods are lame + # Begin deprecated methods def add_data(self, data): self.data = data @@ -208,38 +198,32 @@ return self.data def get_full_url(self): - return self.__original + return self.full_url def get_type(self): - if self.type is None: - self.type, self.__r_type = splittype(self.__original) - if self.type is None: - raise ValueError("unknown url type: %s" % self.__original) return self.type def get_host(self): - if self.host is None: - self.host, self.__r_host = splithost(self.__r_type) - if self.host: - self.host = unquote(self.host) return self.host def get_selector(self): - return self.__r_host + return self.selector - def set_proxy(self, host, type): - self.host, self.type = host, type - self.__r_host = self.__original + def is_unverifiable(self): + return self.unverifiable - def has_proxy(self): - return self.__r_host == self.__original - def get_origin_req_host(self): return self.origin_req_host - def is_unverifiable(self): - return self.unverifiable + # End deprecated methods + def set_proxy(self, host, type): + self.host, self.type = host, type + self.selector = self.full_url + + def has_proxy(self): + return self.selector == self.full_url + def add_header(self, key, val): # useful for something like authentication self.headers[key.capitalize()] = val @@ -344,10 +328,10 @@ else: req = fullurl if data is not None: - req.add_data(data) + req.data = data req.timeout = timeout - protocol = req.get_type() + 
protocol = req.type # pre-process request meth_name = protocol+"_request" @@ -371,7 +355,7 @@ if result: return result - protocol = req.get_type() + protocol = req.type result = self._call_chain(self.handle_open, protocol, protocol + '_open', req) if result: @@ -481,7 +465,7 @@ class HTTPDefaultErrorHandler(BaseHandler): def http_error_default(self, req, fp, code, msg, hdrs): - raise HTTPError(req.get_full_url(), code, msg, hdrs, fp) + raise HTTPError(req.full_url, code, msg, hdrs, fp) class HTTPRedirectHandler(BaseHandler): # maximum number of redirections to any single URL @@ -504,7 +488,7 @@ m = req.get_method() if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD") or code in (301, 302, 303) and m == "POST")): - raise HTTPError(req.get_full_url(), code, msg, headers, fp) + raise HTTPError(req.full_url, code, msg, headers, fp) # Strictly (according to RFC 2616), 301 or 302 in response to # a POST MUST NOT cause a redirection without confirmation @@ -518,7 +502,7 @@ if k.lower() not in CONTENT_HEADERS) return Request(newurl, headers=newheaders, - origin_req_host=req.get_origin_req_host(), + origin_req_host=req.origin_req_host, unverifiable=True) # Implementation note: To avoid the server sending us into an @@ -542,7 +526,7 @@ urlparts[2] = "/" newurl = urlunparse(urlparts) - newurl = urljoin(req.get_full_url(), newurl) + newurl = urljoin(req.full_url, newurl) # XXX Probably want to forget about the state of the current # request, although that might interact poorly with other @@ -557,7 +541,7 @@ visited = new.redirect_dict = req.redirect_dict if (visited.get(newurl, 0) >= self.max_repeats or len(visited) >= self.max_redirections): - raise HTTPError(req.get_full_url(), code, + raise HTTPError(req.full_url, code, self.inf_msg + msg, headers, fp) else: visited = new.redirect_dict = req.redirect_dict = {} @@ -664,7 +648,7 @@ meth(r, proxy, type)) def proxy_open(self, req, proxy, type): - orig_type = req.get_type() + orig_type = req.type proxy_type, user, 
password, hostport = _parse_proxy(proxy) if proxy_type is None: proxy_type = orig_type @@ -811,7 +795,7 @@ auth_header = 'Authorization' def http_error_401(self, req, fp, code, msg, headers): - url = req.get_full_url() + url = req.full_url return self.http_error_auth_reqed('www-authenticate', url, req, headers) @@ -825,7 +809,7 @@ # authority. Assume there isn't one, since urllib.request does not (and # should not, RFC 3986 s. 3.2.1) support requests for URLs containing # userinfo. - authority = req.get_host() + authority = req.host return self.http_error_auth_reqed('proxy-authenticate', authority, req, headers) @@ -864,7 +848,7 @@ # prompting for the information. Crap. This isn't great # but it's better than the current 'repeat until recursion # depth exceeded' approach - raise HTTPError(req.get_full_url(), 401, "digest auth failed", + raise HTTPError(req.full_url, 401, "digest auth failed", headers, None) else: self.retried += 1 @@ -912,20 +896,20 @@ if H is None: return None - user, pw = self.passwd.find_user_password(realm, req.get_full_url()) + user, pw = self.passwd.find_user_password(realm, req.full_url) if user is None: return None # XXX not implemented yet - if req.has_data(): - entdig = self.get_entity_digest(req.get_data(), chal) + if req.data is not None: + entdig = self.get_entity_digest(req.data, chal) else: entdig = None A1 = "%s:%s:%s" % (user, realm, pw) A2 = "%s:%s" % (req.get_method(), # XXX selector: what about proxies and full urls - req.get_selector()) + req.selector) if qop == 'auth': self.nonce_count += 1 ncvalue = '%08x' % self.nonce_count @@ -941,7 +925,7 @@ # XXX should the partial digests be encoded too? 
base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \ - 'response="%s"' % (user, realm, nonce, req.get_selector(), + 'response="%s"' % (user, realm, nonce, req.selector, respdig) if opaque: base += ', opaque="%s"' % opaque @@ -978,7 +962,7 @@ handler_order = 490 # before Basic auth def http_error_401(self, req, fp, code, msg, headers): - host = urlparse(req.get_full_url())[1] + host = urlparse(req.full_url)[1] retry = self.http_error_auth_reqed('www-authenticate', host, req, headers) self.reset_retry_count() @@ -991,7 +975,7 @@ handler_order = 490 # before Basic auth def http_error_407(self, req, fp, code, msg, headers): - host = req.get_host() + host = req.host retry = self.http_error_auth_reqed('proxy-authenticate', host, req, headers) self.reset_retry_count() @@ -1006,12 +990,12 @@ self._debuglevel = level def do_request_(self, request): - host = request.get_host() + host = request.host if not host: raise URLError('no host given') - if request.has_data(): # POST - data = request.get_data() + if request.data is not None: # POST + data = request.data if not request.has_header('Content-type'): request.add_unredirected_header( 'Content-type', @@ -1022,7 +1006,7 @@ sel_host = host if request.has_proxy(): - scheme, sel = splittype(request.get_selector()) + scheme, sel = splittype(request.selector) sel_host, sel_path = splithost(sel) if not request.has_header('Host'): request.add_unredirected_header('Host', sel_host) @@ -1034,16 +1018,11 @@ return request def do_open(self, http_class, req): - """Return an addinfourl object for the request, using http_class. + """Return an HTTPResponse object for the request, using http_class. http_class must implement the HTTPConnection API from http.client. - The addinfourl return value is a file-like object. 
It also - has methods and attributes including: - - info(): return a email Message object for the headers - - geturl(): return the original request URL - - code: HTTP status code """ - host = req.get_host() + host = req.host if not host: raise URLError('no host given') @@ -1061,19 +1040,21 @@ # So make sure the connection gets closed after the (only) # request. headers["Connection"] = "close" - headers = dict( - (name.title(), val) for name, val in headers.items()) + headers = dict((name.title(), val) for name, val in headers.items()) try: - h.request(req.get_method(), req.get_selector(), req.data, headers) - r = h.getresponse() - except socket.error as err: # XXX what error? + h.request(req.get_method(), req.selector, req.data, headers) + r = h.getresponse() # an HTTPResponse instance + except socket.error as err: raise URLError(err) -## resp = addinfourl(r.fp, r.msg, req.get_full_url()) - resp = addinfourl(r, r.msg, req.get_full_url()) - resp.code = r.status - resp.msg = r.reason - return resp + r.url = req.full_url + # This line overwrites the .msg attribute of the HTTPResponse + # (the headers remain in .headers), because urllib clients expect + # the response to have the reason in .msg. It would be good to + # mark this attribute as deprecated and get clients to use info() + # or .headers. 
+ r.msg = r.reason + return r class HTTPHandler(AbstractHTTPHandler): @@ -1111,7 +1092,7 @@ class UnknownHandler(BaseHandler): def unknown_open(self, req): - type = req.get_type() + type = req.type raise URLError('unknown url type: %s' % type) def parse_keqv_list(l): @@ -1170,7 +1151,7 @@ class FileHandler(BaseHandler): # Use local file or FTP depending on form of URL def file_open(self, req): - url = req.get_selector() + url = req.selector if url[:2] == '//' and url[2:3] != '/': req.type = 'ftp' return self.parent.open(req) @@ -1192,8 +1173,8 @@ def open_local_file(self, req): import email.utils import mimetypes - host = req.get_host() - file = req.get_selector() + host = req.host + file = req.selector localfile = url2pathname(file) try: stats = os.stat(localfile) @@ -1223,7 +1204,7 @@ def ftp_open(self, req): import ftplib import mimetypes - host = req.get_host() + host = req.host if not host: raise URLError('ftp error: no host given') host, port = splitport(host) @@ -1246,7 +1227,7 @@ host = socket.gethostbyname(host) except socket.error as msg: raise URLError(msg) - path, attrs = splitattr(req.get_selector()) + path, attrs = splitattr(req.selector) dirs = path.split('/') dirs = list(map(unquote, dirs)) dirs, file = dirs[:-1], dirs[-1] @@ -1262,13 +1243,13 @@ type = value.upper() fp, retrlen = fw.retrfile(file, type) headers = "" - mtype = mimetypes.guess_type(req.get_full_url())[0] + mtype = mimetypes.guess_type(req.full_url)[0] if mtype: headers += "Content-type: %s\n" % mtype if retrlen is not None and retrlen >= 0: headers += "Content-length: %d\n" % retrlen headers = email.message_from_string(headers) - return addinfourl(fp, headers, req.get_full_url()) + return addinfourl(fp, headers, req.full_url) except ftplib.all_errors as msg: exc = URLError('ftp error: %s' % msg) raise exc.with_traceback(sys.exc_info()[2]) @@ -1581,9 +1562,9 @@ else: auth = None http_conn = connection_factory(host) - # XXX We should fix urllib so that it works with HTTP/1.1. 
- http_conn._http_vsn = 10 - http_conn._http_vsn_str = "HTTP/1.0" +## # XXX We should fix urllib so that it works with HTTP/1.1. +## http_conn._http_vsn = 10 +## http_conn._http_vsn_str = "HTTP/1.0" headers = {} if proxy_auth: Index: Lib/http/client.py =================================================================== --- Lib/http/client.py (revision 70664) +++ Lib/http/client.py (working copy) @@ -204,6 +204,12 @@ MAXAMOUNT = 1048576 class HTTPMessage(email.message.Message): + # XXX The only usage of this method is in + # http.server.CGIHTTPRequestHandler. Maybe move the code there so + # that it doesn't need to be part of the public API. The API has + # never been defined so this could cause backwards compatibility + # issues. + def getallmatchingheaders(self, name): """Find all header lines matching a given header name. @@ -261,21 +267,27 @@ # text following RFC 2047. The basic status line parsing only # accepts iso-8859-1. - def __init__(self, sock, debuglevel=0, strict=0, method=None): - # If the response includes a content-length header, we - # need to make sure that the client doesn't read more than the + def __init__(self, sock, debuglevel=0, strict=0, method=None, url=None): + # If the response includes a content-length header, we need to + # make sure that the client doesn't read more than the # specified number of bytes. If it does, it will block until - # the server times out and closes the connection. (The only - # applies to HTTP/1.1 connections.) This will happen if a self.fp.read() - # is done (without a size) whether self.fp is buffered or not. - # So, no self.fp.read() by clients unless they know what they are doing. + # the server times out and closes the connection. This will + # happen if a self.fp.read() is done (without a size) whether + # self.fp is buffered or not. So, no self.fp.read() by + # clients unless they know what they are doing. 
self.fp = sock.makefile("rb") self.debuglevel = debuglevel self.strict = strict self._method = method - self.msg = None + # The HTTPResponse object is returned via urllib. The clients + # of http and urllib expect different attributes for the + # headers. headers is used here and supports urllib. msg is + # provided as a backwards compatibility layer for http + # clients. + self.headers = self.msg = None + # from the Status-Line of the response self.version = _UNKNOWN # HTTP-Version self.status = _UNKNOWN # Status-Code @@ -326,7 +338,7 @@ return version, status, reason def begin(self): - if self.msg is not None: + if self.headers is not None: # we've already started reading the response return @@ -343,7 +355,7 @@ if self.debuglevel > 0: print("header:", skip) - self.status = status + self.code = self.status = status self.reason = reason.strip() if version == "HTTP/1.0": self.version = 10 @@ -358,17 +370,17 @@ self.length = None self.chunked = False self.will_close = True - self.msg = email.message_from_string('') + self.headers = self.msg = email.message_from_string('') return - self.msg = parse_headers(self.fp) + self.headers = self.msg = parse_headers(self.fp) if self.debuglevel > 0: - for hdr in self.msg: + for hdr in self.headers: print("header:", hdr, end=" ") # are we using the chunked-style of transfer encoding? - tr_enc = self.msg.get("transfer-encoding") + tr_enc = self.headers.get("transfer-encoding") if tr_enc and tr_enc.lower() == "chunked": self.chunked = True self.chunk_left = None @@ -381,10 +393,10 @@ # do we have a Content-Length? # NOTE: RFC 2616, S4.4, #3 says we ignore this if tr_enc is "chunked" self.length = None - length = self.msg.get("content-length") + length = self.headers.get("content-length") # are we using the chunked-style of transfer encoding? 
- tr_enc = self.msg.get("transfer-encoding") + tr_enc = self.headers.get("transfer-encoding") if length and not self.chunked: try: self.length = int(length) @@ -411,11 +423,11 @@ self.will_close = True def _check_close(self): - conn = self.msg.get("connection") + conn = self.headers.get("connection") if self.version == 11: # An HTTP/1.1 proxy is assumed to stay open unless # explicitly closed. - conn = self.msg.get("connection") + conn = self.headers.get("connection") if conn and "close" in conn.lower(): return True return False @@ -424,7 +436,7 @@ # connections, using rules different than HTTP/1.1. # For older HTTP, Keep-Alive indicates persistent connection. - if self.msg.get("keep-alive"): + if self.headers.get("keep-alive"): return False # At least Akamai returns a "Connection: Keep-Alive" header, @@ -433,7 +445,7 @@ return False # Proxy-Connection is a netscape hack. - pconn = self.msg.get("proxy-connection") + pconn = self.headers.get("proxy-connection") if pconn and "keep-alive" in pconn.lower(): return False @@ -584,22 +596,32 @@ return self.fp.fileno() def getheader(self, name, default=None): - if self.msg is None: + if self.headers is None: raise ResponseNotReady() - return ', '.join(self.msg.get_all(name, default)) + return ', '.join(self.headers.get_all(name, default)) def getheaders(self): """Return list of (header, value) tuples.""" - if self.msg is None: + if self.headers is None: raise ResponseNotReady() - return list(self.msg.items()) + return list(self.headers.items()) # We override IOBase.__iter__ so that it doesn't check for closed-ness def __iter__(self): return self + # For compatibility with old-style urllib responses. 
+ def info(self): + return self.headers + + def geturl(self): + return self.url + + def getcode(self): + return self.status + class HTTPConnection: _http_vsn = 11 @@ -757,7 +779,7 @@ if self.__state == _CS_IDLE: self.__state = _CS_REQ_STARTED else: - raise CannotSendRequest() + raise CannotSendRequest(self.__state) # Save the method we use, we need it later in the response phase self._method = method @@ -906,13 +928,23 @@ self.endheaders(body) def getresponse(self): - """Get the response from the server.""" + """Get the response from the server. + If the HTTPConnection is in the correct state, returns an + instance of HTTPResponse or of whatever object is returned by + the response_class variable. + + If a request has not been sent or if a previous response has + not been handled, ResponseNotReady is raised. If the HTTP + response indicates that the connection should be closed, then + it will be closed before the response is returned. When the + connection is closed, the underlying socket is closed. + """ + # if a prior response has been completed, then forget about it. if self.__response and self.__response.isclosed(): self.__response = None - # # if a prior response exists, then it must be completed (otherwise, we # cannot read this response's header to determine the connection-close # behavior) @@ -929,7 +961,7 @@ # isclosed() status to become true. 
# if self.__state != _CS_REQ_SENT or self.__response: - raise ResponseNotReady() + raise ResponseNotReady(self.__state) if self.debuglevel > 0: response = self.response_class(self.sock, self.debuglevel, Index: Lib/test/test_http_cookiejar.py =================================================================== --- Lib/test/test_http_cookiejar.py (revision 70664) +++ Lib/test/test_http_cookiejar.py (working copy) @@ -583,11 +583,6 @@ req = urllib.request.Request("http://www.acme.com/", headers={"Host": "irrelevant.com"}) self.assertEquals(request_host(req), "www.acme.com") - # not actually sure this one is valid Request object, so maybe should - # remove test for no host in url in request_host function? - req = urllib.request.Request("/resource.html", - headers={"Host": "www.acme.com"}) - self.assertEquals(request_host(req), "www.acme.com") # port shouldn't be in request-host req = urllib.request.Request("http://www.acme.com:2345/resource.html", headers={"Host": "www.acme.com:5432"}) Index: Lib/test/test_urllib2.py =================================================================== --- Lib/test/test_urllib2.py (revision 70664) +++ Lib/test/test_urllib2.py (working copy) @@ -683,8 +683,13 @@ self.msg = msg self.status = status self.reason = reason + self.code = 200 def read(self): return '' + def info(self): + return {} + def geturl(self): + return self.url class MockHTTPClass: def __init__(self): self.level = 0