# HG changeset patch # Parent 9332a545ad851c60f9917dc159369399a94f10e5 #22852: Add SplitResult(has_netloc=...) etc parameters * Reused some of Stian’s tests; added more from various bug reports * Had to drop __slots__ to allow adding fields to tuple subclass * Had to hack robotparser diff -r 9332a545ad85 Doc/library/urllib.parse.rst --- a/Doc/library/urllib.parse.rst Thu Mar 12 22:01:30 2015 +0200 +++ b/Doc/library/urllib.parse.rst Sun Mar 22 13:05:12 2015 +0000 @@ -85,7 +85,7 @@ for this argument is :const:`True`. The return value is actually an instance of a subclass of :class:`tuple`. This - class has the following additional read-only convenience attributes: + class has the following additional read-only attributes: +------------------+-------+--------------------------+----------------------+ | Attribute | Index | Value | Value if not present | @@ -184,11 +184,17 @@ .. function:: urlunparse(parts) - Construct a URL from a tuple as returned by ``urlparse()``. The *parts* - argument can be any six-item iterable. This may result in a slightly - different, but equivalent URL, if the URL that was parsed originally had - unnecessary delimiters (for example, a ``?`` with an empty query; the RFC - states that these are equivalent). + Construct a URL from the given components. The *parts* argument can + be a :class:`ParseResult` or :class:`ParseResultBytes` object, as + returned by :func:`urlparse`, or any six-item iterable. The resulting + URL may be slightly different, but equivalent, if the URL that was + parsed originally had an unnecessary parameters delimiter + (a semicolon "``;``" without any parameters). + + .. versionchanged:: 3.5 + Added the distinction between present-but-empty and missing + :attr:`netloc`, :attr:`query`, and :attr:`fragment` components. They + are now recomposed as defined in :rfc:`3986`, section 5.3. .. 
function:: urlsplit(urlstring, scheme='', allow_fragments=True) @@ -201,7 +207,7 @@ (addressing scheme, network location, path, query, fragment identifier). The return value is actually an instance of a subclass of :class:`tuple`. This - class has the following additional read-only convenience attributes: + class has the following additional read-only attributes: +------------------+-------+-------------------------+----------------------+ | Attribute | Index | Value | Value if not present | @@ -232,11 +238,16 @@ .. function:: urlunsplit(parts) - Combine the elements of a tuple as returned by :func:`urlsplit` into a - complete URL as a string. The *parts* argument can be any five-item - iterable. This may result in a slightly different, but equivalent URL, if the - URL that was parsed originally had unnecessary delimiters (for example, a ? - with an empty query; the RFC states that these are equivalent). + Construct a URL from the given components. The *parts* argument can be a + :class:`SplitResult` or :class:`SplitResultBytes` object, as returned by + :func:`urlsplit`, or any five-item iterable. The resulting URL may be + slightly different, but equivalent, if the URL was originally parsed by + :func:`urlsplit`. + + .. versionchanged:: 3.5 + Added the distinction between present-but-empty and missing + :attr:`netloc`, :attr:`query`, and :attr:`fragment` components. They + are now recomposed as defined in :rfc:`3986`, section 5.3. .. function:: urljoin(base, url, allow_fragments=True) @@ -350,18 +361,32 @@ :func:`urldefrag` functions are subclasses of the :class:`tuple` type. These subclasses add the attributes listed in the documentation for those functions, the encoding and decoding support described in the -previous section, as well as an additional method: +previous section, as well as these attributes and method: + +.. 
attribute:: urllib.parse.SplitResult.has_netloc + urllib.parse.SplitResult.has_query + urllib.parse.SplitResult.has_fragment + + These three flags are present in :func:`urlparse` and :func:`urlsplit` + result objects, but not in :func:`urldefrag` result objects. They + indicate if the corresponding :attr:`netloc`, :attr:`query`, and + :attr:`fragment` components are actually present in the URL. These + flags are directly set by constructor keyword arguments. By default, + those arguments are :const:`None`, and the flag values are automatically + determined. In that case, the values depend on whether the corresponding + URL parts are empty strings; however :attr:`has_netloc` always defaults + to :const:`True` for some URL schemes. + + .. versionadded:: 3.5 .. method:: urllib.parse.SplitResult.geturl() Return the re-combined version of the original URL as a string. This may differ from the original URL in that the scheme may be normalized to lower - case and empty components may be dropped. Specifically, empty parameters, - queries, and fragment identifiers will be removed. + case and empty components may be dropped. Specifically, empty parameters + will be removed. - For :func:`urldefrag` results, only empty fragment identifiers will be removed. - For :func:`urlsplit` and :func:`urlparse` results, all noted changes will be - made to the URL returned by this method. + For :func:`urldefrag` results, empty fragment identifiers will also be removed. The result of this method remains unchanged if passed back through the original parsing function: @@ -370,10 +395,10 @@ >>> url = 'HTTP://www.Python.org/doc/#' >>> r1 = urlsplit(url) >>> r1.geturl() - 'http://www.Python.org/doc/' + 'http://www.Python.org/doc/#' >>> r2 = urlsplit(r1.geturl()) >>> r2.geturl() - 'http://www.Python.org/doc/' + 'http://www.Python.org/doc/#' The following classes provide the implementations of the structured parse @@ -387,18 +412,24 @@ .. versionadded:: 3.2 -.. 
class:: ParseResult(scheme, netloc, path, params, query, fragment) +.. class:: ParseResult(scheme, netloc, path, params, query, fragment, *, has_netloc=None, has_query=None, has_fragment=None) Concrete class for :func:`urlparse` results containing :class:`str` data. The :meth:`encode` method returns a :class:`ParseResultBytes` instance. -.. class:: SplitResult(scheme, netloc, path, query, fragment) + .. versionchanged:: 3.5 + Added the *has_netloc*, *has_query*, and *has_fragment* flags. + +.. class:: SplitResult(scheme, netloc, path, query, fragment, *, has_netloc=None, has_query=None, has_fragment=None) Concrete class for :func:`urlsplit` results containing :class:`str` data. The :meth:`encode` method returns a :class:`SplitResultBytes` instance. + .. versionchanged:: 3.5 + Added the *has_netloc*, *has_query*, and *has_fragment* flags. + The following classes provide the implementations of the parse results when operating on :class:`bytes` or :class:`bytearray` objects: @@ -411,7 +442,7 @@ .. versionadded:: 3.2 -.. class:: ParseResultBytes(scheme, netloc, path, params, query, fragment) +.. class:: ParseResultBytes(scheme, netloc, path, params, query, fragment, *, has_netloc=None, has_query=None, has_fragment=None) Concrete class for :func:`urlparse` results containing :class:`bytes` data. The :meth:`decode` method returns a :class:`ParseResult` @@ -419,7 +450,10 @@ .. versionadded:: 3.2 -.. class:: SplitResultBytes(scheme, netloc, path, query, fragment) + .. versionchanged:: 3.5 + Added the *has_netloc*, *has_query*, and *has_fragment* flags. + +.. class:: SplitResultBytes(scheme, netloc, path, query, fragment, *, has_netloc=None, has_query=None, has_fragment=None) Concrete class for :func:`urlsplit` results containing :class:`bytes` data. The :meth:`decode` method returns a :class:`SplitResult` @@ -427,6 +461,9 @@ .. versionadded:: 3.2 + .. versionchanged:: 3.5 + Added the *has_netloc*, *has_query*, and *has_fragment* flags. 
+ URL Quoting ----------- diff -r 9332a545ad85 Lib/test/test_urlparse.py --- a/Lib/test/test_urlparse.py Thu Mar 12 22:01:30 2015 +0200 +++ b/Lib/test/test_urlparse.py Sun Mar 22 13:05:12 2015 +0000 @@ -62,6 +62,9 @@ self.assertEqual(result3.password, result.password) self.assertEqual(result3.hostname, result.hostname) self.assertEqual(result3.port, result.port) + self.assertEqual(result3.has_netloc, result.has_netloc) + self.assertEqual(result3.has_query, result.has_query) + self.assertEqual(result3.has_fragment, result.has_fragment) # check the roundtrip using urlsplit() as well result = urllib.parse.urlsplit(url) @@ -86,6 +89,9 @@ self.assertEqual(result3.password, result.password) self.assertEqual(result3.hostname, result.hostname) self.assertEqual(result3.port, result.port) + self.assertEqual(result3.has_netloc, result.has_netloc) + self.assertEqual(result3.has_query, result.has_query) + self.assertEqual(result3.has_fragment, result.has_fragment) def test_qsl(self): for orig, expect in parse_qsl_test_cases: @@ -122,6 +128,10 @@ '','',''), ('git+ssh', 'git@github.com','/user/project.git', '', '')), + ('////evil.com', # "//evil.com" is path not netloc: Issue 23505 + ('', '', '//evil.com', '', '', ''), + ('', '', '//evil.com', '', ''), + ), ] def _encode(t): return (t[0].encode('ascii'), @@ -177,7 +187,18 @@ self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb) def test_unparse_parse(self): - str_cases = ['Python', './Python','x-newscheme://foo.com/stuff','x://y','x:/y','x:/','/',] + str_cases = [ + 'Python', + './Python', + 'x-newscheme://foo.com/stuff', + 'x://y', + 'x:/y', + 'x:/', + '/', + 'x://', # Issue 8339 + 'x:///y', # Issue 8339 + 'file:///tmp', # Issue 15009 + ] bytes_cases = [x.encode('ascii') for x in str_cases] for u in str_cases + bytes_cases: self.assertEqual(urllib.parse.urlunsplit(urllib.parse.urlsplit(u)), u) @@ -314,7 +335,7 @@ #Abnormal Examples - # The 'abnormal scenarios' are incompatible with RFC2986 parsing + # The 'abnormal 
scenarios' are incompatible with RFC2396 parsing # Tests are here for reference. self.checkJoin(RFC3986_BASE, '../../../g','http://a/g') @@ -616,6 +637,95 @@ self.assertEqual(p.port, None) self.assertEqual(p.geturl(), uri) + def test_empty_components(self): + + # Issue 22852: RFC3986 sec 5.3 + # http://tools.ietf.org/html/rfc3986#section-5.3 + # Note that we are careful to preserve the distinction between a + # component that is undefined, meaning that its separator was not + # present in the reference, and a component that is empty, meaning that + # the separator was present and was immediately followed by the next + # component separator or the end of the reference. + + tests = ( + ("/", dict( + scheme="", + netloc="", has_netloc=False, + path="/", + query="", has_query=False, fragment="", has_fragment=False, + )), + + # Silly example: Relative URI with empty username/password/query/fragment + ("//:@www.example.com:/?#", dict( + scheme="", + netloc=":@www.example.com:", has_netloc=True, + path="/", + query="", has_query=True, + fragment="", has_fragment=True, + )), + + # More realistic example: + ("http://www.example.com/document?", dict( + scheme="http", + netloc="www.example.com", has_netloc=True, + path="/document", + query="", has_query=True, + fragment="", has_fragment=False, + )), + + # Empty fragment, often used to identify ontologies + ("http://www.example.com/document#", dict( + scheme="http", + netloc="www.example.com", has_netloc=True, + path="/document", + query="", has_query=False, + fragment="", has_fragment=True, + )), + + # RFC1738 + # + # As a special case, <host> can be the string "localhost" or the empty + # string; this is interpreted as `the machine from which the URL is + # being interpreted'. 
+ ("file:///etc/hosts", dict( + scheme="file", + netloc="", has_netloc=True, + path="/etc/hosts", + query="", has_query=False, fragment="", has_fragment=False, + )), + + # Windows C: + ("file:///C:/Windows/System32/", dict( + scheme="file", + netloc="", has_netloc=True, + path="/C:/Windows/System32/", + query="", has_query=False, fragment="", has_fragment=False, + )), + + # Windows share + ("file://SERVER/share1/document.rtf", dict( + scheme="file", + netloc="SERVER", has_netloc=True, + path="/share1/document.rtf", + query="", has_query=False, fragment="", has_fragment=False, + )), + ) + for url, fields in tests: + split = urllib.parse.urlsplit(url) + parsed = urllib.parse.urlparse(url) + for attr, value in fields.items(): + with self.subTest(url, attr=attr): + self.assertEqual(getattr(split, attr), value) + self.assertEqual(getattr(parsed, attr), value) + + with self.subTest(url): + p = urllib.parse.SplitResult(**fields) + self.assertEqual(p.geturl(), url) + self.assertEqual(urllib.parse.urlunsplit(p), url) + p = urllib.parse.ParseResult(params="", **fields) + self.assertEqual(p.geturl(), url) + self.assertEqual(urllib.parse.urlunparse(p), url) + def test_noslash(self): # Issue 1637: http://foo.com?query is legal self.assertEqual(urllib.parse.urlparse("http://example.com?blahblah=/foo"), @@ -742,6 +852,47 @@ for result_type in result_types: self._check_result_type(result_type) + def test_result_default_flags(self): + # Check automatic has_netloc etc values + tests = ( + ( + ('http', 'a', '/b/c', 'query', 'fragment'), + 'http://a/b/c?query#fragment', + ), + ( + ('http', 'a', '/b/c', '', ''), + 'http://a/b/c', + ), + ( # Automatically adds //netloc part + ('file', '', '/path', '', ''), + 'file:///path', + ), + ( # No automatic //netloc part for "mailto:" + ('mailto', '', 'chris@example.com', '', ''), + 'mailto:chris@example.com', + ), + ( # No automatic //netloc part for no scheme + ('', '', '/path', '', ''), + '/path', + ), + ) + for parts, url in tests: + with 
self.subTest(parts): + result = urllib.parse.SplitResult(*parts) + self.assertEqual(result.geturl(), url) + parts = (part.encode('ascii') for part in parts) + bresult = urllib.parse.SplitResultBytes(*parts) + self.assertEqual(bresult.geturl(), url.encode('ascii')) + + result = urllib.parse.ParseResult(result.scheme, + result.netloc, result.path, '', result.query, + result.fragment) + self.assertEqual(result.geturl(), url) + bresult = urllib.parse.ParseResult(bresult.scheme, + bresult.netloc, bresult.path, b'', bresult.query, + bresult.fragment) + self.assertEqual(bresult.geturl(), url.encode('ascii')) + def test_parse_qs_encoding(self): result = urllib.parse.parse_qs("key=\u0141%E9", encoding="latin-1") self.assertEqual(result, {'key': ['\u0141\xE9']}) diff -r 9332a545ad85 Lib/urllib/parse.py --- a/Lib/urllib/parse.py Thu Mar 12 22:01:30 2015 +0200 +++ b/Lib/urllib/parse.py Sun Mar 22 13:05:12 2015 +0000 @@ -128,9 +128,29 @@ return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self)) -class _NetlocResultMixinBase(object): - """Shared methods for the parsed result objects containing a netloc element""" - __slots__ = () +class _SplitParseBase(object): + """Shared methods for the urlsplit() and urlparse() result objects""" + + @staticmethod + def __new__( + type, *pos, + has_netloc=None, has_query=None, has_fragment=None, **kw): + self = super().__new__(type, *pos, **kw) + if has_netloc is None: + has_netloc = bool(self.netloc) + if not has_netloc and self.scheme: + scheme = self.scheme + if not isinstance(scheme, str): + scheme = scheme.decode('ascii', 'replace') + has_netloc = scheme in uses_netloc + self.has_netloc = has_netloc + if has_query is None: + has_query = bool(self.query) + self.has_query = has_query + if has_fragment is None: + has_fragment = bool(self.fragment) + self.has_fragment = has_fragment + return self @property def username(self): @@ -160,9 +180,7 @@ return port -class _NetlocResultMixinStr(_NetlocResultMixinBase, 
_ResultMixinStr): - __slots__ = () - +class _NetlocResultMixinStr(_SplitParseBase, _ResultMixinStr): @property def _userinfo(self): netloc = self.netloc @@ -189,10 +207,15 @@ port = None return hostname, port + def encode(self, encoding='ascii', errors='strict'): + result = _ResultMixinStr.encode(self, encoding, errors) + result.has_netloc = self.has_netloc + result.has_query = self.has_query + result.has_fragment = self.has_fragment + return result -class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes): - __slots__ = () +class _NetlocResultMixinBytes(_SplitParseBase, _ResultMixinBytes): @property def _userinfo(self): netloc = self.netloc @@ -219,6 +242,13 @@ port = None return hostname, port + def decode(self, encoding='ascii', errors='strict'): + result = _ResultMixinBytes.decode(self, encoding, errors) + result.has_netloc = self.has_netloc + result.has_query = self.has_query + result.has_fragment = self.has_fragment + return result + from collections import namedtuple @@ -240,13 +270,11 @@ else: return self.url -class SplitResult(_SplitResultBase, _NetlocResultMixinStr): - __slots__ = () +class SplitResult(_NetlocResultMixinStr, _SplitResultBase): def geturl(self): return urlunsplit(self) -class ParseResult(_ParseResultBase, _NetlocResultMixinStr): - __slots__ = () +class ParseResult(_NetlocResultMixinStr, _ParseResultBase): def geturl(self): return urlunparse(self) @@ -259,13 +287,11 @@ else: return self.url -class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes): - __slots__ = () +class SplitResultBytes(_NetlocResultMixinBytes, _SplitResultBase): def geturl(self): return urlunsplit(self) -class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes): - __slots__ = () +class ParseResultBytes(_NetlocResultMixinBytes, _ParseResultBase): def geturl(self): return urlunparse(self) @@ -296,7 +322,11 @@ url, params = _splitparams(url) else: params = '' - result = ParseResult(scheme, netloc, url, params, query, fragment) + result = 
ParseResult(scheme, netloc, url, params, query, fragment, + has_netloc=splitresult.has_netloc, + has_query=splitresult.has_query, + has_fragment=splitresult.has_fragment, + ) return _coerce_result(result) def _splitparams(url): @@ -336,16 +366,21 @@ if url[:i] == 'http': # optimize the common case scheme = url[:i].lower() url = url[i+1:] - if url[:2] == '//': + has_netloc = url[:2] == '//' + if has_netloc: netloc, url = _splitnetloc(url, 2) if (('[' in netloc and ']' not in netloc) or (']' in netloc and '[' not in netloc)): raise ValueError("Invalid IPv6 URL") - if allow_fragments and '#' in url: + has_fragment = allow_fragments and '#' in url + if has_fragment: url, fragment = url.split('#', 1) - if '?' in url: + has_query = '?' in url + if has_query: url, query = url.split('?', 1) - v = SplitResult(scheme, netloc, url, query, fragment) + v = SplitResult(scheme, netloc, url, query, fragment, + has_netloc=has_netloc, has_query=has_query, + has_fragment=has_fragment) _parse_cache[key] = v return _coerce_result(v) for c in url[:i]: @@ -359,46 +394,64 @@ # not a port number scheme, url = url[:i].lower(), rest - if url[:2] == '//': + has_netloc = url[:2] == '//' + if has_netloc: netloc, url = _splitnetloc(url, 2) if (('[' in netloc and ']' not in netloc) or (']' in netloc and '[' not in netloc)): raise ValueError("Invalid IPv6 URL") - if allow_fragments and '#' in url: + has_fragment = allow_fragments and '#' in url + if has_fragment: url, fragment = url.split('#', 1) - if '?' in url: + has_query = '?' in url + if has_query: url, query = url.split('?', 1) - v = SplitResult(scheme, netloc, url, query, fragment) + v = SplitResult(scheme, netloc, url, query, fragment, + has_netloc=has_netloc, has_query=has_query, + has_fragment=has_fragment) _parse_cache[key] = v return _coerce_result(v) def urlunparse(components): - """Put a parsed URL back together again. 
This may result in a - slightly different, but equivalent URL, if the URL that was parsed - originally had redundant delimiters, e.g. a ? with an empty query - (the draft states that these are equivalent).""" + """Construct a URL from the given components. The argument can be a + ParseResult or ParseResultBytes object, as returned by urlparse(), or any + six-item iterable. The resulting URL may be slightly different, but + equivalent, if the URL that was parsed originally had an unnecessary + parameters delimiter (a semicolon ";" without any parameters).""" scheme, netloc, url, params, query, fragment, _coerce_result = ( _coerce_args(*components)) if params: url = "%s;%s" % (url, params) - return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment))) + split = (scheme, netloc, url, query, fragment) + if isinstance(components, _SplitParseBase): + split = SplitResult(*split, + has_netloc=components.has_netloc, has_query=components.has_query, + has_fragment=components.has_fragment) + return _coerce_result(urlunsplit(split)) def urlunsplit(components): - """Combine the elements of a tuple as returned by urlsplit() into a - complete URL as a string. The data argument can be any five-item iterable. - This may result in a slightly different, but equivalent URL, if the URL that - was parsed originally had unnecessary delimiters (for example, a ? with an - empty query; the RFC states that these are equivalent).""" + """Construct a URL from the given components. The argument can be a + SplitResult or SplitResultBytes object, as returned by urlsplit(), or any + five-item iterable. 
The resulting URL may be slightly different, but + equivalent, if the URL was originally parsed by urlsplit().""" scheme, netloc, url, query, fragment, _coerce_result = ( _coerce_args(*components)) - if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'): + if isinstance(components, _SplitParseBase): + has_netloc = components.has_netloc + has_query = components.has_query + has_fragment = components.has_fragment + else: # Support plain tuple + has_netloc = netloc or (scheme and scheme in uses_netloc) + has_query = query + has_fragment = fragment + if has_netloc: if url and url[:1] != '/': url = '/' + url url = '//' + (netloc or '') + url if scheme: url = scheme + ':' + url - if query: + if has_query: url = url + '?' + query - if fragment: + if has_fragment: url = url + '#' + fragment return _coerce_result(url) diff -r 9332a545ad85 Lib/urllib/robotparser.py --- a/Lib/urllib/robotparser.py Thu Mar 12 22:01:30 2015 +0200 +++ b/Lib/urllib/robotparser.py Sun Mar 22 13:05:12 2015 +0000 @@ -164,7 +164,8 @@ if path == '' and not allowance: # an empty value means allow all allowance = True - path = urllib.parse.urlunparse(urllib.parse.urlparse(path)) + # Forget has_query etc and force normalization of empty query parts + path = urllib.parse.urlunparse(tuple(urllib.parse.urlparse(path))) self.path = urllib.parse.quote(path) self.allowance = allowance