# HG changeset patch # Parent 9332a545ad851c60f9917dc159369399a94f10e5 #22852: Add SplitResult(has_netloc=...) etc parameters * Reused some of Stian’s tests; added more from various bug reports * Had to drop __slots__ to allow adding fields to tuple subclass * Had to hack robotparser diff -r 9332a545ad85 Doc/library/urllib.parse.rst --- a/Doc/library/urllib.parse.rst Thu Mar 12 22:01:30 2015 +0200 +++ b/Doc/library/urllib.parse.rst Fri Mar 13 00:20:17 2015 +0000 @@ -85,12 +85,12 @@ for this argument is :const:`True`. The return value is actually an instance of a subclass of :class:`tuple`. This - class has the following additional read-only convenience attributes: + class has the following additional read-only attributes: +------------------+-------+--------------------------+----------------------+ | Attribute | Index | Value | Value if not present | +==================+=======+==========================+======================+ - | :attr:`scheme` | 0 | URL scheme specifier | empty string | + | :attr:`scheme` | 0 | URL scheme specifier | default *scheme* | +------------------+-------+--------------------------+----------------------+ | :attr:`netloc` | 1 | Network location part | empty string | +------------------+-------+--------------------------+----------------------+ @@ -184,11 +184,15 @@ .. function:: urlunparse(parts) - Construct a URL from a tuple as returned by ``urlparse()``. The *parts* - argument can be any six-item iterable. This may result in a slightly - different, but equivalent URL, if the URL that was parsed originally had - unnecessary delimiters (for example, a ``?`` with an empty query; the RFC - states that these are equivalent). + Construct a URL from an object as returned by :func:`urlparse`. + The *parts* argument can also be any six-item iterable. The resulting + URL may be slightly different, but equivalent, if the URL that was + parsed originally had an unnecessary parameters delimiter + (a semicolon "``;``" without any parameters). 
+ + .. versionchanged:: 3.5 + Previously, empty :attr:`netloc`, :attr:`query`, and :attr:`fragment` + parts were sometimes removed from the new URL. .. function:: urlsplit(urlstring, scheme='', allow_fragments=True) @@ -201,12 +205,12 @@ (addressing scheme, network location, path, query, fragment identifier). The return value is actually an instance of a subclass of :class:`tuple`. This - class has the following additional read-only convenience attributes: + class has the following additional read-only attributes: +------------------+-------+-------------------------+----------------------+ | Attribute | Index | Value | Value if not present | +==================+=======+=========================+======================+ - | :attr:`scheme` | 0 | URL scheme specifier | empty string | + | :attr:`scheme` | 0 | URL scheme specifier | default *scheme* | +------------------+-------+-------------------------+----------------------+ | :attr:`netloc` | 1 | Network location part | empty string | +------------------+-------+-------------------------+----------------------+ @@ -232,11 +236,13 @@ .. function:: urlunsplit(parts) - Combine the elements of a tuple as returned by :func:`urlsplit` into a - complete URL as a string. The *parts* argument can be any five-item - iterable. This may result in a slightly different, but equivalent URL, if the - URL that was parsed originally had unnecessary delimiters (for example, a ? - with an empty query; the RFC states that these are equivalent). + Combine the elements of an object as returned by :func:`urlsplit` into a + complete URL as a string. The *parts* argument can also be any five-item + iterable. The resulting URL may be slightly different, but equivalent. + + .. versionchanged:: 3.5 + Previously, empty :attr:`netloc`, :attr:`query`, and :attr:`fragment` + parts were sometimes removed from the new URL. .. 
function:: urljoin(base, url, allow_fragments=True) @@ -350,18 +356,29 @@ :func:`urldefrag` functions are subclasses of the :class:`tuple` type. These subclasses add the attributes listed in the documentation for those functions, the encoding and decoding support described in the -previous section, as well as an additional method: +previous section, as well as these attributes and method: + +.. attribute:: urllib.parse.SplitResult.has_netloc + urllib.parse.SplitResult.has_query + urllib.parse.SplitResult.has_fragment + + These are flags indicating if the corresponding :attr:`netloc`, + :attr:`query`, and :attr:`fragment` parts are actually present in the URL. + The flags are not present in :func:`urldefrag` result objects. The default + values of these flags depend on whether the corresponding URL parts + are empty strings, however :attr:`has_netloc` always defaults to + :const:`True` for some URL schemes. + + .. versionadded:: 3.5 .. method:: urllib.parse.SplitResult.geturl() Return the re-combined version of the original URL as a string. This may differ from the original URL in that the scheme may be normalized to lower case and empty components may be dropped. Specifically, empty parameters, - queries, and fragment identifiers will be removed. + case and empty components may be dropped. Specifically, empty parameters + will be removed. - For :func:`urldefrag` results, only empty fragment identifiers will be removed. - For :func:`urlsplit` and :func:`urlparse` results, all noted changes will be - made to the URL returned by this method. + For :func:`urldefrag` results, empty fragment identifiers will also be removed. 
The result of this method remains unchanged if passed back through the original parsing function: @@ -370,10 +387,10 @@ >>> url = 'HTTP://www.Python.org/doc/#' >>> r1 = urlsplit(url) >>> r1.geturl() - 'http://www.Python.org/doc/' + 'http://www.Python.org/doc/#' >>> r2 = urlsplit(r1.geturl()) >>> r2.geturl() - 'http://www.Python.org/doc/' + 'http://www.Python.org/doc/#' The following classes provide the implementations of the structured parse @@ -387,18 +404,24 @@ .. versionadded:: 3.2 -.. class:: ParseResult(scheme, netloc, path, params, query, fragment) +.. class:: ParseResult(scheme, netloc, path, params, query, fragment, *, has_netloc=None, has_query=None, has_fragment=None) Concrete class for :func:`urlparse` results containing :class:`str` data. The :meth:`encode` method returns a :class:`ParseResultBytes` instance. -.. class:: SplitResult(scheme, netloc, path, query, fragment) + .. versionchanged:: 3.5 + Added the *has_* flags. + +.. class:: SplitResult(scheme, netloc, path, query, fragment, *, has_netloc=None, has_query=None, has_fragment=None) Concrete class for :func:`urlsplit` results containing :class:`str` data. The :meth:`encode` method returns a :class:`SplitResultBytes` instance. + .. versionchanged:: 3.5 + Added the *has_* flags. + The following classes provide the implementations of the parse results when operating on :class:`bytes` or :class:`bytearray` objects: @@ -411,7 +434,7 @@ .. versionadded:: 3.2 -.. class:: ParseResultBytes(scheme, netloc, path, params, query, fragment) +.. class:: ParseResultBytes(scheme, netloc, path, params, query, fragment, *, has_netloc=None, has_query=None, has_fragment=None) Concrete class for :func:`urlparse` results containing :class:`bytes` data. The :meth:`decode` method returns a :class:`ParseResult` @@ -419,7 +442,10 @@ .. versionadded:: 3.2 -.. class:: SplitResultBytes(scheme, netloc, path, query, fragment) + .. versionchanged:: 3.5 + Added the *has_* flags. + +.. 
class:: SplitResultBytes(scheme, netloc, path, query, fragment, *, has_netloc=None, has_query=None, has_fragment=None) Concrete class for :func:`urlsplit` results containing :class:`bytes` data. The :meth:`decode` method returns a :class:`SplitResult` @@ -427,6 +453,9 @@ .. versionadded:: 3.2 + .. versionchanged:: 3.5 + Added the *has_* flags. + URL Quoting ----------- diff -r 9332a545ad85 Lib/test/test_urlparse.py --- a/Lib/test/test_urlparse.py Thu Mar 12 22:01:30 2015 +0200 +++ b/Lib/test/test_urlparse.py Fri Mar 13 00:20:17 2015 +0000 @@ -62,6 +62,9 @@ self.assertEqual(result3.password, result.password) self.assertEqual(result3.hostname, result.hostname) self.assertEqual(result3.port, result.port) + self.assertEqual(result3.has_netloc, result.has_netloc) + self.assertEqual(result3.has_query, result.has_query) + self.assertEqual(result3.has_fragment, result.has_fragment) # check the roundtrip using urlsplit() as well result = urllib.parse.urlsplit(url) @@ -86,6 +89,9 @@ self.assertEqual(result3.password, result.password) self.assertEqual(result3.hostname, result.hostname) self.assertEqual(result3.port, result.port) + self.assertEqual(result3.has_netloc, result.has_netloc) + self.assertEqual(result3.has_query, result.has_query) + self.assertEqual(result3.has_fragment, result.has_fragment) def test_qsl(self): for orig, expect in parse_qsl_test_cases: @@ -122,6 +128,10 @@ '','',''), ('git+ssh', 'git@github.com','/user/project.git', '', '')), + ('////evil.com', # "//evil.com" is path not netloc: Issue 23505 + ('', '', '//evil.com', '', '', ''), + ('', '', '//evil.com', '', ''), + ), ] def _encode(t): return (t[0].encode('ascii'), @@ -177,7 +187,18 @@ self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb) def test_unparse_parse(self): - str_cases = ['Python', './Python','x-newscheme://foo.com/stuff','x://y','x:/y','x:/','/',] + str_cases = [ + 'Python', + './Python', + 'x-newscheme://foo.com/stuff', + 'x://y', + 'x:/y', + 'x:/', + '/', + 'x://', # Issue 
8339 + 'x:///y', # Issue 8339 + 'file:///tmp', # Issue 15009 + ] bytes_cases = [x.encode('ascii') for x in str_cases] for u in str_cases + bytes_cases: self.assertEqual(urllib.parse.urlunsplit(urllib.parse.urlsplit(u)), u) @@ -314,7 +335,7 @@ #Abnormal Examples - # The 'abnormal scenarios' are incompatible with RFC2986 parsing + # The 'abnormal scenarios' are incompatible with RFC2396 parsing # Tests are here for reference. self.checkJoin(RFC3986_BASE, '../../../g','http://a/g') @@ -616,6 +637,95 @@ self.assertEqual(p.port, None) self.assertEqual(p.geturl(), uri) + def test_empty_components(self): + + # Issue 22852: RFC3986 sec 5.3 + # http://tools.ietf.org/html/rfc3986#section-5.3 + # Note that we are careful to preserve the distinction between a + # component that is undefined, meaning that its separator was not + # present in the reference, and a component that is empty, meaning that + # the separator was present and was immediately followed by the next + # component separator or the end of the reference. 
+ + tests = ( + ("/", dict( + scheme="", + netloc="", has_netloc=False, + path="/", + query="", has_query=False, fragment="", has_fragment=False, + )), + + # Silly example: Relative URI with empty username/password/query/fragment + ("//:@www.example.com:/?#", dict( + scheme="", + netloc=":@www.example.com:", has_netloc=True, + path="/", + query="", has_query=True, + fragment="", has_fragment=True, + )), + + # More realistic example: + ("http://www.example.com/document?", dict( + scheme="http", + netloc="www.example.com", has_netloc=True, + path="/document", + query="", has_query=True, + fragment="", has_fragment=False, + )), + + # Empty fragment, often used to identify ontologies + ("http://www.example.com/document#", dict( + scheme="http", + netloc="www.example.com", has_netloc=True, + path="/document", + query="", has_query=False, + fragment="", has_fragment=True, + )), + + # RFC1738 + # + # As a special case, can be the string "localhost" or the empty + # string; this is interpreted as `the machine from which the URL is + # being interpreted'. 
+ ("file:///etc/hosts", dict( + scheme="file", + netloc="", has_netloc=True, + path="/etc/hosts", + query="", has_query=False, fragment="", has_fragment=False, + )), + + # Windows C: + ("file:///C:/Windows/System32/", dict( + scheme="file", + netloc="", has_netloc=True, + path="/C:/Windows/System32/", + query="", has_query=False, fragment="", has_fragment=False, + )), + + # Windows share + ("file://SERVER/share1/document.rtf", dict( + scheme="file", + netloc="SERVER", has_netloc=True, + path="/share1/document.rtf", + query="", has_query=False, fragment="", has_fragment=False, + )), + ) + for url, fields in tests: + split = urllib.parse.urlsplit(url) + parsed = urllib.parse.urlparse(url) + for attr, value in fields.items(): + with self.subTest(url, attr=attr): + self.assertEqual(getattr(split, attr), value) + self.assertEqual(getattr(parsed, attr), value) + + with self.subTest(url): + p = urllib.parse.SplitResult(**fields) + self.assertEqual(p.geturl(), url) + self.assertEqual(urllib.parse.urlunsplit(p), url) + p = urllib.parse.ParseResult(params="", **fields) + self.assertEqual(p.geturl(), url) + self.assertEqual(urllib.parse.urlunparse(p), url) + def test_noslash(self): # Issue 1637: http://foo.com?query is legal self.assertEqual(urllib.parse.urlparse("http://example.com?blahblah=/foo"), @@ -742,6 +852,47 @@ for result_type in result_types: self._check_result_type(result_type) + def test_result_default_flags(self): + # Check automatic has_netloc etc values + tests = ( + ( + ('http', 'a', '/b/c', 'query', 'fragment'), + 'http://a/b/c?query#fragment', + ), + ( + ('http', 'a', '/b/c', '', ''), + 'http://a/b/c', + ), + ( # Automatically adds //netloc part + ('file', '', '/path', '', ''), + 'file:///path', + ), + ( # No automatic //netloc part for "mailto:" + ('mailto', '', 'chris@example.com', '', ''), + 'mailto:chris@example.com', + ), + ( # No automatic //netloc part for no scheme + ('', '', '/path', '', ''), + '/path', + ), + ) + for parts, url in tests: + with 
self.subTest(parts): + result = urllib.parse.SplitResult(*parts) + self.assertEqual(result.geturl(), url) + parts = (part.encode('ascii') for part in parts) + bresult = urllib.parse.SplitResultBytes(*parts) + self.assertEqual(bresult.geturl(), url.encode('ascii')) + + result = urllib.parse.ParseResult(result.scheme, + result.netloc, result.path, '', result.query, + result.fragment) + self.assertEqual(result.geturl(), url) + bresult = urllib.parse.ParseResult(bresult.scheme, + bresult.netloc, bresult.path, b'', bresult.query, + bresult.fragment) + self.assertEqual(bresult.geturl(), url.encode('ascii')) + def test_parse_qs_encoding(self): result = urllib.parse.parse_qs("key=\u0141%E9", encoding="latin-1") self.assertEqual(result, {'key': ['\u0141\xE9']}) diff -r 9332a545ad85 Lib/urllib/parse.py --- a/Lib/urllib/parse.py Thu Mar 12 22:01:30 2015 +0200 +++ b/Lib/urllib/parse.py Fri Mar 13 00:20:17 2015 +0000 @@ -130,7 +130,26 @@ class _NetlocResultMixinBase(object): """Shared methods for the parsed result objects containing a netloc element""" - __slots__ = () + + @staticmethod + def __new__(type, *pos, + has_netloc=None, has_query=None, has_fragment=None, **kw): + self = super().__new__(type, *pos, **kw) + if has_netloc is None: + has_netloc = bool(self.netloc) + if not has_netloc and self.scheme: + scheme = self.scheme + if not isinstance(scheme, str): + scheme = scheme.decode('ascii', 'replace') + has_netloc = scheme in uses_netloc + self.has_netloc = has_netloc + if has_query is None: + has_query = bool(self.query) + self.has_query = has_query + if has_fragment is None: + has_fragment = bool(self.fragment) + self.has_fragment = has_fragment + return self @property def username(self): @@ -161,8 +180,6 @@ class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr): - __slots__ = () - @property def _userinfo(self): netloc = self.netloc @@ -189,10 +206,15 @@ port = None return hostname, port + def encode(self, encoding='ascii', errors='strict'): + result = 
_ResultMixinStr.encode(self, encoding, errors) + result.has_netloc = self.has_netloc + result.has_query = self.has_query + result.has_fragment = self.has_fragment + return result + class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes): - __slots__ = () - @property def _userinfo(self): netloc = self.netloc @@ -219,6 +241,13 @@ port = None return hostname, port + def decode(self, encoding='ascii', errors='strict'): + result = _ResultMixinBytes.decode(self, encoding, errors) + result.has_netloc = self.has_netloc + result.has_query = self.has_query + result.has_fragment = self.has_fragment + return result + from collections import namedtuple @@ -240,13 +269,11 @@ else: return self.url -class SplitResult(_SplitResultBase, _NetlocResultMixinStr): - __slots__ = () +class SplitResult(_NetlocResultMixinStr, _SplitResultBase): def geturl(self): return urlunsplit(self) -class ParseResult(_ParseResultBase, _NetlocResultMixinStr): - __slots__ = () +class ParseResult(_NetlocResultMixinStr, _ParseResultBase): def geturl(self): return urlunparse(self) @@ -259,13 +286,11 @@ else: return self.url -class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes): - __slots__ = () +class SplitResultBytes(_NetlocResultMixinBytes, _SplitResultBase): def geturl(self): return urlunsplit(self) -class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes): - __slots__ = () +class ParseResultBytes(_NetlocResultMixinBytes, _ParseResultBase): def geturl(self): return urlunparse(self) @@ -296,7 +321,11 @@ url, params = _splitparams(url) else: params = '' - result = ParseResult(scheme, netloc, url, params, query, fragment) + result = ParseResult(scheme, netloc, url, params, query, fragment, + has_netloc=splitresult.has_netloc, + has_query=splitresult.has_query, + has_fragment=splitresult.has_fragment, + ) return _coerce_result(result) def _splitparams(url): @@ -336,16 +365,21 @@ if url[:i] == 'http': # optimize the common case scheme = url[:i].lower() url = url[i+1:] - 
if url[:2] == '//': + has_netloc = url[:2] == '//' + if has_netloc: netloc, url = _splitnetloc(url, 2) if (('[' in netloc and ']' not in netloc) or (']' in netloc and '[' not in netloc)): raise ValueError("Invalid IPv6 URL") - if allow_fragments and '#' in url: + has_fragment = allow_fragments and '#' in url + if has_fragment: url, fragment = url.split('#', 1) - if '?' in url: + has_query = '?' in url + if has_query: url, query = url.split('?', 1) - v = SplitResult(scheme, netloc, url, query, fragment) + v = SplitResult(scheme, netloc, url, query, fragment, + has_netloc=has_netloc, has_query=has_query, + has_fragment=has_fragment) _parse_cache[key] = v return _coerce_result(v) for c in url[:i]: @@ -359,46 +393,62 @@ # not a port number scheme, url = url[:i].lower(), rest - if url[:2] == '//': + has_netloc = url[:2] == '//' + if has_netloc: netloc, url = _splitnetloc(url, 2) if (('[' in netloc and ']' not in netloc) or (']' in netloc and '[' not in netloc)): raise ValueError("Invalid IPv6 URL") - if allow_fragments and '#' in url: + has_fragment = allow_fragments and '#' in url + if has_fragment: url, fragment = url.split('#', 1) - if '?' in url: + has_query = '?' in url + if has_query: url, query = url.split('?', 1) - v = SplitResult(scheme, netloc, url, query, fragment) + v = SplitResult(scheme, netloc, url, query, fragment, + has_netloc=has_netloc, has_query=has_query, + has_fragment=has_fragment) _parse_cache[key] = v return _coerce_result(v) def urlunparse(components): """Put a parsed URL back together again. This may result in a slightly different, but equivalent URL, if the URL that was parsed - originally had redundant delimiters, e.g. a ? 
with an empty query - (the draft states that these are equivalent).""" + originally had a redundant parameters delimiter, a semicolon (;) without + any parameters.""" scheme, netloc, url, params, query, fragment, _coerce_result = ( _coerce_args(*components)) if params: url = "%s;%s" % (url, params) - return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment))) + split = (scheme, netloc, url, query, fragment) + if isinstance(components, _NetlocResultMixinBase): + split = SplitResult(*split, + has_netloc=components.has_netloc, has_query=components.has_query, + has_fragment=components.has_fragment) + return _coerce_result(urlunsplit(split)) def urlunsplit(components): - """Combine the elements of a tuple as returned by urlsplit() into a - complete URL as a string. The data argument can be any five-item iterable. - This may result in a slightly different, but equivalent URL, if the URL that - was parsed originally had unnecessary delimiters (for example, a ? with an - empty query; the RFC states that these are equivalent).""" + """Combine the elements of an object as returned by urlsplit() into a + complete URL as a string. The data argument can also be any five-item + iterable. The resulting URL may be slightly different, but equivalent.""" scheme, netloc, url, query, fragment, _coerce_result = ( _coerce_args(*components)) - if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'): + if isinstance(components, _NetlocResultMixinBase): + has_netloc = components.has_netloc + has_query = components.has_query + has_fragment = components.has_fragment + else: # Support plain tuple + has_netloc = netloc or (scheme and scheme in uses_netloc) + has_query = query + has_fragment = fragment + if has_netloc: if url and url[:1] != '/': url = '/' + url url = '//' + (netloc or '') + url if scheme: url = scheme + ':' + url - if query: + if has_query: url = url + '?' 
+ query - if fragment: + if has_fragment: url = url + '#' + fragment return _coerce_result(url) diff -r 9332a545ad85 Lib/urllib/robotparser.py --- a/Lib/urllib/robotparser.py Thu Mar 12 22:01:30 2015 +0200 +++ b/Lib/urllib/robotparser.py Fri Mar 13 00:20:17 2015 +0000 @@ -164,7 +164,8 @@ if path == '' and not allowance: # an empty value means allow all allowance = True - path = urllib.parse.urlunparse(urllib.parse.urlparse(path)) + # Forget has_query etc and force normalization of empty query parts + path = urllib.parse.urlunparse(tuple(urllib.parse.urlparse(path))) self.path = urllib.parse.quote(path) self.allowance = allowance