# HG changeset patch
# Parent 9332a545ad851c60f9917dc159369399a94f10e5
#22852: Add SplitResult(has_netloc=...) etc parameters

* Reused some of Stian’s tests; added more from various bug reports
* Had to drop __slots__ to allow adding fields to tuple subclass
* Had to hack robotparser

diff -r 9332a545ad85 Doc/library/urllib.parse.rst
--- a/Doc/library/urllib.parse.rst	Thu Mar 12 22:01:30 2015 +0200
+++ b/Doc/library/urllib.parse.rst	Fri Mar 13 00:20:17 2015 +0000
@@ -85,12 +85,12 @@
    for this argument is :const:`True`.
 
    The return value is actually an instance of a subclass of :class:`tuple`.  This
-   class has the following additional read-only convenience attributes:
+   class has the following additional read-only attributes:
 
    +------------------+-------+--------------------------+----------------------+
    | Attribute        | Index | Value                    | Value if not present |
    +==================+=======+==========================+======================+
-   | :attr:`scheme`   | 0     | URL scheme specifier     | empty string         |
+   | :attr:`scheme`   | 0     | URL scheme specifier     | default *scheme*     |
    +------------------+-------+--------------------------+----------------------+
    | :attr:`netloc`   | 1     | Network location part    | empty string         |
    +------------------+-------+--------------------------+----------------------+
@@ -184,11 +184,15 @@
 
 .. function:: urlunparse(parts)
 
-   Construct a URL from a tuple as returned by ``urlparse()``. The *parts*
-   argument can be any six-item iterable. This may result in a slightly
-   different, but equivalent URL, if the URL that was parsed originally had
-   unnecessary delimiters (for example, a ``?`` with an empty query; the RFC
-   states that these are equivalent).
+   Construct a URL from an object as returned by :func:`urlparse`.
+   The *parts* argument can also be any six-item iterable. The resulting
+   URL may be slightly different, but equivalent, if the URL that was
+   parsed originally had an unnecessary parameters delimiter
+   (a semicolon "``;``" without any parameters).
+
+   .. versionchanged:: 3.5
+      Previously, empty :attr:`netloc`, :attr:`query`, and :attr:`fragment`
+      parts were sometimes removed from the new URL.
 
 
 .. function:: urlsplit(urlstring, scheme='', allow_fragments=True)
@@ -201,12 +205,12 @@
    (addressing scheme, network location, path, query, fragment identifier).
 
    The return value is actually an instance of a subclass of :class:`tuple`.  This
-   class has the following additional read-only convenience attributes:
+   class has the following additional read-only attributes:
 
    +------------------+-------+-------------------------+----------------------+
    | Attribute        | Index | Value                   | Value if not present |
    +==================+=======+=========================+======================+
-   | :attr:`scheme`   | 0     | URL scheme specifier    | empty string         |
+   | :attr:`scheme`   | 0     | URL scheme specifier    | default *scheme*     |
    +------------------+-------+-------------------------+----------------------+
    | :attr:`netloc`   | 1     | Network location part   | empty string         |
    +------------------+-------+-------------------------+----------------------+
@@ -232,11 +236,13 @@
 
 .. function:: urlunsplit(parts)
 
-   Combine the elements of a tuple as returned by :func:`urlsplit` into a
-   complete URL as a string. The *parts* argument can be any five-item
-   iterable. This may result in a slightly different, but equivalent URL, if the
-   URL that was parsed originally had unnecessary delimiters (for example, a ?
-   with an empty query; the RFC states that these are equivalent).
+   Combine the elements of an object as returned by :func:`urlsplit` into a
+   complete URL as a string. The *parts* argument can also be any five-item
+   iterable. The resulting URL may be slightly different, but equivalent.
+
+   .. versionchanged:: 3.5
+      Previously, empty :attr:`netloc`, :attr:`query`, and :attr:`fragment`
+      parts were sometimes removed from the new URL.
 
 
 .. function:: urljoin(base, url, allow_fragments=True)
@@ -350,18 +356,29 @@
 :func:`urldefrag` functions are subclasses of the :class:`tuple` type.
 These subclasses add the attributes listed in the documentation for
 those functions, the encoding and decoding support described in the
-previous section, as well as an additional method:
+previous section, as well as these attributes and method:
+
+.. attribute:: urllib.parse.SplitResult.has_netloc
+               urllib.parse.SplitResult.has_query
+               urllib.parse.SplitResult.has_fragment
+
+   These are flags indicating if the corresponding :attr:`netloc`,
+   :attr:`query`, and :attr:`fragment` parts are actually present in the URL.
+   The flags are not present in :func:`urldefrag` result objects. The default
+   values of these flags depend on whether the corresponding URL parts
+   are empty strings; however, :attr:`has_netloc` always defaults to
+   :const:`True` for some URL schemes.
+
+   .. versionadded:: 3.5
 
 .. method:: urllib.parse.SplitResult.geturl()
 
    Return the re-combined version of the original URL as a string. This may
    differ from the original URL in that the scheme may be normalized to lower
-   case and empty components may be dropped. Specifically, empty parameters,
-   queries, and fragment identifiers will be removed.
+   case and empty components may be dropped. Specifically, empty parameters
+   will be removed.
 
-   For :func:`urldefrag` results, only empty fragment identifiers will be removed.
-   For :func:`urlsplit` and :func:`urlparse` results, all noted changes will be
-   made to the URL returned by this method.
+   For :func:`urldefrag` results, empty fragment identifiers will also be removed.
 
    The result of this method remains unchanged if passed back through the original
    parsing function:
@@ -370,10 +387,10 @@
       >>> url = 'HTTP://www.Python.org/doc/#'
       >>> r1 = urlsplit(url)
       >>> r1.geturl()
-      'http://www.Python.org/doc/'
+      'http://www.Python.org/doc/#'
       >>> r2 = urlsplit(r1.geturl())
       >>> r2.geturl()
-      'http://www.Python.org/doc/'
+      'http://www.Python.org/doc/#'
 
 
 The following classes provide the implementations of the structured parse
@@ -387,18 +404,24 @@
 
    .. versionadded:: 3.2
 
-.. class:: ParseResult(scheme, netloc, path, params, query, fragment)
+.. class:: ParseResult(scheme, netloc, path, params, query, fragment, *, has_netloc=None, has_query=None, has_fragment=None)
 
    Concrete class for :func:`urlparse` results containing :class:`str`
    data. The :meth:`encode` method returns a :class:`ParseResultBytes`
    instance.
 
-.. class:: SplitResult(scheme, netloc, path, query, fragment)
+   .. versionchanged:: 3.5
+      Added the *has_* flags.
+
+.. class:: SplitResult(scheme, netloc, path, query, fragment, *, has_netloc=None, has_query=None, has_fragment=None)
 
    Concrete class for :func:`urlsplit` results containing :class:`str`
    data. The :meth:`encode` method returns a :class:`SplitResultBytes`
    instance.
 
+   .. versionchanged:: 3.5
+      Added the *has_* flags.
+
 
 The following classes provide the implementations of the parse results when
 operating on :class:`bytes` or :class:`bytearray` objects:
@@ -411,7 +434,7 @@
 
    .. versionadded:: 3.2
 
-.. class:: ParseResultBytes(scheme, netloc, path, params, query, fragment)
+.. class:: ParseResultBytes(scheme, netloc, path, params, query, fragment, *, has_netloc=None, has_query=None, has_fragment=None)
 
    Concrete class for :func:`urlparse` results containing :class:`bytes`
    data. The :meth:`decode` method returns a :class:`ParseResult`
@@ -419,7 +442,10 @@
 
    .. versionadded:: 3.2
 
-.. class:: SplitResultBytes(scheme, netloc, path, query, fragment)
+   .. versionchanged:: 3.5
+      Added the *has_* flags.
+
+.. class:: SplitResultBytes(scheme, netloc, path, query, fragment, *, has_netloc=None, has_query=None, has_fragment=None)
 
    Concrete class for :func:`urlsplit` results containing :class:`bytes`
    data. The :meth:`decode` method returns a :class:`SplitResult`
@@ -427,6 +453,9 @@
 
    .. versionadded:: 3.2
 
+   .. versionchanged:: 3.5
+      Added the *has_* flags.
+
 
 URL Quoting
 -----------
diff -r 9332a545ad85 Lib/test/test_urlparse.py
--- a/Lib/test/test_urlparse.py	Thu Mar 12 22:01:30 2015 +0200
+++ b/Lib/test/test_urlparse.py	Fri Mar 13 00:20:17 2015 +0000
@@ -62,6 +62,9 @@
         self.assertEqual(result3.password, result.password)
         self.assertEqual(result3.hostname, result.hostname)
         self.assertEqual(result3.port,     result.port)
+        self.assertEqual(result3.has_netloc,     result.has_netloc)
+        self.assertEqual(result3.has_query,     result.has_query)
+        self.assertEqual(result3.has_fragment,     result.has_fragment)
 
         # check the roundtrip using urlsplit() as well
         result = urllib.parse.urlsplit(url)
@@ -86,6 +89,9 @@
         self.assertEqual(result3.password, result.password)
         self.assertEqual(result3.hostname, result.hostname)
         self.assertEqual(result3.port,     result.port)
+        self.assertEqual(result3.has_netloc,     result.has_netloc)
+        self.assertEqual(result3.has_query,     result.has_query)
+        self.assertEqual(result3.has_fragment,     result.has_fragment)
 
     def test_qsl(self):
         for orig, expect in parse_qsl_test_cases:
@@ -122,6 +128,10 @@
              '','',''),
             ('git+ssh', 'git@github.com','/user/project.git',
              '', '')),
+            ('////evil.com',  # "//evil.com" is path not netloc: Issue 23505
+                ('', '', '//evil.com', '', '', ''),
+                ('', '', '//evil.com', '', ''),
+            ),
             ]
         def _encode(t):
             return (t[0].encode('ascii'),
@@ -177,7 +187,18 @@
         self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb)
 
     def test_unparse_parse(self):
-        str_cases = ['Python', './Python','x-newscheme://foo.com/stuff','x://y','x:/y','x:/','/',]
+        str_cases = [
+            'Python',
+            './Python',
+            'x-newscheme://foo.com/stuff',
+            'x://y',
+            'x:/y',
+            'x:/',
+            '/',
+            'x://',  # Issue 8339
+            'x:///y',  # Issue 8339
+            'file:///tmp',  # Issue 15009
+        ]
         bytes_cases = [x.encode('ascii') for x in str_cases]
         for u in str_cases + bytes_cases:
             self.assertEqual(urllib.parse.urlunsplit(urllib.parse.urlsplit(u)), u)
@@ -314,7 +335,7 @@
 
         #Abnormal Examples
 
-        # The 'abnormal scenarios' are incompatible with RFC2986 parsing
+        # The 'abnormal scenarios' are incompatible with RFC2396 parsing
         # Tests are here for reference.
 
         self.checkJoin(RFC3986_BASE, '../../../g','http://a/g')
@@ -616,6 +637,95 @@
         self.assertEqual(p.port, None)
         self.assertEqual(p.geturl(), uri)
 
+    def test_empty_components(self):
+        # Parsing and unparsing must preserve empty-but-present components.
+        # Issue 22852: RFC3986 sec 5.3
+        # http://tools.ietf.org/html/rfc3986#section-5.3
+        # Note that we are careful to preserve the distinction between a
+        # component that is undefined, meaning that its separator was not
+        # present in the reference, and a component that is empty, meaning that
+        # the separator was present and was immediately followed by the next
+        # component separator or the end of the reference.
+
+        tests = (
+            ("/", dict(
+                scheme="",
+                netloc="", has_netloc=False,
+                path="/",
+                query="", has_query=False, fragment="", has_fragment=False,
+            )),
+
+            # Contrived: empty username, password, port, query and fragment
+            ("//:@www.example.com:/?#", dict(
+                scheme="",
+                netloc=":@www.example.com:", has_netloc=True,
+                path="/",
+                query="", has_query=True,
+                fragment="", has_fragment=True,
+            )),
+
+            # More realistic example: empty query ("?" with nothing after it)
+            ("http://www.example.com/document?", dict(
+                scheme="http",
+                netloc="www.example.com", has_netloc=True,
+                path="/document",
+                query="", has_query=True,
+                fragment="", has_fragment=False,
+            )),
+
+            # Empty fragment, often used to identify ontologies
+            ("http://www.example.com/document#", dict(
+                scheme="http",
+                netloc="www.example.com", has_netloc=True,
+                path="/document",
+                query="", has_query=False,
+                fragment="", has_fragment=True,
+            )),
+
+            # RFC1738
+            #
+            # As a special case, <host> can be the string "localhost" or the empty
+            # string; this is interpreted as `the machine from which the URL is
+            # being interpreted'.
+            ("file:///etc/hosts", dict(
+                scheme="file",
+                netloc="", has_netloc=True,
+                path="/etc/hosts",
+                query="", has_query=False, fragment="", has_fragment=False,
+            )),
+
+            # Windows drive-letter path: empty netloc, "C:" stays in the path
+            ("file:///C:/Windows/System32/", dict(
+                scheme="file",
+                netloc="", has_netloc=True,
+                path="/C:/Windows/System32/",
+                query="", has_query=False, fragment="", has_fragment=False,
+            )),
+
+            # Windows share: the server name is the netloc
+            ("file://SERVER/share1/document.rtf", dict(
+                scheme="file",
+                netloc="SERVER", has_netloc=True,
+                path="/share1/document.rtf",
+                query="", has_query=False, fragment="", has_fragment=False,
+            )),
+        )
+        for url, fields in tests:
+            split = urllib.parse.urlsplit(url)
+            parsed = urllib.parse.urlparse(url)
+            for attr, value in fields.items():
+                with self.subTest(url, attr=attr):
+                    self.assertEqual(getattr(split, attr), value)
+                    self.assertEqual(getattr(parsed, attr), value)
+
+            with self.subTest(url):
+                p = urllib.parse.SplitResult(**fields)
+                self.assertEqual(p.geturl(), url)
+                self.assertEqual(urllib.parse.urlunsplit(p), url)
+                p = urllib.parse.ParseResult(params="", **fields)
+                self.assertEqual(p.geturl(), url)
+                self.assertEqual(urllib.parse.urlunparse(p), url)
+
     def test_noslash(self):
         # Issue 1637: http://foo.com?query is legal
         self.assertEqual(urllib.parse.urlparse("http://example.com?blahblah=/foo"),
@@ -742,6 +852,47 @@
         for result_type in result_types:
             self._check_result_type(result_type)
 
+    def test_result_default_flags(self):
+        # Check the has_netloc etc values inferred when no flags are passed
+        tests = (
+            (
+                ('http', 'a', '/b/c', 'query', 'fragment'),
+                'http://a/b/c?query#fragment',
+            ),
+            (
+                ('http', 'a', '/b/c', '', ''),
+                'http://a/b/c',
+            ),
+            (  # Automatically adds //netloc part
+                ('file', '', '/path', '', ''),
+                'file:///path',
+            ),
+            (  # No automatic //netloc part for "mailto:"
+                ('mailto', '', 'chris@example.com', '', ''),
+                'mailto:chris@example.com',
+            ),
+            (  # No automatic //netloc part for no scheme
+                ('', '', '/path', '', ''),
+                '/path',
+            ),
+        )
+        for parts, url in tests:
+            with self.subTest(parts):
+                result = urllib.parse.SplitResult(*parts)
+                self.assertEqual(result.geturl(), url)
+                bparts = (part.encode('ascii') for part in parts)
+                bresult = urllib.parse.SplitResultBytes(*bparts)
+                self.assertEqual(bresult.geturl(), url.encode('ascii'))
+                # Same defaults for the urlparse() classes, str and bytes
+                result = urllib.parse.ParseResult(result.scheme,
+                    result.netloc, result.path, '', result.query,
+                    result.fragment)
+                self.assertEqual(result.geturl(), url)
+                bresult = urllib.parse.ParseResultBytes(bresult.scheme,
+                    bresult.netloc, bresult.path, b'', bresult.query,
+                    bresult.fragment)
+                self.assertEqual(bresult.geturl(), url.encode('ascii'))
+
     def test_parse_qs_encoding(self):
         result = urllib.parse.parse_qs("key=\u0141%E9", encoding="latin-1")
         self.assertEqual(result, {'key': ['\u0141\xE9']})
diff -r 9332a545ad85 Lib/urllib/parse.py
--- a/Lib/urllib/parse.py	Thu Mar 12 22:01:30 2015 +0200
+++ b/Lib/urllib/parse.py	Fri Mar 13 00:20:17 2015 +0000
@@ -130,7 +130,26 @@
 
 class _NetlocResultMixinBase(object):
     """Shared methods for the parsed result objects containing a netloc element"""
-    __slots__ = ()
+
+    def __new__(cls, *pos,
+            has_netloc=None, has_query=None, has_fragment=None, **kw):
+        """Build the result tuple; has_* flags left as None are inferred."""
+        self = super().__new__(cls, *pos, **kw)
+        if has_netloc is None:
+            has_netloc = bool(self.netloc)
+            if not has_netloc and self.scheme:
+                scheme = self.scheme
+                if not isinstance(scheme, str):  # bytes result classes
+                    scheme = scheme.decode('ascii', 'replace')
+                has_netloc = scheme in uses_netloc  # e.g. "file:///path"
+        self.has_netloc = has_netloc
+        if has_query is None:
+            has_query = bool(self.query)
+        self.has_query = has_query
+        if has_fragment is None:
+            has_fragment = bool(self.fragment)
+        self.has_fragment = has_fragment
+        return self
 
     @property
     def username(self):
@@ -161,8 +180,6 @@
 
 
 class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
-    __slots__ = ()
-
     @property
     def _userinfo(self):
         netloc = self.netloc
@@ -189,10 +206,15 @@
             port = None
         return hostname, port
 
+    def encode(self, encoding='ascii', errors='strict'):
+        result = _ResultMixinStr.encode(self, encoding, errors)
+        for flag in ('has_netloc', 'has_query', 'has_fragment'):
+            if hasattr(self, flag):  # absent after _replace()/_make()
+                setattr(result, flag, getattr(self, flag))
+        return result
+
 
 class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
-    __slots__ = ()
-
     @property
     def _userinfo(self):
         netloc = self.netloc
@@ -219,6 +241,13 @@
             port = None
         return hostname, port
 
+    def decode(self, encoding='ascii', errors='strict'):
+        result = _ResultMixinBytes.decode(self, encoding, errors)
+        for flag in ('has_netloc', 'has_query', 'has_fragment'):
+            if hasattr(self, flag):  # absent after _replace()/_make()
+                setattr(result, flag, getattr(self, flag))
+        return result
+
 
 from collections import namedtuple
 
@@ -240,13 +269,11 @@
         else:
             return self.url
 
-class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
-    __slots__ = ()
+class SplitResult(_NetlocResultMixinStr, _SplitResultBase):
     def geturl(self):
         return urlunsplit(self)
 
-class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
-    __slots__ = ()
+class ParseResult(_NetlocResultMixinStr, _ParseResultBase):
     def geturl(self):
         return urlunparse(self)
 
@@ -259,13 +286,11 @@
         else:
             return self.url
 
-class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
-    __slots__ = ()
+class SplitResultBytes(_NetlocResultMixinBytes, _SplitResultBase):
     def geturl(self):
         return urlunsplit(self)
 
-class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
-    __slots__ = ()
+class ParseResultBytes(_NetlocResultMixinBytes, _ParseResultBase):
     def geturl(self):
         return urlunparse(self)
 
@@ -296,7 +321,11 @@
         url, params = _splitparams(url)
     else:
         params = ''
-    result = ParseResult(scheme, netloc, url, params, query, fragment)
+    result = ParseResult(scheme, netloc, url, params, query, fragment,
+        has_netloc=splitresult.has_netloc,
+        has_query=splitresult.has_query,
+        has_fragment=splitresult.has_fragment,
+    )
     return _coerce_result(result)
 
 def _splitparams(url):
@@ -336,16 +365,21 @@
         if url[:i] == 'http': # optimize the common case
             scheme = url[:i].lower()
             url = url[i+1:]
-            if url[:2] == '//':
+            has_netloc = url[:2] == '//'
+            if has_netloc:
                 netloc, url = _splitnetloc(url, 2)
                 if (('[' in netloc and ']' not in netloc) or
                         (']' in netloc and '[' not in netloc)):
                     raise ValueError("Invalid IPv6 URL")
-            if allow_fragments and '#' in url:
+            has_fragment = allow_fragments and '#' in url
+            if has_fragment:
                 url, fragment = url.split('#', 1)
-            if '?' in url:
+            has_query = '?' in url
+            if has_query:
                 url, query = url.split('?', 1)
-            v = SplitResult(scheme, netloc, url, query, fragment)
+            v = SplitResult(scheme, netloc, url, query, fragment,
+                has_netloc=has_netloc, has_query=has_query,
+                has_fragment=has_fragment)
             _parse_cache[key] = v
             return _coerce_result(v)
         for c in url[:i]:
@@ -359,46 +393,62 @@
                 # not a port number
                 scheme, url = url[:i].lower(), rest
 
-    if url[:2] == '//':
+    has_netloc = url[:2] == '//'
+    if has_netloc:
         netloc, url = _splitnetloc(url, 2)
         if (('[' in netloc and ']' not in netloc) or
                 (']' in netloc and '[' not in netloc)):
             raise ValueError("Invalid IPv6 URL")
-    if allow_fragments and '#' in url:
+    has_fragment = allow_fragments and '#' in url
+    if has_fragment:
         url, fragment = url.split('#', 1)
-    if '?' in url:
+    has_query = '?' in url
+    if has_query:
         url, query = url.split('?', 1)
-    v = SplitResult(scheme, netloc, url, query, fragment)
+    v = SplitResult(scheme, netloc, url, query, fragment,
+        has_netloc=has_netloc, has_query=has_query,
+        has_fragment=has_fragment)
     _parse_cache[key] = v
     return _coerce_result(v)
 
 def urlunparse(components):
     """Put a parsed URL back together again.  This may result in a
     slightly different, but equivalent URL, if the URL that was parsed
-    originally had redundant delimiters, e.g. a ? with an empty query
-    (the draft states that these are equivalent)."""
+    originally had a redundant parameters delimiter, a semicolon (;) without
+    any parameters."""
     scheme, netloc, url, params, query, fragment, _coerce_result = (
                                                   _coerce_args(*components))
     if params:
         url = "%s;%s" % (url, params)
-    return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))
+    # Plain tuples and _replace()/_make() results carry no has_* flags; None
+    # makes SplitResult derive its defaults from the parts themselves.
+    flags = {name: getattr(components, name, None)
+             for name in ('has_netloc', 'has_query', 'has_fragment')}
+    split = SplitResult(scheme, netloc, url, query, fragment, **flags)
+    return _coerce_result(urlunsplit(split))
 
 def urlunsplit(components):
-    """Combine the elements of a tuple as returned by urlsplit() into a
-    complete URL as a string. The data argument can be any five-item iterable.
-    This may result in a slightly different, but equivalent URL, if the URL that
-    was parsed originally had unnecessary delimiters (for example, a ? with an
-    empty query; the RFC states that these are equivalent)."""
+    """Combine the elements of an object as returned by urlsplit() into a
+    complete URL as a string. The components argument can also be any five-item
+    iterable. The resulting URL may be slightly different, but equivalent."""
     scheme, netloc, url, query, fragment, _coerce_result = (
                                           _coerce_args(*components))
-    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
+    # Use explicit has_* flags when the object carries them.  Plain tuples
+    # never do, and neither do namedtuple _replace()/_make() results (they
+    # bypass __new__), so fall back to the defaults instead of failing.
+    has_netloc = getattr(components, 'has_netloc', None)
+    if has_netloc is None:
+        has_netloc = netloc or (scheme and scheme in uses_netloc)
+    has_query = getattr(components, 'has_query', query)
+    has_fragment = getattr(components, 'has_fragment', fragment)
+    if has_netloc:
         if url and url[:1] != '/': url = '/' + url
         url = '//' + (netloc or '') + url
     if scheme:
         url = scheme + ':' + url
-    if query:
+    if has_query:
         url = url + '?' + query
-    if fragment:
+    if has_fragment:
         url = url + '#' + fragment
     return _coerce_result(url)
 
diff -r 9332a545ad85 Lib/urllib/robotparser.py
--- a/Lib/urllib/robotparser.py	Thu Mar 12 22:01:30 2015 +0200
+++ b/Lib/urllib/robotparser.py	Fri Mar 13 00:20:17 2015 +0000
@@ -164,7 +164,8 @@
         if path == '' and not allowance:
             # an empty value means allow all
             allowance = True
-        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
+        # Forget has_query etc and force normalization of empty query parts
+        path = urllib.parse.urlunparse(tuple(urllib.parse.urlparse(path)))
         self.path = urllib.parse.quote(path)
         self.allowance = allowance