Message 366833 - Python tracker

➜

This issue tracker has been migrated to GitHub, and is currently read-only.
For more information, see the GitHub FAQs in Python's Developer Guide.

Author	vstinner
Recipients	vstinner
Date	2020-04-20.14:46:13
SpamBayes Score	-1.0
Marked as misclassified	Yes
Message-id	<1587393973.44.0.749605052245.issue40338@roundup.psfhosted.org>
In-reply-to

Content
(The first message is basically David's email rephrased. Here is my reply ;-)) > This could present issues if server-side checks are used by applications to validate a URLs authority. Which kind of application would be affected by this vulnerability? It's unclear to me if urllib should be modified to explicitly reject \ in netloc, or if only third-party code should pay attention to this corner case (potential vulnerability). The urllib module has _parse_proxy() and HTTPPasswordMgr.reduce_uri() code which use an "authority" variable. Example: --- from urllib.parse import urlsplit, _splitport, _splittype, _splituser, _splitpasswd def _parse_proxy(proxy): """Return (scheme, user, password, host/port) given a URL or an authority. If a URL is supplied, it must have an authority (host:port) component. According to RFC 3986, having an authority component means the URL must have two slashes after the scheme. """ scheme, r_scheme = _splittype(proxy) if not r_scheme.startswith("/"): # authority scheme = None authority = proxy else: # URL if not r_scheme.startswith("//"): raise ValueError("proxy URL with no authority: %r" % proxy) # We have an authority, so for RFC 3986-compliant URLs (by ss 3. 
# and 3.3.), path is empty or starts with '/' end = r_scheme.find("/", 2) if end == -1: end = None authority = r_scheme[2:end] userinfo, hostport = _splituser(authority) if userinfo is not None: user, password = _splitpasswd(userinfo) else: user = password = None return scheme, user, password, hostport def reduce_uri(uri, default_port=True): """Accept authority or URI and extract only the authority and path.""" # note HTTP URLs do not have a userinfo component parts = urlsplit(uri) if parts[1]: # URI scheme = parts[0] authority = parts[1] path = parts[2] or '/' else: # host or host:port scheme = None authority = uri path = '/' host, port = _splitport(authority) if default_port and port is None and scheme is not None: dport = {"http": 80, "https": 443, }.get(scheme) if dport is not None: authority = "%s:%d" % (host, dport) return authority, path def test(uri): print(f"{uri} => reduce_uri: {reduce_uri(uri)}") print(f"{uri} => _parse_proxy: {_parse_proxy(uri)}") test(r"https://www.example.com") test(r"https://user@www.example.com") test(r"https://xdavidhu.me\test.corp.google.com") test(r"https://user:password@xdavidhu.me\test.corp.google.com") --- Output on Python 3.9: --- https://www.example.com => reduce_uri: ('www.example.com:443', '/') https://www.example.com => _parse_proxy: ('https', None, None, 'www.example.com') https://user@www.example.com => reduce_uri: ('user@www.example.com:443', '/') https://user@www.example.com => _parse_proxy: ('https', 'user', None, 'www.example.com') https://xdavidhu.me\test.corp.google.com => reduce_uri: ('xdavidhu.me\\test.corp.google.com:443', '/') https://xdavidhu.me\test.corp.google.com => _parse_proxy: ('https', None, None, 'xdavidhu.me\\test.corp.google.com') https://user:password@xdavidhu.me\test.corp.google.com => reduce_uri: ('user:password@xdavidhu.me\\test.corp.google.com:443', '/') https://user:password@xdavidhu.me\test.corp.google.com => _parse_proxy: ('https', 'user', 'password', 'xdavidhu.me\\test.corp.google.com') --- 
It seems to behave as expected, no?

(The first message is basically David's email rephrased. Here is my reply ;-))

> This could present issues if server-side checks are used by applications to validate a URL's authority.

Which kind of application would be affected by this vulnerability?

It's unclear to me if urllib should be modified to explicitly reject \ in netloc, or if only third-party code should pay attention to this corner case (potential vulnerability).


The urllib module has _parse_proxy() and HTTPPasswordMgr.reduce_uri() code which use an "authority" variable.

Example:
---
from urllib.parse import (urlsplit, _splitport, _splittype, _splituser,
                          _splitpasswd)

def _parse_proxy(proxy):
    """Break a proxy specification into (scheme, user, password, host/port).

    *proxy* is either a complete URL or a bare authority.  A URL has to
    carry an authority (host:port) part, which per RFC 3986 means the
    scheme is followed by two slashes; a URL whose remainder starts with
    a single slash raises ValueError.  For a bare authority the returned
    scheme is None.  user and password are both None when the authority
    has no userinfo part.
    """
    scheme, remainder = _splittype(proxy)
    if remainder.startswith("/"):
        # A URL: the part after the scheme must open with "//".
        if not remainder.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # With an authority present, an RFC 3986-compliant path is either
        # empty or starts with '/', so the authority runs from just after
        # the "//" up to the next slash (or to the end of the string).
        slash = remainder.find("/", 2)
        authority = remainder[2:] if slash == -1 else remainder[2:slash]
    else:
        # No leading slash: the whole input is treated as the authority.
        scheme = None
        authority = proxy

    userinfo, hostport = _splituser(authority)
    if userinfo is None:
        return scheme, None, None, hostport
    user, password = _splitpasswd(userinfo)
    return scheme, user, password, hostport


def reduce_uri(uri, default_port=True):
    """Reduce *uri* (a URI or a bare authority) to an (authority, path) pair.

    When *default_port* is true and the input is an http or https URI
    without an explicit port, the scheme's default port (80 or 443) is
    appended to the host in the returned authority.
    """
    # note HTTP URLs do not have a userinfo component
    scheme, netloc, url_path = urlsplit(uri)[:3]
    if netloc:
        # A URI with a network location; an empty path becomes '/'.
        authority = netloc
        path = url_path or '/'
    else:
        # A bare host or host:port: no scheme, root path.
        scheme = None
        authority = uri
        path = '/'

    host, port = _splitport(authority)
    if not default_port or port is not None or scheme is None:
        return authority, path

    dport = {"http": 80, "https": 443}.get(scheme)
    if dport is None:
        # Scheme without a known default port: leave the authority alone.
        return authority, path
    return "%s:%d" % (host, dport), path

def test(uri):
    """Print what reduce_uri() and _parse_proxy() each return for *uri*."""
    parsers = (
        ("reduce_uri", reduce_uri),
        ("_parse_proxy", _parse_proxy),
    )
    for label, parse in parsers:
        print(f"{uri} => {label}: {parse(uri)}")

# Run the demo on two ordinary URLs and two whose host contains a backslash.
for sample_url in (
    r"https://www.example.com",
    r"https://user@www.example.com",
    r"https://xdavidhu.me\test.corp.google.com",
    r"https://user:password@xdavidhu.me\test.corp.google.com",
):
    test(sample_url)
---

Output on Python 3.9:
---
https://www.example.com => reduce_uri: ('www.example.com:443', '/')
https://www.example.com => _parse_proxy: ('https', None, None,
'www.example.com')
https://user@www.example.com => reduce_uri: ('user@www.example.com:443', '/')
https://user@www.example.com => _parse_proxy: ('https', 'user', None,
'www.example.com')
https://xdavidhu.me\test.corp.google.com => reduce_uri:
('xdavidhu.me\\test.corp.google.com:443', '/')
https://xdavidhu.me\test.corp.google.com => _parse_proxy: ('https',
None, None, 'xdavidhu.me\\test.corp.google.com')
https://user:password@xdavidhu.me\test.corp.google.com => reduce_uri:
('user:password@xdavidhu.me\\test.corp.google.com:443', '/')
https://user:password@xdavidhu.me\test.corp.google.com =>
_parse_proxy: ('https', 'user', 'password',
'xdavidhu.me\\test.corp.google.com')
---

It seems to behave as expected, no?

History
Date	User	Action	Args
2020-04-20 14:46:13	vstinner	set	recipients: + vstinner
2020-04-20 14:46:13	vstinner	set	messageid: <1587393973.44.0.749605052245.issue40338@roundup.psfhosted.org>
2020-04-20 14:46:13	vstinner	link	issue40338 messages
2020-04-20 14:46:13	vstinner	create