def test_attributes_bad_port(self):
    """Check handling of non-integer ports.

    A non-numeric port must be rejected with ValueError while the URL is
    being parsed, by both urlsplit() and urlparse(), for str and for bytes
    input alike.
    """
    parsers = (urllib.parse.urlsplit, urllib.parse.urlparse)
    urls = ("http://www.example.net:foo", b"http://www.example.net:foo")
    # Iterate over the full parser x input-type matrix so that no
    # combination is skipped (a hand-copied list previously exercised
    # urlsplit twice for bytes and never urlparse).
    for parser in parsers:
        for url in urls:
            self.assertRaises(ValueError, parser, url)
def test_invalid_ipv6(self):
    """Check that urlparse() rejects malformed or mis-bracketed hosts."""
    urls = (
        'http://::1/',            # IPv6 literal without brackets
        'http://[127.0.0.1]/',    # brackets around an IPv4 address
        'http://[host]/',         # brackets around a host name
    )
    for url in urls:
        # One sub-test per URL: a failure names the offending URL and
        # does not prevent the remaining URLs from being checked.
        with self.subTest(url=url):
            self.assertRaises(ValueError, urllib.parse.urlparse, url)
""" +import collections +import ipaddress import re import sys -import collections __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", "urlsplit", "urlunsplit", "urlencode", "parse_qs", @@ -314,7 +315,12 @@ def _splitnetloc(url, start=0): wdelim = url.find(c, start) # find first of this delim if wdelim >= 0: # if found delim = min(delim, wdelim) # use earliest delim position - return url[start:delim], url[delim:] # return (domain, rest) + netloc, url = url[start:delim], url[delim:] # return (domain, rest) + + # call splitport() to validate netloc + splitport(netloc) + + return netloc, url def urlsplit(url, scheme='', allow_fragments=True): """Parse a URL into 5 components: @@ -338,9 +344,6 @@ def urlsplit(url, scheme='', allow_fragm url = url[i+1:] if url[:2] == '//': netloc, url = _splitnetloc(url, 2) - if (('[' in netloc and ']' not in netloc) or - (']' in netloc and '[' not in netloc)): - raise ValueError("Invalid IPv6 URL") if allow_fragments and '#' in url: url, fragment = url.split('#', 1) if '?' in url: @@ -361,9 +364,6 @@ def urlsplit(url, scheme='', allow_fragm if url[:2] == '//': netloc, url = _splitnetloc(url, 2) - if (('[' in netloc and ']' not in netloc) or - (']' in netloc and '[' not in netloc)): - raise ValueError("Invalid IPv6 URL") if allow_fragments and '#' in url: url, fragment = url.split('#', 1) if '?' 
def splitport(netloc):
    """splitport('host:port') --> 'host', 'port'.

    Split *netloc* into everything before the port delimiter (including
    any "userinfo@" prefix) and the port string.  The port is returned as
    None when it is absent or empty ('host' and 'host:').

    Raises ValueError when:
    - the port is not a plain ASCII decimal number in the range 1..65535;
    - a bracketed host does not contain a valid IPv6 address;
    - an unbracketed host contains '[', ']' or ':'.
    """
    # Index of the first host character, i.e. just past "userinfo@"
    # (rfind() returns -1 when there is no '@', which yields 0).
    host_start = netloc.rfind('@') + 1
    # A ':' inside the userinfo or inside an IPv6 literal is not the port
    # delimiter, so only search after both the '@' and the last ']'.
    search_from = max(netloc.rfind(']'), host_start)
    colon = netloc.rfind(':', search_from)
    if colon == -1:
        full_host, port = netloc, None
    else:
        full_host = netloc[:colon]
        port = netloc[colon + 1:] or None

    if port is not None:
        # int() alone would also accept signs, surrounding whitespace,
        # underscores and non-ASCII digits, so match the digits explicitly.
        # Leading zeros are ignored for the magnitude check, which also
        # keeps int() away from arbitrarily long digit strings.
        significant = port.lstrip('0')
        if (not re.fullmatch(r'[0-9]+', port)
                or not 0 < len(significant) <= 5
                or int(significant) > 65535):
            raise ValueError("Invalid port number: %r" % port)

    host = full_host[host_start:]
    if host.startswith('[') and host.endswith(']'):
        ipv6 = host[1:-1]
        try:
            ipaddress.IPv6Address(ipv6)
        except ValueError:
            raise ValueError("Invalid IPv6 URL: %r" % ipv6) from None
    elif re.search(r'[][:]', host):
        # host must not contain '[', ']' or ':'
        raise ValueError("Invalid host: %r" % host)

    return full_host, port