diff -r 94d0e842b9ea -r 6555962c18f6 Doc/library/urllib.parse.rst --- a/Doc/library/urllib.parse.rst Fri Aug 01 12:28:49 2014 +0200 +++ b/Doc/library/urllib.parse.rst Wed Aug 06 20:52:33 2014 -0700 @@ -238,7 +238,7 @@ with an empty query; the RFC states that these are equivalent). -.. function:: urljoin(base, url, allow_fragments=True) +.. function:: urljoin(base, url, allow_fragments=True, rfc1808=False) Construct a full ("absolute") URL by combining a "base URL" (*base*) with another URL (*url*). Informally, this uses components of the base URL, in @@ -267,6 +267,13 @@ :func:`urlunsplit`, removing possible *scheme* and *netloc* parts. + .. versionchanged:: 3.5 + + Behaviour updated to match the semantics defined in :rfc:`3986`. Legacy + behaviour (:rfc:`1808`) can still be achieved by setting the optional + parameter *rfc1808* to ``True``. + + .. function:: urldefrag(url) If *url* contains a fragment identifier, return a modified version of *url* diff -r 94d0e842b9ea -r 6555962c18f6 Lib/test/test_urlparse.py --- a/Lib/test/test_urlparse.py Fri Aug 01 12:28:49 2014 +0200 +++ b/Lib/test/test_urlparse.py Wed Aug 06 20:52:33 2014 -0700 @@ -170,12 +170,14 @@ split = (scheme,) + split self.checkRoundtrips(url, parsed, split) - def checkJoin(self, base, relurl, expected): + def checkJoin(self, base, relurl, expected, rfc1808=False): str_components = (base, relurl, expected) - self.assertEqual(urllib.parse.urljoin(base, relurl), expected) + self.assertEqual(urllib.parse.urljoin(base, relurl, rfc1808=rfc1808), + expected) bytes_components = baseb, relurlb, expectedb = [ x.encode('ascii') for x in str_components] - self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb) + self.assertEqual(urllib.parse.urljoin(baseb, relurlb, rfc1808=rfc1808), + expectedb) def test_unparse_parse(self): str_cases = ['Python', './Python','x-newscheme://foo.com/stuff','x://y','x:/y','x:/','/',] @@ -211,10 +213,14 @@ # "abnormal" cases from RFC 1808: self.checkJoin(RFC1808_BASE, '', 
'http://a/b/c/d;p?q#f') - self.checkJoin(RFC1808_BASE, '../../../g', 'http://a/../g') - self.checkJoin(RFC1808_BASE, '../../../../g', 'http://a/../../g') - self.checkJoin(RFC1808_BASE, '/./g', 'http://a/./g') - self.checkJoin(RFC1808_BASE, '/../g', 'http://a/../g') + self.checkJoin(RFC1808_BASE, '../../../g', 'http://a/../g', + rfc1808=True) + self.checkJoin(RFC1808_BASE, '../../../../g', 'http://a/../../g', + rfc1808=True) + self.checkJoin(RFC1808_BASE, '/./g', 'http://a/./g', + rfc1808=True) + self.checkJoin(RFC1808_BASE, '/../g', 'http://a/../g', + rfc1808=True) self.checkJoin(RFC1808_BASE, 'g.', 'http://a/b/c/g.') self.checkJoin(RFC1808_BASE, '.g', 'http://a/b/c/.g') self.checkJoin(RFC1808_BASE, 'g..', 'http://a/b/c/g..') @@ -259,10 +265,14 @@ self.checkJoin(RFC2396_BASE, '../../', 'http://a/') self.checkJoin(RFC2396_BASE, '../../g', 'http://a/g') self.checkJoin(RFC2396_BASE, '', RFC2396_BASE) - self.checkJoin(RFC2396_BASE, '../../../g', 'http://a/../g') - self.checkJoin(RFC2396_BASE, '../../../../g', 'http://a/../../g') - self.checkJoin(RFC2396_BASE, '/./g', 'http://a/./g') - self.checkJoin(RFC2396_BASE, '/../g', 'http://a/../g') + self.checkJoin(RFC2396_BASE, '../../../g', 'http://a/../g', + rfc1808=True) + self.checkJoin(RFC2396_BASE, '../../../../g', 'http://a/../../g', + rfc1808=True) + self.checkJoin(RFC2396_BASE, '/./g', 'http://a/./g', + rfc1808=True) + self.checkJoin(RFC2396_BASE, '/../g', 'http://a/../g', + rfc1808=True) self.checkJoin(RFC2396_BASE, 'g.', 'http://a/b/c/g.') self.checkJoin(RFC2396_BASE, '.g', 'http://a/b/c/.g') self.checkJoin(RFC2396_BASE, 'g..', 'http://a/b/c/g..') @@ -281,7 +291,7 @@ def test_RFC3986(self): # Test cases from RFC3986 self.checkJoin(RFC3986_BASE, '?y','http://a/b/c/d;p?y') - self.checkJoin(RFC2396_BASE, ';x', 'http://a/b/c/;x') + self.checkJoin(RFC3986_BASE, ';x', 'http://a/b/c/;x') self.checkJoin(RFC3986_BASE, 'g:h','g:h') self.checkJoin(RFC3986_BASE, 'g','http://a/b/c/g') self.checkJoin(RFC3986_BASE, 
'./g','http://a/b/c/g') @@ -305,17 +315,17 @@ self.checkJoin(RFC3986_BASE, '../..','http://a/') self.checkJoin(RFC3986_BASE, '../../','http://a/') self.checkJoin(RFC3986_BASE, '../../g','http://a/g') + self.checkJoin(RFC3986_BASE, '../../../g', 'http://a/g') #Abnormal Examples # The 'abnormal scenarios' are incompatible with RFC2986 parsing # Tests are here for reference. - #self.checkJoin(RFC3986_BASE, '../../../g','http://a/g') - #self.checkJoin(RFC3986_BASE, '../../../../g','http://a/g') - #self.checkJoin(RFC3986_BASE, '/./g','http://a/g') - #self.checkJoin(RFC3986_BASE, '/../g','http://a/g') - + self.checkJoin(RFC3986_BASE, '../../../g','http://a/g') + self.checkJoin(RFC3986_BASE, '../../../../g','http://a/g') + self.checkJoin(RFC3986_BASE, '/./g','http://a/g') + self.checkJoin(RFC3986_BASE, '/../g','http://a/g') self.checkJoin(RFC3986_BASE, 'g.','http://a/b/c/g.') self.checkJoin(RFC3986_BASE, '.g','http://a/b/c/.g') self.checkJoin(RFC3986_BASE, 'g..','http://a/b/c/g..') @@ -355,10 +365,12 @@ self.checkJoin(SIMPLE_BASE, '../g','http://a/b/g') self.checkJoin(SIMPLE_BASE, '../..','http://a/') self.checkJoin(SIMPLE_BASE, '../../g','http://a/g') - self.checkJoin(SIMPLE_BASE, '../../../g','http://a/../g') + self.checkJoin(SIMPLE_BASE, '../../../g','http://a/../g', + rfc1808=True) self.checkJoin(SIMPLE_BASE, './../g','http://a/b/g') self.checkJoin(SIMPLE_BASE, './g/.','http://a/b/c/g/') - self.checkJoin(SIMPLE_BASE, '/./g','http://a/./g') + self.checkJoin(SIMPLE_BASE, '/./g','http://a/./g', + rfc1808=True) self.checkJoin(SIMPLE_BASE, 'g/./h','http://a/b/c/g/h') self.checkJoin(SIMPLE_BASE, 'g/../h','http://a/b/c/h') self.checkJoin(SIMPLE_BASE, 'http:g','http://a/b/c/g') diff -r 94d0e842b9ea -r 6555962c18f6 Lib/urllib/parse.py --- a/Lib/urllib/parse.py Fri Aug 01 12:28:49 2014 +0200 +++ b/Lib/urllib/parse.py Wed Aug 06 20:52:33 2014 -0700 @@ -402,18 +402,20 @@ url = url + '#' + fragment return _coerce_result(url) -def urljoin(base, url, allow_fragments=True): +def 
urljoin(base, url, allow_fragments=True, rfc1808=False): """Join a base URL and a possibly relative URL to form an absolute interpretation of the latter.""" if not base: return url if not url: return base + base, url, _coerce_result = _coerce_args(base, url) bscheme, bnetloc, bpath, bparams, bquery, bfragment = \ urlparse(base, '', allow_fragments) scheme, netloc, path, params, query, fragment = \ urlparse(url, bscheme, allow_fragments) + if scheme != bscheme or scheme not in uses_relative: return _coerce_result(url) if scheme in uses_netloc: @@ -421,9 +423,11 @@ return _coerce_result(urlunparse((scheme, netloc, path, params, query, fragment))) netloc = bnetloc - if path[:1] == '/': - return _coerce_result(urlunparse((scheme, netloc, path, - params, query, fragment))) + + if rfc1808: + if path[:1] == '/': + return _coerce_result(urlunparse((scheme, netloc, path, + params, query, fragment))) if not path and not params: path = bpath params = bparams @@ -431,29 +435,46 @@ query = bquery return _coerce_result(urlunparse((scheme, netloc, path, params, query, fragment))) - segments = bpath.split('/')[:-1] + path.split('/') - # XXX The stuff below is bogus in various ways... - if segments[-1] == '.': - segments[-1] = '' - while '.' in segments: - segments.remove('.') - while 1: - i = 1 - n = len(segments) - 1 - while i < n: - if (segments[i] == '..' - and segments[i-1] not in ('', '..')): - del segments[i-1:i+1] - break - i = i+1 + + base_parts = bpath.split('/') + if base_parts[-1] != '': + # the last item is not a directory, so will not be taken into account + # in resolving the relative path + del base_parts[-1] + + # for rfc3986, ignore all base path should the first character be root. 
+ # rfc1808 behaviour is implemented above with an early out + if path[:1] == '/': + segments = path.split('/') + else: + segments = base_parts + path.split('/') + + resolved_path = [] + + for seg in segments: + if seg == '..': + if len(resolved_path) > 0: + if not rfc1808 or resolved_path[-1] not in ('..', ''): + resolved_path.pop() + else: + resolved_path.append(seg) + elif rfc1808: + resolved_path.append(seg) + # ignore any .. segments that would otherwise cause an IndexError + # when popped from resolved_path if resolving for rfc3986 + elif seg == '.': + continue else: - break - if segments == ['', '..']: - segments[-1] = '' - elif len(segments) >= 2 and segments[-1] == '..': - segments[-2:] = [''] - return _coerce_result(urlunparse((scheme, netloc, '/'.join(segments), - params, query, fragment))) + resolved_path.append(seg) + + if segments[-1] in ('.', '..'): + # do some post-processing here. if the last segment was a relative dir, + # then we need to append the trailing '/' + resolved_path.append('') + + return _coerce_result(urlunparse((scheme, netloc, '/'.join( + resolved_path), params, query, fragment))) + def urldefrag(url): """Removes any existing fragment from URL.