diff -r 94d0e842b9ea -r 9a840524929b Doc/library/urllib.parse.rst --- a/Doc/library/urllib.parse.rst Fri Aug 01 12:28:49 2014 +0200 +++ b/Doc/library/urllib.parse.rst Mon Aug 11 07:03:05 2014 -0700 @@ -267,6 +267,11 @@ :func:`urlunsplit`, removing possible *scheme* and *netloc* parts. + .. versionchanged:: 3.5 + + Behaviour updated to match the semantics defined in :rfc:`3986`. + + .. function:: urldefrag(url) If *url* contains a fragment identifier, return a modified version of *url* diff -r 94d0e842b9ea -r 9a840524929b Lib/test/test_urlparse.py --- a/Lib/test/test_urlparse.py Fri Aug 01 12:28:49 2014 +0200 +++ b/Lib/test/test_urlparse.py Mon Aug 11 07:03:05 2014 -0700 @@ -211,10 +211,6 @@ # "abnormal" cases from RFC 1808: self.checkJoin(RFC1808_BASE, '', 'http://a/b/c/d;p?q#f') - self.checkJoin(RFC1808_BASE, '../../../g', 'http://a/../g') - self.checkJoin(RFC1808_BASE, '../../../../g', 'http://a/../../g') - self.checkJoin(RFC1808_BASE, '/./g', 'http://a/./g') - self.checkJoin(RFC1808_BASE, '/../g', 'http://a/../g') self.checkJoin(RFC1808_BASE, 'g.', 'http://a/b/c/g.') self.checkJoin(RFC1808_BASE, '.g', 'http://a/b/c/.g') self.checkJoin(RFC1808_BASE, 'g..', 'http://a/b/c/g..') @@ -229,6 +225,13 @@ #self.checkJoin(RFC1808_BASE, 'http:g', 'http:g') #self.checkJoin(RFC1808_BASE, 'http:', 'http:') + # XXX: The following tests are no longer compatible with RFC3986 + # self.checkJoin(RFC1808_BASE, '../../../g', 'http://a/../g') + # self.checkJoin(RFC1808_BASE, '../../../../g', 'http://a/../../g') + # self.checkJoin(RFC1808_BASE, '/./g', 'http://a/./g') + # self.checkJoin(RFC1808_BASE, '/../g', 'http://a/../g') + + def test_RFC2368(self): # Issue 11467: path that starts with a number is not parsed correctly self.assertEqual(urllib.parse.urlparse('mailto:1337@example.org'), @@ -259,10 +262,6 @@ self.checkJoin(RFC2396_BASE, '../../', 'http://a/') self.checkJoin(RFC2396_BASE, '../../g', 'http://a/g') self.checkJoin(RFC2396_BASE, '', RFC2396_BASE) - self.checkJoin(RFC2396_BASE, '../../../g', 'http://a/../g') - self.checkJoin(RFC2396_BASE, '../../../../g', 'http://a/../../g') - self.checkJoin(RFC2396_BASE, '/./g', 'http://a/./g') - self.checkJoin(RFC2396_BASE, '/../g', 'http://a/../g') self.checkJoin(RFC2396_BASE, 'g.', 'http://a/b/c/g.') self.checkJoin(RFC2396_BASE, '.g', 'http://a/b/c/.g') self.checkJoin(RFC2396_BASE, 'g..', 'http://a/b/c/g..') @@ -278,10 +277,17 @@ self.checkJoin(RFC2396_BASE, 'g#s/./x', 'http://a/b/c/g#s/./x') self.checkJoin(RFC2396_BASE, 'g#s/../x', 'http://a/b/c/g#s/../x') + # XXX: The following tests are no longer compatible with RFC3986 + # self.checkJoin(RFC2396_BASE, '../../../g', 'http://a/../g') + # self.checkJoin(RFC2396_BASE, '../../../../g', 'http://a/../../g') + # self.checkJoin(RFC2396_BASE, '/./g', 'http://a/./g') + # self.checkJoin(RFC2396_BASE, '/../g', 'http://a/../g') + + def test_RFC3986(self): # Test cases from RFC3986 self.checkJoin(RFC3986_BASE, '?y','http://a/b/c/d;p?y') - self.checkJoin(RFC2396_BASE, ';x', 'http://a/b/c/;x') + self.checkJoin(RFC3986_BASE, ';x', 'http://a/b/c/;x') self.checkJoin(RFC3986_BASE, 'g:h','g:h') self.checkJoin(RFC3986_BASE, 'g','http://a/b/c/g') self.checkJoin(RFC3986_BASE, './g','http://a/b/c/g') @@ -305,17 +311,17 @@ self.checkJoin(RFC3986_BASE, '../..','http://a/') self.checkJoin(RFC3986_BASE, '../../','http://a/') self.checkJoin(RFC3986_BASE, '../../g','http://a/g') + self.checkJoin(RFC3986_BASE, '../../../g', 'http://a/g') #Abnormal Examples # The 'abnormal scenarios' are incompatible with RFC2986 parsing # Tests are here for reference. - #self.checkJoin(RFC3986_BASE, '../../../g','http://a/g') - #self.checkJoin(RFC3986_BASE, '../../../../g','http://a/g') - #self.checkJoin(RFC3986_BASE, '/./g','http://a/g') - #self.checkJoin(RFC3986_BASE, '/../g','http://a/g') - + self.checkJoin(RFC3986_BASE, '../../../g','http://a/g') + self.checkJoin(RFC3986_BASE, '../../../../g','http://a/g') + self.checkJoin(RFC3986_BASE, '/./g','http://a/g') + self.checkJoin(RFC3986_BASE, '/../g','http://a/g') self.checkJoin(RFC3986_BASE, 'g.','http://a/b/c/g.') self.checkJoin(RFC3986_BASE, '.g','http://a/b/c/.g') self.checkJoin(RFC3986_BASE, 'g..','http://a/b/c/g..') @@ -355,10 +361,8 @@ self.checkJoin(SIMPLE_BASE, '../g','http://a/b/g') self.checkJoin(SIMPLE_BASE, '../..','http://a/') self.checkJoin(SIMPLE_BASE, '../../g','http://a/g') - self.checkJoin(SIMPLE_BASE, '../../../g','http://a/../g') self.checkJoin(SIMPLE_BASE, './../g','http://a/b/g') self.checkJoin(SIMPLE_BASE, './g/.','http://a/b/c/g/') - self.checkJoin(SIMPLE_BASE, '/./g','http://a/./g') self.checkJoin(SIMPLE_BASE, 'g/./h','http://a/b/c/g/h') self.checkJoin(SIMPLE_BASE, 'g/../h','http://a/b/c/h') self.checkJoin(SIMPLE_BASE, 'http:g','http://a/b/c/g') @@ -372,6 +376,10 @@ self.checkJoin('svn://pathtorepo/dir1', 'dir2', 'svn://pathtorepo/dir2') self.checkJoin('svn+ssh://pathtorepo/dir1', 'dir2', 'svn+ssh://pathtorepo/dir2') + # XXX: The following tests are no longer compatible with RFC3986 + # self.checkJoin(SIMPLE_BASE, '../../../g','http://a/../g') + # self.checkJoin(SIMPLE_BASE, '/./g','http://a/./g') + def test_RFC2732(self): str_cases = [ ('http://Test.python.org:5432/foo/', 'test.python.org', 5432), diff -r 94d0e842b9ea -r 9a840524929b Lib/urllib/parse.py --- a/Lib/urllib/parse.py Fri Aug 01 12:28:49 2014 +0200 +++ b/Lib/urllib/parse.py Mon Aug 11 07:03:05 2014 -0700 @@ -409,11 +409,13 @@ return url if not url: return base + base, url, _coerce_result = _coerce_args(base, url) bscheme, bnetloc, bpath, bparams, bquery, bfragment = \ urlparse(base, '', allow_fragments) scheme, netloc, path, params, query, fragment = \ urlparse(url, bscheme, allow_fragments) + if scheme != bscheme or scheme not in uses_relative: return _coerce_result(url) if scheme in uses_netloc: @@ -421,9 +423,7 @@ return _coerce_result(urlunparse((scheme, netloc, path, params, query, fragment))) netloc = bnetloc - if path[:1] == '/': - return _coerce_result(urlunparse((scheme, netloc, path, - params, query, fragment))) + if not path and not params: path = bpath params = bparams @@ -431,29 +431,42 @@ query = bquery return _coerce_result(urlunparse((scheme, netloc, path, params, query, fragment))) - segments = bpath.split('/')[:-1] + path.split('/') - # XXX The stuff below is bogus in various ways... - if segments[-1] == '.': - segments[-1] = '' - while '.' in segments: - segments.remove('.') - while 1: - i = 1 - n = len(segments) - 1 - while i < n: - if (segments[i] == '..' - and segments[i-1] not in ('', '..')): - del segments[i-1:i+1] - break - i = i+1 + + base_parts = bpath.split('/') + if base_parts[-1] != '': + # the last item is not a directory, so will not be taken into account + # in resolving the relative path + del base_parts[-1] + + # for rfc3986, ignore all base path should the first character be root. + if path[:1] == '/': + segments = path.split('/') + else: + segments = base_parts + path.split('/') + + resolved_path = [] + + for seg in segments: + if seg == '..': + try: + resolved_path.pop() + except IndexError: + # ignore any .. segments that would otherwise cause an IndexError + # when popped from resolved_path if resolving for rfc3986 + pass + elif seg == '.': + continue else: - break - if segments == ['', '..']: - segments[-1] = '' - elif len(segments) >= 2 and segments[-1] == '..': - segments[-2:] = [''] - return _coerce_result(urlunparse((scheme, netloc, '/'.join(segments), - params, query, fragment))) + resolved_path.append(seg) + + if segments[-1] in ('.', '..'): + # do some post-processing here. if the last segment was a relative dir, + # then we need to append the trailing '/' + resolved_path.append('') + + return _coerce_result(urlunparse((scheme, netloc, '/'.join( + resolved_path), params, query, fragment))) + def urldefrag(url): """Removes any existing fragment from URL.