From fc897fcc01964649f023e0baa4c95d142e4e8a10 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 31 Aug 2024 12:42:08 +0300 Subject: [PATCH] gh-76960: Fix urljoin() and urldefrag() for URIs with empty components (GH-123273) * urljoin() with relative reference "?" sets empty query and removes fragment. * Preserve empty components (authority, params, query, fragment) in urljoin(). * Preserve empty components (authority, params, query) in urldefrag(). Also refactor the code and get rid of double _coerce_args() and _coerce_result() calls in urljoin(), urldefrag(), urlparse() and urlunparse(). --- Lib/test/test_urlparse.py | 87 ++++++++++++--- Lib/urllib/parse.py | 100 +++++++++++------- ...4-08-23-22-01-30.gh-issue-76960.vsANPu.rst | 5 + 3 files changed, 140 insertions(+), 52 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-08-23-22-01-30.gh-issue-76960.vsANPu.rst diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index 3dbbd9cf1d3..d49e4388696 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -349,7 +349,7 @@ class UrlParseTestCase(unittest.TestCase): split = (scheme,) + split self.checkRoundtrips(url, parsed, split) - def checkJoin(self, base, relurl, expected): + def checkJoin(self, base, relurl, expected, *, relroundtrip=True): with self.subTest(base=base, relurl=relurl): self.assertEqual(urllib.parse.urljoin(base, relurl), expected) baseb = base.encode('ascii') @@ -357,10 +357,11 @@ class UrlParseTestCase(unittest.TestCase): expectedb = expected.encode('ascii') self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb) - relurl = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl)) - self.assertEqual(urllib.parse.urljoin(base, relurl), expected) - relurlb = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb)) - self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb) + if relroundtrip: + relurl = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl)) + self.assertEqual(urllib.parse.urljoin(base, relurl), expected) + relurlb = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb)) + self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb) def test_unparse_parse(self): str_cases = ['Python', './Python','x-newscheme://foo.com/stuff','x://y','x:/y','x:/','/',] @@ -526,8 +527,6 @@ class UrlParseTestCase(unittest.TestCase): def test_urljoins(self): self.checkJoin(SIMPLE_BASE, 'g:h','g:h') - self.checkJoin(SIMPLE_BASE, 'http:g','http://a/b/c/g') - self.checkJoin(SIMPLE_BASE, 'http:','http://a/b/c/d') self.checkJoin(SIMPLE_BASE, 'g','http://a/b/c/g') self.checkJoin(SIMPLE_BASE, './g','http://a/b/c/g') self.checkJoin(SIMPLE_BASE, 'g/','http://a/b/c/g/') @@ -548,8 +547,6 @@ class UrlParseTestCase(unittest.TestCase): self.checkJoin(SIMPLE_BASE, 'g/./h','http://a/b/c/g/h') self.checkJoin(SIMPLE_BASE, 'g/../h','http://a/b/c/h') self.checkJoin(SIMPLE_BASE, 'http:g','http://a/b/c/g') - self.checkJoin(SIMPLE_BASE, 'http:','http://a/b/c/d') - self.checkJoin(SIMPLE_BASE, 'http:?y','http://a/b/c/d?y') self.checkJoin(SIMPLE_BASE, 'http:g?y','http://a/b/c/g?y') self.checkJoin(SIMPLE_BASE, 'http:g?y/./x','http://a/b/c/g?y/./x') self.checkJoin('http:///', '..','http:///') @@ -579,6 +576,53 @@ class UrlParseTestCase(unittest.TestCase): # issue 23703: don't duplicate filename self.checkJoin('a', 'b', 'b') + # Test with empty (but defined) components. + self.checkJoin(RFC1808_BASE, '', 'http://a/b/c/d;p?q#f') + self.checkJoin(RFC1808_BASE, '#', 'http://a/b/c/d;p?q#', relroundtrip=False) + self.checkJoin(RFC1808_BASE, '#z', 'http://a/b/c/d;p?q#z') + self.checkJoin(RFC1808_BASE, '?', 'http://a/b/c/d;p?', relroundtrip=False) + self.checkJoin(RFC1808_BASE, '?#z', 'http://a/b/c/d;p?#z', relroundtrip=False) + self.checkJoin(RFC1808_BASE, '?y', 'http://a/b/c/d;p?y') + self.checkJoin(RFC1808_BASE, ';', 'http://a/b/c/;') + self.checkJoin(RFC1808_BASE, ';?y', 'http://a/b/c/;?y') + self.checkJoin(RFC1808_BASE, ';#z', 'http://a/b/c/;#z') + self.checkJoin(RFC1808_BASE, ';x', 'http://a/b/c/;x') + self.checkJoin(RFC1808_BASE, '/w', 'http://a/w') + self.checkJoin(RFC1808_BASE, '//', 'http://a/b/c/d;p?q#f') + self.checkJoin(RFC1808_BASE, '//#z', 'http://a/b/c/d;p?q#z') + self.checkJoin(RFC1808_BASE, '//?y', 'http://a/b/c/d;p?y') + self.checkJoin(RFC1808_BASE, '//;x', 'http://;x') + self.checkJoin(RFC1808_BASE, '///w', 'http://a/w') + self.checkJoin(RFC1808_BASE, '//v', 'http://v') + # For backward compatibility with RFC1630, the scheme name is allowed + # to be present in a relative reference if it is the same as the base + # URI scheme. + self.checkJoin(RFC1808_BASE, 'http:', 'http://a/b/c/d;p?q#f') + self.checkJoin(RFC1808_BASE, 'http:#', 'http://a/b/c/d;p?q#', relroundtrip=False) + self.checkJoin(RFC1808_BASE, 'http:#z', 'http://a/b/c/d;p?q#z') + self.checkJoin(RFC1808_BASE, 'http:?', 'http://a/b/c/d;p?', relroundtrip=False) + self.checkJoin(RFC1808_BASE, 'http:?#z', 'http://a/b/c/d;p?#z', relroundtrip=False) + self.checkJoin(RFC1808_BASE, 'http:?y', 'http://a/b/c/d;p?y') + self.checkJoin(RFC1808_BASE, 'http:;', 'http://a/b/c/;') + self.checkJoin(RFC1808_BASE, 'http:;?y', 'http://a/b/c/;?y') + self.checkJoin(RFC1808_BASE, 'http:;#z', 'http://a/b/c/;#z') + self.checkJoin(RFC1808_BASE, 'http:;x', 'http://a/b/c/;x') + self.checkJoin(RFC1808_BASE, 'http:/w', 'http://a/w') + self.checkJoin(RFC1808_BASE, 'http://', 'http://a/b/c/d;p?q#f') + self.checkJoin(RFC1808_BASE, 'http://#z', 'http://a/b/c/d;p?q#z') + self.checkJoin(RFC1808_BASE, 'http://?y', 'http://a/b/c/d;p?y') + self.checkJoin(RFC1808_BASE, 'http://;x', 'http://;x') + self.checkJoin(RFC1808_BASE, 'http:///w', 'http://a/w') + self.checkJoin(RFC1808_BASE, 'http://v', 'http://v') + # Different scheme is not ignored. + self.checkJoin(RFC1808_BASE, 'https:', 'https:', relroundtrip=False) + self.checkJoin(RFC1808_BASE, 'https:#', 'https:#', relroundtrip=False) + self.checkJoin(RFC1808_BASE, 'https:#z', 'https:#z', relroundtrip=False) + self.checkJoin(RFC1808_BASE, 'https:?', 'https:?', relroundtrip=False) + self.checkJoin(RFC1808_BASE, 'https:?y', 'https:?y', relroundtrip=False) + self.checkJoin(RFC1808_BASE, 'https:;', 'https:;') + self.checkJoin(RFC1808_BASE, 'https:;x', 'https:;x') + def test_RFC2732(self): str_cases = [ ('http://Test.python.org:5432/foo/', 'test.python.org', 5432), @@ -641,16 +685,31 @@ class UrlParseTestCase(unittest.TestCase): ('http://python.org/p?q', 'http://python.org/p?q', ''), (RFC1808_BASE, 'http://a/b/c/d;p?q', 'f'), (RFC2396_BASE, 'http://a/b/c/d;p?q', ''), + ('http://a/b/c;p?q#f', 'http://a/b/c;p?q', 'f'), + ('http://a/b/c;p?q#', 'http://a/b/c;p?q', ''), + ('http://a/b/c;p?q', 'http://a/b/c;p?q', ''), + ('http://a/b/c;p?#f', 'http://a/b/c;p?', 'f'), + ('http://a/b/c;p#f', 'http://a/b/c;p', 'f'), + ('http://a/b/c;?q#f', 'http://a/b/c;?q', 'f'), + ('http://a/b/c?q#f', 'http://a/b/c?q', 'f'), + ('http:///b/c;p?q#f', 'http:///b/c;p?q', 'f'), + ('http:b/c;p?q#f', 'http:b/c;p?q', 'f'), + ('http:;?q#f', 'http:;?q', 'f'), + ('http:?q#f', 'http:?q', 'f'), + ('//a/b/c;p?q#f', '//a/b/c;p?q', 'f'), + ('://a/b/c;p?q#f', '://a/b/c;p?q', 'f'), ] def _encode(t): return type(t)(x.encode('ascii') for x in t) bytes_cases = [_encode(x) for x in str_cases] for url, defrag, frag in str_cases + bytes_cases: - result = urllib.parse.urldefrag(url) - self.assertEqual(result.geturl(), url) - self.assertEqual(result, (defrag, frag)) - self.assertEqual(result.url, defrag) - self.assertEqual(result.fragment, frag) + with self.subTest(url): + result = urllib.parse.urldefrag(url) + hash = '#' if isinstance(url, str) else b'#' + self.assertEqual(result.geturl(), url.rstrip(hash)) + self.assertEqual(result, (defrag, frag)) + self.assertEqual(result.url, defrag) + self.assertEqual(result.fragment, frag) def test_urlsplit_scoped_IPv6(self): p = urllib.parse.urlsplit('http://[FE80::822a:a8ff:fe49:470c%tESt]:1234') diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 33165309daf..5b00ab25c6b 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -392,20 +392,23 @@ def urlparse(url, scheme='', allow_fragments=True): Note that % escapes are not expanded. """ url, scheme, _coerce_result = _coerce_args(url, scheme) - splitresult = urlsplit(url, scheme, allow_fragments) - scheme, netloc, url, query, fragment = splitresult - if scheme in uses_params and ';' in url: - url, params = _splitparams(url) - else: - params = '' - result = ParseResult(scheme, netloc, url, params, query, fragment) + scheme, netloc, url, params, query, fragment = _urlparse(url, scheme, allow_fragments) + result = ParseResult(scheme or '', netloc or '', url, params or '', query or '', fragment or '') return _coerce_result(result) -def _splitparams(url): +def _urlparse(url, scheme=None, allow_fragments=True): + scheme, netloc, url, query, fragment = _urlsplit(url, scheme, allow_fragments) + if (scheme or '') in uses_params and ';' in url: + url, params = _splitparams(url, allow_none=True) + else: + params = None + return (scheme, netloc, url, params, query, fragment) + +def _splitparams(url, allow_none=False): if '/' in url: i = url.find(';', url.rfind('/')) if i < 0: - return url, '' + return url, None if allow_none else '' else: i = url.find(';') return url[:i], url[i+1:] @@ -472,17 +475,23 @@ def urlsplit(url, scheme='', allow_fragments=True): """ url, scheme, _coerce_result = _coerce_args(url, scheme) + scheme, netloc, url, query, fragment = _urlsplit(url, scheme, allow_fragments) + v = SplitResult(scheme or '', netloc or '', url, query or '', fragment or '') + return _coerce_result(v) + +def _urlsplit(url, scheme=None, allow_fragments=True): # Only lstrip url as some applications rely on preserving trailing space. # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both) url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE) - scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE) - for b in _UNSAFE_URL_BYTES_TO_REMOVE: url = url.replace(b, "") - scheme = scheme.replace(b, "") + if scheme is not None: + scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE) + for b in _UNSAFE_URL_BYTES_TO_REMOVE: + scheme = scheme.replace(b, "") allow_fragments = bool(allow_fragments) - netloc = query = fragment = '' + netloc = query = fragment = None i = url.find(':') if i > 0 and url[0].isascii() and url[0].isalpha(): for c in url[:i]: @@ -503,8 +512,7 @@ def urlsplit(url, scheme='', allow_fragments=True): if '?' in url: url, query = url.split('?', 1) _checknetloc(netloc) - v = SplitResult(scheme, netloc, url, query, fragment) - return _coerce_result(v) + return (scheme, netloc, url, query, fragment) def urlunparse(components): """Put a parsed URL back together again. This may result in a @@ -513,9 +521,15 @@ def urlunparse(components): (the draft states that these are equivalent).""" scheme, netloc, url, params, query, fragment, _coerce_result = ( _coerce_args(*components)) + if not netloc: + if scheme and scheme in uses_netloc and (not url or url[:1] == '/'): + netloc = '' + else: + netloc = None if params: url = "%s;%s" % (url, params) - return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment))) + return _coerce_result(_urlunsplit(scheme or None, netloc, url, + query or None, fragment or None)) def urlunsplit(components): """Combine the elements of a tuple as returned by urlsplit() into a @@ -525,20 +539,27 @@ def urlunsplit(components): empty query; the RFC states that these are equivalent).""" scheme, netloc, url, query, fragment, _coerce_result = ( _coerce_args(*components)) - if netloc: + if not netloc: + if scheme and scheme in uses_netloc and (not url or url[:1] == '/'): + netloc = '' + else: + netloc = None + return _coerce_result(_urlunsplit(scheme or None, netloc, url, + query or None, fragment or None)) + +def _urlunsplit(scheme, netloc, url, query, fragment): + if netloc is not None: if url and url[:1] != '/': url = '/' + url url = '//' + netloc + url elif url[:2] == '//': url = '//' + url - elif scheme and scheme in uses_netloc and (not url or url[:1] == '/'): - url = '//' + url if scheme: url = scheme + ':' + url - if query: + if query is not None: url = url + '?' + query - if fragment: + if fragment is not None: url = url + '#' + fragment - return _coerce_result(url) + return url def urljoin(base, url, allow_fragments=True): """Join a base URL and a possibly relative URL to form an absolute @@ -549,26 +570,29 @@ def urljoin(base, url, allow_fragments=True): return base base, url, _coerce_result = _coerce_args(base, url) - bscheme, bnetloc, bpath, bparams, bquery, bfragment = \ - urlparse(base, '', allow_fragments) - scheme, netloc, path, params, query, fragment = \ - urlparse(url, bscheme, allow_fragments) + bscheme, bnetloc, bpath, bquery, bfragment = \ + _urlsplit(base, None, allow_fragments) + scheme, netloc, path, query, fragment = \ + _urlsplit(url, None, allow_fragments) + if scheme is None: + scheme = bscheme if scheme != bscheme or scheme not in uses_relative: return _coerce_result(url) if scheme in uses_netloc: if netloc: - return _coerce_result(urlunparse((scheme, netloc, path, - params, query, fragment))) + return _coerce_result(_urlunsplit(scheme, netloc, path, + query, fragment)) netloc = bnetloc - if not path and not params: + if not path: path = bpath - params = bparams - if not query: + if query is None: query = bquery - return _coerce_result(urlunparse((scheme, netloc, path, - params, query, fragment))) + if fragment is None: + fragment = bfragment + return _coerce_result(_urlunsplit(scheme, netloc, path, + query, fragment)) base_parts = bpath.split('/') if base_parts[-1] != '': @@ -605,8 +629,8 @@ def urljoin(base, url, allow_fragments=True): # then we need to append the trailing '/' resolved_path.append('') - return _coerce_result(urlunparse((scheme, netloc, '/'.join( - resolved_path) or '/', params, query, fragment))) + return _coerce_result(_urlunsplit(scheme, netloc, '/'.join( + resolved_path) or '/', query, fragment)) def urldefrag(url): @@ -618,12 +642,12 @@ def urldefrag(url): """ url, _coerce_result = _coerce_args(url) if '#' in url: - s, n, p, a, q, frag = urlparse(url) - defrag = urlunparse((s, n, p, a, q, '')) + s, n, p, q, frag = _urlsplit(url) + defrag = _urlunsplit(s, n, p, q, None) else: frag = '' defrag = url - return _coerce_result(DefragResult(defrag, frag)) + return _coerce_result(DefragResult(defrag, frag or '')) _hexdig = '0123456789ABCDEFabcdef' _hextobyte = None diff --git a/Misc/NEWS.d/next/Library/2024-08-23-22-01-30.gh-issue-76960.vsANPu.rst b/Misc/NEWS.d/next/Library/2024-08-23-22-01-30.gh-issue-76960.vsANPu.rst new file mode 100644 index 00000000000..7ce2baee3f9 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-08-23-22-01-30.gh-issue-76960.vsANPu.rst @@ -0,0 +1,5 @@ +Fix :func:`urllib.parse.urljoin` and :func:`urllib.parse.urldefrag` for URIs +containing empty components. For example, :func:`!urljoin()` with relative +reference "?" now sets empty query and removes fragment. +Preserve empty components (authority, params, query, fragment) in :func:`!urljoin`. +Preserve empty components (authority, params, query) in :func:`!urldefrag`.