From 41e4faa82bdf4fb601a97565bf30ee683c4bfd50 Mon Sep 17 00:00:00 2001 From: Johannes Gijsbers Date: Sun, 9 Jan 2005 15:29:10 +0000 Subject: [PATCH] Patch #712317: In URLs such as http://www.example.com?query=spam, treat '?' as a delimiter. Previously, the 'network location' (<authority> in RFC 2396) would become 'www.example.com?query=spam', while RFC 2396 does not allow a '?' in <authority>. See bug #548176 for further discussion. --- Lib/test/test_urlparse.py | 77 +++++++++++++++++++++++++-------------- Lib/urlparse.py | 25 ++++++------- 2 files changed, 62 insertions(+), 40 deletions(-) diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index 8932b3c7be2..04572ba60df 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -8,20 +8,22 @@ RFC1808_BASE = "http://a/b/c/d;p?q#f" RFC2396_BASE = "http://a/b/c/d;p?q" class UrlParseTestCase(unittest.TestCase): - def test_frags(self): - for url, parsed, split in [ - ('http://www.python.org', - ('http', 'www.python.org', '', '', '', ''), - ('http', 'www.python.org', '', '', '')), - ('http://www.python.org#abc', - ('http', 'www.python.org', '', '', '', 'abc'), - ('http', 'www.python.org', '', '', 'abc')), - ('http://www.python.org/#abc', - ('http', 'www.python.org', '/', '', '', 'abc'), - ('http', 'www.python.org', '/', '', 'abc')), - (RFC1808_BASE, - ('http', 'a', '/b/c/d', 'p', 'q', 'f'), - ('http', 'a', '/b/c/d;p', 'q', 'f')), + + def checkRoundtrips(self, url, parsed, split): + result = urlparse.urlparse(url) + self.assertEqual(result, parsed) + # put it back together and it should be the same + result2 = urlparse.urlunparse(result) + self.assertEqual(result2, url) + + # check the roundtrip using urlsplit() as well + result = urlparse.urlsplit(url) + self.assertEqual(result, split) + result2 = urlparse.urlunsplit(result) + self.assertEqual(result2, url) + + def test_roundtrips(self): + testcases = [ ('file:///tmp/junk.txt', ('file', '', '/tmp/junk.txt', '', '', ''), ('file', '', '/tmp/junk.txt', '', '')), 
@@ -29,20 +31,41 @@ class UrlParseTestCase(unittest.TestCase): ('imap', 'mail.python.org', '/mbox1', '', '', ''), ('imap', 'mail.python.org', '/mbox1', '', '')), ('mms://wms.sys.hinet.net/cts/Drama/09006251100.asf', - ('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf', '', '', ''), - ('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf', '', '')), - ]: - result = urlparse.urlparse(url) - self.assertEqual(result, parsed) - # put it back together and it should be the same - result2 = urlparse.urlunparse(result) - self.assertEqual(result2, url) + ('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf', + '', '', ''), + ('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf', + '', '')), + ] + for url, parsed, split in testcases: + self.checkRoundtrips(url, parsed, split) - # check the roundtrip using urlsplit() as well - result = urlparse.urlsplit(url) - self.assertEqual(result, split) - result2 = urlparse.urlunsplit(result) - self.assertEqual(result2, url) + def test_http_roundtrips(self): + # urlparse.urlsplit treats 'http:' as an optimized special case, + # so we test both 'http:' and 'https:' in all the following. + # Three cheers for white box knowledge! 
+ testcases = [ + ('://www.python.org', + ('www.python.org', '', '', '', ''), + ('www.python.org', '', '', '')), + ('://www.python.org#abc', + ('www.python.org', '', '', '', 'abc'), + ('www.python.org', '', '', 'abc')), + ('://www.python.org?q=abc', + ('www.python.org', '', '', 'q=abc', ''), + ('www.python.org', '', 'q=abc', '')), + ('://www.python.org/#abc', + ('www.python.org', '/', '', '', 'abc'), + ('www.python.org', '/', '', 'abc')), + ('://a/b/c/d;p?q#f', + ('a', '/b/c/d', 'p', 'q', 'f'), + ('a', '/b/c/d;p', 'q', 'f')), + ] + for scheme in ('http', 'https'): + for url, parsed, split in testcases: + url = scheme + url + parsed = (scheme,) + parsed + split = (scheme,) + split + self.checkRoundtrips(url, parsed, split) def checkJoin(self, base, relurl, expected): self.assertEqual(urlparse.urljoin(base, relurl), expected, diff --git a/Lib/urlparse.py b/Lib/urlparse.py index 9c762725474..8469139344b 100644 --- a/Lib/urlparse.py +++ b/Lib/urlparse.py @@ -63,6 +63,15 @@ def _splitparams(url): i = url.find(';') return url[:i], url[i+1:] +def _splitnetloc(url, start=0): + for c in '/?#': # the order is important! + delim = url.find(c, start) + if delim >= 0: + break + else: + delim = len(url) + return url[start:delim], url[delim:] + def urlsplit(url, scheme='', allow_fragments=1): """Parse a URL into 5 components: <scheme>://<netloc>/<path>?<query>#<fragment> @@ -82,13 +91,7 @@ def urlsplit(url, scheme='', allow_fragments=1): scheme = url[:i].lower() url = url[i+1:] if url[:2] == '//': - i = url.find('/', 2) - if i < 0: - i = url.find('#') - if i < 0: - i = len(url) - netloc = url[2:i] - url = url[i:] + netloc, url = _splitnetloc(url, 2) if allow_fragments and '#' in url: url, fragment = url.split('#', 1) if '?' 
in url: @@ -101,12 +104,8 @@ def urlsplit(url, scheme='', allow_fragments=1): break else: scheme, url = url[:i].lower(), url[i+1:] - if scheme in uses_netloc: - if url[:2] == '//': - i = url.find('/', 2) - if i < 0: - i = len(url) - netloc, url = url[2:i], url[i:] + if scheme in uses_netloc and url[:2] == '//': + netloc, url = _splitnetloc(url, 2) if allow_fragments and scheme in uses_fragment and '#' in url: url, fragment = url.split('#', 1) if scheme in uses_query and '?' in url: