[3.8] bpo-27657: Fix urlparse() with numeric paths (GH-661) (#16839)

* bpo-27657: Fix urlparse() with numeric paths

  Revert parsing decision from bpo-754016 in favor of the documented
  consensus in bpo-16932 of how to treat strings without a // to
  designate the netloc.

* bpo-22891: Remove urlsplit() optimization for 'http' prefixed inputs.

(cherry picked from commit 5a88d50ff0)

Co-authored-by: Tim Graham <timograham@gmail.com>
2019-10-18 08:23:14 -07:00 · 2019-10-18 08:23:14 -07:00 · 0f3187c1ce
parent de812682a6
commit 0f3187c1ce
3 changed files with 9 additions and 25 deletions
--- a/Lib/test/test_urlparse.py
+++ b/Lib/test/test_urlparse.py
@@ -709,15 +709,17 @@ class UrlParseTestCase(unittest.TestCase):
    def test_portseparator(self):
        # Issue 754016 makes changes for port separator ':' from scheme separator
-        self.assertEqual(urllib.parse.urlparse("path:80"),
-                ('','','path:80','','',''))
+        self.assertEqual(urllib.parse.urlparse("http:80"), ('http','','80','','',''))
+        self.assertEqual(urllib.parse.urlparse("https:80"), ('https','','80','','',''))
+        self.assertEqual(urllib.parse.urlparse("path:80"), ('path','','80','','',''))
        self.assertEqual(urllib.parse.urlparse("http:"),('http','','','','',''))
        self.assertEqual(urllib.parse.urlparse("https:"),('https','','','','',''))
        self.assertEqual(urllib.parse.urlparse("http://www.python.org:80"),
                ('http','www.python.org:80','','','',''))
        # As usual, need to check bytes input as well
-        self.assertEqual(urllib.parse.urlparse(b"path:80"),
-                (b'',b'',b'path:80',b'',b'',b''))
+        self.assertEqual(urllib.parse.urlparse(b"http:80"), (b'http',b'',b'80',b'',b'',b''))
+        self.assertEqual(urllib.parse.urlparse(b"https:80"), (b'https',b'',b'80',b'',b'',b''))
+        self.assertEqual(urllib.parse.urlparse(b"path:80"), (b'path',b'',b'80',b'',b'',b''))
        self.assertEqual(urllib.parse.urlparse(b"http:"),(b'http',b'',b'',b'',b'',b''))
        self.assertEqual(urllib.parse.urlparse(b"https:"),(b'https',b'',b'',b'',b'',b''))
        self.assertEqual(urllib.parse.urlparse(b"http://www.python.org:80"),
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -431,31 +431,11 @@ def urlsplit(url, scheme='', allow_fragments=True):
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
-        if url[:i] == 'http': # optimize the common case
-            url = url[i+1:]
-            if url[:2] == '//':
-                netloc, url = _splitnetloc(url, 2)
-                if (('[' in netloc and ']' not in netloc) or
-                        (']' in netloc and '[' not in netloc)):
-                    raise ValueError("Invalid IPv6 URL")
-            if allow_fragments and '#' in url:
-                url, fragment = url.split('#', 1)
-            if '?' in url:
-                url, query = url.split('?', 1)
-            _checknetloc(netloc)
-            v = SplitResult('http', netloc, url, query, fragment)
-            _parse_cache[key] = v
-            return _coerce_result(v)
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
-            # make sure "url" is not actually a port number (in which case
-            # "scheme" is really part of the path)
-            rest = url[i+1:]
-            if not rest or any(c not in '0123456789' for c in rest):
-                # not a port number
-                scheme, url = url[:i].lower(), rest
+            scheme, url = url[:i].lower(), url[i+1:]
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
--- a/Misc/NEWS.d/next/Library/2017-12-26-14-32-23.bpo-27657.6BhyVK.rst
+++ b/Misc/NEWS.d/next/Library/2017-12-26-14-32-23.bpo-27657.6BhyVK.rst
@@ -0,0 +1,2 @@
+Fix urllib.parse.urlparse() with numeric paths. A string like "path:80" is
+no longer parsed as a path but as a scheme ("path") and a path ("80").
		`@ -0,0 +1,2 @@`
							`Fix urllib.parse.urlparse() with numeric paths. A string like "path:80" is`
							`no longer parsed as a path but as a scheme ("path") and a path ("80").`