bpo-40409: Updates urlsplit scheme validation logic

* Checks that the first character of a scheme is a letter.
* Uses frozensets for scheme char validation instead of a string, as the use of a string led to linear scanning when parsing.
* Adds a unit test.

Assuming this change is appropriate, it should be easy to backport to older versions of Python.

It is worth noting that this *does not* attempt to fix any other potential parsing issues. While testing I did notice that `urlsplit` returns "paths" that do not look valid at a glance both before AND after this change (e.g. `+git+ssh:///+git+ssh://git@github.com/user/project.git`).
2020-04-27 13:01:04 -07:00 · 2020-04-27 13:01:04 -07:00 · e791451e53
parent 2b74c835a7
commit e791451e53
2 changed files with 18 additions and 6 deletions
--- a/Lib/test/test_urlparse.py
+++ b/Lib/test/test_urlparse.py
@@ -1027,6 +1027,17 @@ class UrlParseTestCase(unittest.TestCase):
                        with self.assertRaises(ValueError):
                            urllib.parse.urlsplit(url)

+    def test_urlsplit_invalid_scheme(self):
+        base_url = "git+ssh://git@github.com/user/project.git"
+        illegal_prefixes = "1234567890+-"
+        illegal_chars = "Å_/"
+
+        self.assertEqual(urllib.parse.urlsplit(base_url).scheme, "git+ssh")
+
+        for prefix in illegal_prefixes + illegal_chars:
+            split_url = urllib.parse.urlsplit(prefix+base_url)
+            self.assertEqual(split_url.scheme, "")
+
 class Utility_Tests(unittest.TestCase):
    """Testcase to test the various utility functions in the urllib."""
    # In Python 2 this test class was in test_urllib.
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -73,9 +73,10 @@ uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
                 'file', 'prospero']

 # Characters valid in scheme names
-scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
-                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
-                '0123456789'
+scheme_alpha_chars = frozenset('abcdefghijklmnopqrstuvwxyz'
+                               'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
+
+scheme_chars = scheme_alpha_chars.union('0123456789'
                                        '+-.')

 # XXX: Consider replacing with functools.lru_cache
@@ -462,8 +463,8 @@ def urlsplit(url, scheme='', allow_fragments=True):
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
-    if i > 0:
-        for c in url[:i]:
+    if i > 0 and url[0] in scheme_alpha_chars:
+        for c in url[1:i]:
            if c not in scheme_chars:
                break
        else: