bpo-40409: Updates urlsplit scheme validation logic
* Checks that the first character of a scheme is a letter.
* Uses frozensets for scheme character validation instead of a string, because using a string led to linear scanning when parsing.
* Adds a unit test.

Assuming this change is appropriate, it should be easy to backport to older versions of Python. It is worth noting that this *does not* attempt to fix any other potential parsing issues. While testing, I noticed that `urlsplit` returns "paths" that do not look valid at a glance, both before AND after this change (e.g. `+git+ssh:///+git+ssh://git@github.com/user/project.git`).
This commit is contained in:
parent
2b74c835a7
commit
e791451e53
|
@ -1027,6 +1027,17 @@ class UrlParseTestCase(unittest.TestCase):
|
|||
with self.assertRaises(ValueError):
|
||||
urllib.parse.urlsplit(url)
|
||||
|
||||
def test_urlsplit_invalid_scheme(self):
|
||||
base_url = "git+ssh://git@github.com/user/project.git"
|
||||
illegal_prefixes = "1234567890+-"
|
||||
illegal_chars = "Å_/"
|
||||
|
||||
self.assertEqual(urllib.parse.urlsplit(base_url).scheme, "git+ssh")
|
||||
|
||||
for prefix in illegal_prefixes + illegal_chars:
|
||||
split_url = urllib.parse.urlsplit(prefix+base_url)
|
||||
self.assertEqual(split_url.scheme, "")
|
||||
|
||||
class Utility_Tests(unittest.TestCase):
|
||||
"""Testcase to test the various utility functions in the urllib."""
|
||||
# In Python 2 this test class was in test_urllib.
|
||||
|
|
|
@ -73,9 +73,10 @@ uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
|
|||
'file', 'prospero']
|
||||
|
||||
# Characters valid in scheme names
|
||||
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
|
||||
'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
||||
'0123456789'
|
||||
scheme_alpha_chars = frozenset('abcdefghijklmnopqrstuvwxyz'
|
||||
'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
|
||||
|
||||
scheme_chars = scheme_alpha_chars.union('0123456789'
|
||||
'+-.')
|
||||
|
||||
# XXX: Consider replacing with functools.lru_cache
|
||||
|
@ -462,8 +463,8 @@ def urlsplit(url, scheme='', allow_fragments=True):
|
|||
clear_cache()
|
||||
netloc = query = fragment = ''
|
||||
i = url.find(':')
|
||||
if i > 0:
|
||||
for c in url[:i]:
|
||||
if i > 0 and url[0] in scheme_alpha_chars:
|
||||
for c in url[1:i]:
|
||||
if c not in scheme_chars:
|
||||
break
|
||||
else:
|
||||
|
|
Loading…
Reference in New Issue