bpo-40409: Updates urlsplit scheme validation logic

* Checks that the first character of a scheme is a letter.
* Uses frozensets for scheme character validation instead of a string,
  because membership tests on a string require a linear scan when parsing
* Adds a unit test

Assuming this change is appropriate, it should be easy to backport to older
versions of Python.

It is worth noting that this *does not* attempt to fix any other potential
parsing issues. While testing I did notice that `urlsplit` returns "paths"
that do not look valid at a glance both before AND after this change
(e.g. `+git+ssh:///+git+ssh://git@github.com/user/project.git`).
This commit is contained in:
Samani G. Gikandi 2020-04-27 13:01:04 -07:00
parent 2b74c835a7
commit e791451e53
2 changed files with 18 additions and 6 deletions

View File

@ -1027,6 +1027,17 @@ class UrlParseTestCase(unittest.TestCase):
with self.assertRaises(ValueError):
urllib.parse.urlsplit(url)
def test_urlsplit_invalid_scheme(self):
# Check that urlsplit() reports an empty scheme when the text before the
# first ':' is not a valid scheme name.
# Baseline URL: on its own it parses with scheme "git+ssh" (asserted below).
base_url = "git+ssh://git@github.com/user/project.git"
# Digits, '+' and '-' are accepted inside a scheme but not as its first
# character, which must be a letter.
# NOTE(review): '.' is also a valid non-leading scheme character but is
# not exercised as a prefix here -- consider adding it.
illegal_prefixes = "1234567890+-"
# Characters that are not valid anywhere in a scheme name.
illegal_chars = "Å_/"
self.assertEqual(urllib.parse.urlsplit(base_url).scheme, "git+ssh")
# Prepending any single one of these characters must invalidate the
# scheme, so urlsplit() is expected to return scheme == "".
for prefix in illegal_prefixes + illegal_chars:
split_url = urllib.parse.urlsplit(prefix+base_url)
self.assertEqual(split_url.scheme, "")
class Utility_Tests(unittest.TestCase):
"""Testcase to test the various utility functions in the urllib."""
# In Python 2 this test class was in test_urllib.

View File

@ -73,9 +73,10 @@ uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
'file', 'prospero']
# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
'0123456789'
scheme_alpha_chars = frozenset('abcdefghijklmnopqrstuvwxyz'
'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
scheme_chars = scheme_alpha_chars.union('0123456789'
'+-.')
# XXX: Consider replacing with functools.lru_cache
@ -462,8 +463,8 @@ def urlsplit(url, scheme='', allow_fragments=True):
clear_cache()
netloc = query = fragment = ''
i = url.find(':')
if i > 0:
for c in url[:i]:
if i > 0 and url[0] in scheme_alpha_chars:
for c in url[1:i]:
if c not in scheme_chars:
break
else: