Fix parsing of parameters from a URL; urlparse() did not check that it only
split parameters from the last path segment.  Introduces two new functions,
urlsplit() and urlunsplit(), that do the simpler job of splitting the URL
without monkeying around with the parameters field, since that was not being
handled properly.
This closes bug #478038.
Fred Drake 2001-11-16 02:52:57 +00:00
parent c66ff203bb
commit 5751a22ede
1 changed file with 46 additions and 34 deletions
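
To make the described change concrete, here is a hedged sketch of the new behavior as an interactive session; the URL and values are invented, and the expected output follows the semantics described in the message and the diff below:

    >>> from urlparse import urlparse, urlsplit, urlunsplit
    >>> # ';' appears in an *earlier* path segment, not the last one.
    >>> url = 'http://example.com/seg;one/seg-two?q=1#frag'
    >>> urlparse(url)   # params stay empty; the ';' is left in the path
    ('http', 'example.com', '/seg;one/seg-two', '', 'q=1', 'frag')
    >>> # Before this fix, urlparse() split at the first ';' it found,
    >>> # yielding path '/seg' and params 'one/seg-two'.
    >>> urlsplit(url)   # 5-tuple: no params field at all
    ('http', 'example.com', '/seg;one/seg-two', 'q=1', 'frag')
    >>> urlunsplit(urlsplit(url)) == url
    True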


@@ -43,19 +43,42 @@ def clear_cache():
     _parse_cache = {}
 
 
-def urlparse(url, scheme = '', allow_fragments = 1):
+def urlparse(url, scheme='', allow_fragments=1):
     """Parse a URL into 6 components:
     <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
     Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
     Note that we don't break the components up in smaller bits
     (e.g. netloc is a single string) and we don't expand % escapes."""
+    tuple = urlsplit(url, scheme, allow_fragments)
+    scheme, netloc, url, query, fragment = tuple
+    if scheme in uses_params and ';' in url:
+        url, params = _splitparams(url)
+    else:
+        params = ''
+    return scheme, netloc, url, params, query, fragment
+
+def _splitparams(url):
+    if '/' in url:
+        i = url.find(';', url.rfind('/'))
+        if i < 0:
+            return url, ''
+    else:
+        i = url.find(';')
+    return url[:i], url[i+1:]
+
+def urlsplit(url, scheme='', allow_fragments=1):
+    """Parse a URL into 5 components:
+    <scheme>://<netloc>/<path>?<query>#<fragment>
+    Return a 5-tuple: (scheme, netloc, path, query, fragment).
+    Note that we don't break the components up in smaller bits
+    (e.g. netloc is a single string) and we don't expand % escapes."""
     key = url, scheme, allow_fragments
     cached = _parse_cache.get(key, None)
     if cached:
         return cached
     if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
         clear_cache()
-    netloc = params = query = fragment = ''
+    netloc = query = fragment = ''
     i = url.find(':')
     if i > 0:
         if url[:i] == 'http': # optimize the common case
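
The helper added above only looks for ';' at or after the last '/', which is what confines parameter splitting to the final path segment. A small standalone restatement of that logic (the name and sample paths are invented for illustration) behaves like this:

    def _splitparams_demo(path):
        # Same logic as the new _splitparams(): a ';' only starts the
        # params field when it occurs in the last path segment.
        if '/' in path:
            i = path.find(';', path.rfind('/'))
            if i < 0:
                return path, ''
        else:
            i = path.find(';')
        return path[:i], path[i+1:]

    print _splitparams_demo('/a;x/b;y')    # ('/a;x/b', 'y')
    print _splitparams_demo('/a;x/b')      # ('/a;x/b', '')
    print _splitparams_demo('doc;type=a')  # ('doc', 'type=a')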
@@ -67,20 +90,11 @@ def urlparse(url, scheme = '', allow_fragments = 1):
                     i = len(url)
                 netloc = url[2:i]
                 url = url[i:]
-            if allow_fragments:
-                i = url.rfind('#')
-                if i >= 0:
-                    fragment = url[i+1:]
-                    url = url[:i]
-            i = url.find('?')
-            if i >= 0:
-                query = url[i+1:]
-                url = url[:i]
-            i = url.find(';')
-            if i >= 0:
-                params = url[i+1:]
-                url = url[:i]
-            tuple = scheme, netloc, url, params, query, fragment
+            if allow_fragments and '#' in url:
+                url, fragment = url.split('#', 1)
+            if '?' in url:
+                url, query = url.split('?', 1)
+            tuple = scheme, netloc, url, query, fragment
             _parse_cache[key] = tuple
             return tuple
         for c in url[:i]:
@@ -94,19 +108,11 @@ def urlparse(url, scheme = '', allow_fragments = 1):
             if i < 0:
                 i = len(url)
             netloc, url = url[2:i], url[i:]
-    if allow_fragments and scheme in uses_fragment:
-        i = url.rfind('#')
-        if i >= 0:
-            url, fragment = url[:i], url[i+1:]
-    if scheme in uses_query:
-        i = url.find('?')
-        if i >= 0:
-            url, query = url[:i], url[i+1:]
-    if scheme in uses_params:
-        i = url.find(';')
-        if i >= 0:
-            url, params = url[:i], url[i+1:]
-    tuple = scheme, netloc, url, params, query, fragment
+    if allow_fragments and scheme in uses_fragment and '#' in url:
+        url, fragment = url.split('#', 1)
+    if scheme in uses_query and '?' in url:
+        url, query = url.split('?', 1)
+    tuple = scheme, netloc, url, query, fragment
     _parse_cache[key] = tuple
     return tuple
 
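
Both urlparse() branches above now follow the same idiom: test for the delimiter with 'in' and, when present, split once from the left. Note that split('#', 1) takes everything after the first '#' as the fragment, whereas the old rfind('#') took everything after the last one. A minimal sketch of the idiom, on an invented URL:

    url = 'http://h/p?x=1#frag'
    query = fragment = ''
    if '#' in url:
        url, fragment = url.split('#', 1)   # one split at the first '#'
    if '?' in url:
        url, query = url.split('?', 1)
    print (url, query, fragment)
    # ('http://h/p', 'x=1', 'frag')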
@@ -115,13 +121,16 @@ def urlunparse((scheme, netloc, url, params, query, fragment)):
     slightly different, but equivalent URL, if the URL that was parsed
     originally had redundant delimiters, e.g. a ? with an empty query
     (the draft states that these are equivalent)."""
+    if params:
+        url = "%s;%s" % (url, params)
+    return urlunsplit((scheme, netloc, url, query, fragment))
+
+def urlunsplit((scheme, netloc, url, query, fragment)):
     if netloc or (scheme in uses_netloc and url[:2] == '//'):
         if url and url[:1] != '/': url = '/' + url
         url = '//' + (netloc or '') + url
     if scheme:
         url = scheme + ':' + url
-    if params:
-        url = url + ';' + params
     if query:
         url = url + '?' + query
     if fragment:
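
With this split, urlunparse() is reduced to folding any params back onto the path and delegating to the new urlunsplit(). A hedged example with made-up components:

    >>> from urlparse import urlunparse, urlunsplit
    >>> urlunparse(('http', 'example.com', '/doc', 'type=a', 'q=1', 'top'))
    'http://example.com/doc;type=a?q=1#top'
    >>> urlunsplit(('http', 'example.com', '/doc', 'q=1', 'top'))
    'http://example.com/doc?q=1#top'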
@@ -187,9 +196,12 @@ def urldefrag(url):
     the URL contained no fragments, the second element is the
     empty string.
     """
-    s, n, p, a, q, frag = urlparse(url)
-    defrag = urlunparse((s, n, p, a, q, ''))
-    return defrag, frag
+    if '#' in url:
+        s, n, p, a, q, frag = urlparse(url)
+        defrag = urlunparse((s, n, p, a, q, ''))
+        return defrag, frag
+    else:
+        return url, ''
 
 
 test_input = """
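
The urldefrag() change above adds a fast path: a URL with no '#' is now returned untouched instead of going through a parse/unparse round trip. Illustrative, made-up inputs:

    >>> from urlparse import urldefrag
    >>> urldefrag('http://example.com/doc#sec2')
    ('http://example.com/doc', 'sec2')
    >>> urldefrag('mailto:someone@example.com')
    ('mailto:someone@example.com', '')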