"""Parse (absolute and relative) URLs. See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June 1995. """ import sys __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", "urlsplit", "urlunsplit"] # A classification of schemes ('' means apply by default) uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap', 'wais', 'file', 'https', 'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', '', 'sftp'] uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file', 'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '', 'svn', 'svn+ssh', 'sftp'] non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais', 'imap', 'snews', 'sip', 'sips'] uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap', 'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips', 'mms', '', 'sftp'] uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms', 'gopher', 'rtsp', 'rtspu', 'sip', 'sips', ''] uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais', 'https', 'shttp', 'snews', 'file', 'prospero', ''] # Characters valid in scheme names scheme_chars = ('abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' '0123456789' '+-.') MAX_CACHE_SIZE = 20 _parse_cache = {} def clear_cache(): """Clear the parse cache.""" _parse_cache.clear() class ResultMixin(object): """Shared methods for the parsed result objects.""" @property def username(self): netloc = self.netloc if "@" in netloc: userinfo = netloc.rsplit("@", 1)[0] if ":" in userinfo: userinfo = userinfo.split(":", 1)[0] return userinfo return None @property def password(self): netloc = self.netloc if "@" in netloc: userinfo = netloc.rsplit("@", 1)[0] if ":" in userinfo: return userinfo.split(":", 1)[1] return None @property def hostname(self): netloc = self.netloc if "@" in netloc: netloc = netloc.rsplit("@", 1)[1] if ":" in netloc: netloc = netloc.split(":", 1)[0] return netloc.lower() or None @property def port(self): netloc = self.netloc if "@" in netloc: netloc = netloc.rsplit("@", 1)[1] if ":" in netloc: port = netloc.split(":", 1)[1] return int(port, 10) return None from collections import namedtuple class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin): __slots__ = () def geturl(self): return urlunsplit(self) class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin): __slots__ = () def geturl(self): return urlunparse(self) def urlparse(url, scheme='', allow_fragments=True): """Parse a URL into 6 components: :///;?# Return a 6-tuple: (scheme, netloc, path, params, query, fragment). Note that we don't break the components up in smaller bits (e.g. netloc is a single string) and we don't expand % escapes.""" tuple = urlsplit(url, scheme, allow_fragments) scheme, netloc, url, query, fragment = tuple if scheme in uses_params and ';' in url: url, params = _splitparams(url) else: params = '' return ParseResult(scheme, netloc, url, params, query, fragment) def _splitparams(url): if '/' in url: i = url.find(';', url.rfind('/')) if i < 0: return url, '' else: i = url.find(';') return url[:i], url[i+1:] def _splitnetloc(url, start=0): delim = len(url) # position of end of domain part of url, default is end for c in '/?#': # look for delimiters; the order is NOT important wdelim = url.find(c, start) # find first of this delim if wdelim >= 0: # if found delim = min(delim, wdelim) # use earliest delim position return url[start:delim], url[delim:] # return (domain, rest) def urlsplit(url, scheme='', allow_fragments=True): """Parse a URL into 5 components: :///?# Return a 5-tuple: (scheme, netloc, path, query, fragment). Note that we don't break the components up in smaller bits (e.g. netloc is a single string) and we don't expand % escapes.""" allow_fragments = bool(allow_fragments) key = url, scheme, allow_fragments, type(url), type(scheme) cached = _parse_cache.get(key, None) if cached: return cached if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth clear_cache() netloc = query = fragment = '' i = url.find(':') if i > 0: if url[:i] == 'http': # optimize the common case scheme = url[:i].lower() url = url[i+1:] if url[:2] == '//': netloc, url = _splitnetloc(url, 2) if allow_fragments and '#' in url: url, fragment = url.split('#', 1) if '?' in url: url, query = url.split('?', 1) v = SplitResult(scheme, netloc, url, query, fragment) _parse_cache[key] = v return v for c in url[:i]: if c not in scheme_chars: break else: scheme, url = url[:i].lower(), url[i+1:] if scheme in uses_netloc and url[:2] == '//': netloc, url = _splitnetloc(url, 2) if allow_fragments and scheme in uses_fragment and '#' in url: url, fragment = url.split('#', 1) if scheme in uses_query and '?' in url: url, query = url.split('?', 1) v = SplitResult(scheme, netloc, url, query, fragment) _parse_cache[key] = v return v def urlunparse(components): """Put a parsed URL back together again. This may result in a slightly different, but equivalent URL, if the URL that was parsed originally had redundant delimiters, e.g. a ? with an empty query (the draft states that these are equivalent).""" scheme, netloc, url, params, query, fragment = components if params: url = "%s;%s" % (url, params) return urlunsplit((scheme, netloc, url, query, fragment)) def urlunsplit(components): scheme, netloc, url, query, fragment = components if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'): if url and url[:1] != '/': url = '/' + url url = '//' + (netloc or '') + url if scheme: url = scheme + ':' + url if query: url = url + '?' + query if fragment: url = url + '#' + fragment return url def urljoin(base, url, allow_fragments=True): """Join a base URL and a possibly relative URL to form an absolute interpretation of the latter.""" if not base: return url if not url: return base bscheme, bnetloc, bpath, bparams, bquery, bfragment = \ urlparse(base, '', allow_fragments) scheme, netloc, path, params, query, fragment = \ urlparse(url, bscheme, allow_fragments) if scheme != bscheme or scheme not in uses_relative: return url if scheme in uses_netloc: if netloc: return urlunparse((scheme, netloc, path, params, query, fragment)) netloc = bnetloc if path[:1] == '/': return urlunparse((scheme, netloc, path, params, query, fragment)) if not path: path = bpath if not params: params = bparams else: path = path[:-1] return urlunparse((scheme, netloc, path, params, query, fragment)) if not query: query = bquery return urlunparse((scheme, netloc, path, params, query, fragment)) segments = bpath.split('/')[:-1] + path.split('/') # XXX The stuff below is bogus in various ways... if segments[-1] == '.': segments[-1] = '' while '.' in segments: segments.remove('.') while 1: i = 1 n = len(segments) - 1 while i < n: if (segments[i] == '..' and segments[i-1] not in ('', '..')): del segments[i-1:i+1] break i = i+1 else: break if segments == ['', '..']: segments[-1] = '' elif len(segments) >= 2 and segments[-1] == '..': segments[-2:] = [''] return urlunparse((scheme, netloc, '/'.join(segments), params, query, fragment)) def urldefrag(url): """Removes any existing fragment from URL. Returns a tuple of the defragmented URL and the fragment. If the URL contained no fragments, the second element is the empty string. """ if '#' in url: s, n, p, a, q, frag = urlparse(url) defrag = urlunparse((s, n, p, a, q, '')) return defrag, frag else: return url, '' _hextochr = dict(('%02x' % i, chr(i)) for i in range(256)) _hextochr.update(('%02X' % i, chr(i)) for i in range(256)) def unquote(s): """unquote('abc%20def') -> 'abc def'.""" res = s.split('%') for i in range(1, len(res)): item = res[i] try: res[i] = _hextochr[item[:2]] + item[2:] except KeyError: res[i] = '%' + item except UnicodeDecodeError: res[i] = chr(int(item[:2], 16)) + item[2:] return "".join(res) def unquote_plus(s): """unquote('%7e/abc+def') -> '~/abc def'""" s = s.replace('+', ' ') return unquote(s) always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' '0123456789' '_.-') _safe_quoters= {} class Quoter: def __init__(self, safe): self.cache = {} self.safe = safe + always_safe def __call__(self, c): try: return self.cache[c] except KeyError: if ord(c) < 256: res = (c in self.safe) and c or ('%%%02X' % ord(c)) self.cache[c] = res return res else: return "".join(['%%%02X' % i for i in c.encode("utf-8")]) def quote(s, safe = '/'): """quote('abc def') -> 'abc%20def' Each part of a URL, e.g. the path info, the query, etc., has a different set of reserved characters that must be quoted. RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists the following reserved characters. reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | "," Each of these characters is reserved in some component of a URL, but not necessarily in all of them. By default, the quote function is intended for quoting the path section of a URL. Thus, it will not encode '/'. This character is reserved, but in typical usage the quote function is being called on a path where the existing slash characters are used as reserved characters. """ cachekey = (safe, always_safe) try: quoter = _safe_quoters[cachekey] except KeyError: quoter = Quoter(safe) _safe_quoters[cachekey] = quoter res = map(quoter, s) return ''.join(res) def quote_plus(s, safe = ''): """Quote the query fragment of a URL; replacing ' ' with '+'""" if ' ' in s: s = quote(s, safe + ' ') return s.replace(' ', '+') return quote(s, safe) def urlencode(query,doseq=0): """Encode a sequence of two-element tuples or dictionary into a URL query string. If any values in the query arg are sequences and doseq is true, each sequence element is converted to a separate parameter. If the query arg is a sequence of two-element tuples, the order of the parameters in the output will match the order of parameters in the input. """ if hasattr(query,"items"): # mapping objects query = query.items() else: # it's a bother at times that strings and string-like objects are # sequences... try: # non-sequence items should not work with len() # non-empty strings will fail this if len(query) and not isinstance(query[0], tuple): raise TypeError # zero-length sequences of all types will get here and succeed, # but that's a minor nit - since the original implementation # allowed empty dicts that type of behavior probably should be # preserved for consistency except TypeError: ty,va,tb = sys.exc_info() raise TypeError("not a valid non-string sequence or mapping object").with_traceback(tb) l = [] if not doseq: # preserve old behavior for k, v in query: k = quote_plus(str(k)) v = quote_plus(str(v)) l.append(k + '=' + v) else: for k, v in query: k = quote_plus(str(k)) if isinstance(v, str): v = quote_plus(v) l.append(k + '=' + v) elif isinstance(v, str): # is there a reasonable way to convert to ASCII? # encode generates a string, but "replace" or "ignore" # lose information and "strict" can raise UnicodeError v = quote_plus(v.encode("ASCII","replace")) l.append(k + '=' + v) else: try: # is this a sufficient test for sequence-ness? x = len(v) except TypeError: # not a sequence v = quote_plus(str(v)) l.append(k + '=' + v) else: # loop over the sequence for elt in v: l.append(k + '=' + quote_plus(str(elt))) return '&'.join(l) # Utilities to parse URLs (most of these return None for missing parts): # unwrap('') --> 'type://host/path' # splittype('type:opaquestring') --> 'type', 'opaquestring' # splithost('//host[:port]/path') --> 'host[:port]', '/path' # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]' # splitpasswd('user:passwd') -> 'user', 'passwd' # splitport('host:port') --> 'host', 'port' # splitquery('/path?query') --> '/path', 'query' # splittag('/path#tag') --> '/path', 'tag' # splitattr('/path;attr1=value1;attr2=value2;...') -> # '/path', ['attr1=value1', 'attr2=value2', ...] # splitvalue('attr=value') --> 'attr', 'value' # urllib.parse.unquote('abc%20def') -> 'abc def' # quote('abc def') -> 'abc%20def') def to_bytes(url): """to_bytes(u"URL") --> 'URL'.""" # Most URL schemes require ASCII. If that changes, the conversion # can be relaxed. # XXX get rid of to_bytes() if isinstance(url, str): try: url = url.encode("ASCII").decode() except UnicodeError: raise UnicodeError("URL " + repr(url) + " contains non-ASCII characters") return url def unwrap(url): """unwrap('') --> 'type://host/path'.""" url = str(url).strip() if url[:1] == '<' and url[-1:] == '>': url = url[1:-1].strip() if url[:4] == 'URL:': url = url[4:].strip() return url _typeprog = None def splittype(url): """splittype('type:opaquestring') --> 'type', 'opaquestring'.""" global _typeprog if _typeprog is None: import re _typeprog = re.compile('^([^/:]+):') match = _typeprog.match(url) if match: scheme = match.group(1) return scheme.lower(), url[len(scheme) + 1:] return None, url _hostprog = None def splithost(url): """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" global _hostprog if _hostprog is None: import re _hostprog = re.compile('^//([^/?]*)(.*)$') match = _hostprog.match(url) if match: return match.group(1, 2) return None, url _userprog = None def splituser(host): """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.""" global _userprog if _userprog is None: import re _userprog = re.compile('^(.*)@(.*)$') match = _userprog.match(host) if match: return map(unquote, match.group(1, 2)) return None, host _passwdprog = None def splitpasswd(user): """splitpasswd('user:passwd') -> 'user', 'passwd'.""" global _passwdprog if _passwdprog is None: import re _passwdprog = re.compile('^([^:]*):(.*)$') match = _passwdprog.match(user) if match: return match.group(1, 2) return user, None # splittag('/path#tag') --> '/path', 'tag' _portprog = None def splitport(host): """splitport('host:port') --> 'host', 'port'.""" global _portprog if _portprog is None: import re _portprog = re.compile('^(.*):([0-9]+)$') match = _portprog.match(host) if match: return match.group(1, 2) return host, None _nportprog = None def splitnport(host, defport=-1): """Split host and port, returning numeric port. Return given default port if no ':' found; defaults to -1. Return numerical port if a valid number are found after ':'. Return None if ':' but not a valid number.""" global _nportprog if _nportprog is None: import re _nportprog = re.compile('^(.*):(.*)$') match = _nportprog.match(host) if match: host, port = match.group(1, 2) try: if not port: raise ValueError("no digits") nport = int(port) except ValueError: nport = None return host, nport return host, defport _queryprog = None def splitquery(url): """splitquery('/path?query') --> '/path', 'query'.""" global _queryprog if _queryprog is None: import re _queryprog = re.compile('^(.*)\?([^?]*)$') match = _queryprog.match(url) if match: return match.group(1, 2) return url, None _tagprog = None def splittag(url): """splittag('/path#tag') --> '/path', 'tag'.""" global _tagprog if _tagprog is None: import re _tagprog = re.compile('^(.*)#([^#]*)$') match = _tagprog.match(url) if match: return match.group(1, 2) return url, None def splitattr(url): """splitattr('/path;attr1=value1;attr2=value2;...') -> '/path', ['attr1=value1', 'attr2=value2', ...].""" words = url.split(';') return words[0], words[1:] _valueprog = None def splitvalue(attr): """splitvalue('attr=value') --> 'attr', 'value'.""" global _valueprog if _valueprog is None: import re _valueprog = re.compile('^([^=]*)=(.*)$') match = _valueprog.match(attr) if match: return match.group(1, 2) return attr, None test_input = """ http://a/b/c/d g:h = http:g = http: = g = ./g = g/ = /g = //g = ?y = g?y = g?y/./x = . = ./ = .. = ../ = ../g = ../.. = ../../g = ../../../g = ./../g = ./g/. = /./g = g/./h = g/../h = http:g = http: = http:?y = http:g?y = http:g?y/./x = """ def test(): base = '' if sys.argv[1:]: fn = sys.argv[1] if fn == '-': fp = sys.stdin else: fp = open(fn) else: from io import StringIO fp = StringIO(test_input) for line in fp: words = line.split() if not words: continue url = words[0] parts = urlparse(url) print('%-10s : %s' % (url, parts)) abs = urljoin(base, url) if not base: base = abs wrapped = '' % abs print('%-10s = %s' % (url, wrapped)) if len(words) == 3 and words[1] == '=': if wrapped != words[2]: print('EXPECTED', words[2], '!!!!!!!!!!') if __name__ == '__main__': test()