# cpython/Lib/urlparse.py

"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""

# Scheme classification tables.  The empty string '' stands for "no
# scheme given"; listing it makes the behaviour apply by default.

# Schemes for which urljoin() will resolve a relative reference.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'wais', 'file', 'https', 'shttp',
    'prospero', 'rtsp', 'rtspu', '',
]

# Schemes whose URLs may carry a '//netloc' part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais', 'file', 'https',
    'shttp', 'snews', 'prospero', 'rtsp', 'rtspu', '',
]

# Schemes that are not hierarchical.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais', 'snews', 'sip',
]

# Schemes for which urlparse() splits off ';params'.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'https', 'shttp', 'rtsp', 'rtspu',
    'sip', '',
]

# Schemes for which urlparse() splits off '?query'.
uses_query = [
    'http', 'wais', 'https', 'shttp', 'gopher', 'rtsp', 'rtspu', 'sip',
    '',
]

# Schemes for which urlparse() splits off '#fragment'.
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais', 'https',
    'shttp', 'snews', 'file', 'prospero', '',
]

# Characters valid in scheme names (spelled out explicitly rather than
# derived from any locale-dependent alphabet).
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789'
    '+-.'
)

# urlparse() memoizes its results in _parse_cache; once the cache holds
# MAX_CACHE_SIZE entries it is emptied before the next result is stored.
MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Forget every memoized urlparse() result."""
    # Rebind the module-level name to a brand-new dictionary rather than
    # emptying the existing one in place.
    global _parse_cache
    _parse_cache = {}


def _splitnetloc(url):
    """Split a string starting with '//' into (netloc, remainder).

    The network location runs from just after the leading '//' up to,
    but not including, the first '/', '?' or '#'; if none of these is
    present it extends to the end of the string.  The remainder keeps
    its leading delimiter.
    """
    end = len(url)
    for delim in '/?#':
        pos = url.find(delim, 2)
        if 0 <= pos < end:
            end = pos
    return url[2:end], url[end:]


def urlparse(url, scheme = '', allow_fragments = 1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    url -- the URL string to split.
    scheme -- scheme to assume when the URL itself does not name one.
    allow_fragments -- if false, '#' is never treated as a fragment
        delimiter and the fragment component is always ''.

    Components that are absent come back as empty strings.  Results are
    memoized in a small module-level cache keyed on all three arguments.
    """
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key)
    if cached is not None:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = params = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            # 'http' supports every component, so no table lookups are
            # needed on this path.
            scheme = 'http'
            url = url[i+1:]
            if url[:2] == '//':
                # The netloc must stop at '?' or '#' as well as '/';
                # otherwise 'http://host?q' or 'http://host#f' would
                # swallow the query/fragment into the netloc.
                netloc, url = _splitnetloc(url)
            if allow_fragments:
                i = url.rfind('#')
                if i >= 0:
                    url, fragment = url[:i], url[i+1:]
            i = url.find('?')
            if i >= 0:
                url, query = url[:i], url[i+1:]
            i = url.find(';')
            if i >= 0:
                url, params = url[:i], url[i+1:]
            result = scheme, netloc, url, params, query, fragment
            _parse_cache[key] = result
            return result
        # Text before the first ':' is a scheme only if every character
        # is a legal scheme character; otherwise the caller-supplied
        # default scheme is kept and the URL is left intact.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if scheme in uses_netloc and url[:2] == '//':
        netloc, url = _splitnetloc(url)
    if allow_fragments and scheme in uses_fragment:
        i = url.rfind('#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if scheme in uses_query:
        i = url.find('?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if scheme in uses_params:
        i = url.find(';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    result = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = result
    return result

def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    data -- a sequence of exactly six items in the order returned by
        urlparse(): (scheme, netloc, path, params, query, fragment).

    Returns the reassembled URL string.  Empty components are omitted
    together with their delimiter.
    """
    # Unpack explicitly instead of in the signature: tuple parameters
    # are not available in later versions of the language, while this
    # form accepts exactly the same single positional argument.
    scheme, netloc, url, params, query, fragment = data
    if netloc or (scheme in uses_netloc and url[:2] == '//'):
        # A non-empty path following a netloc must start with '/'.
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url

def urljoin(base, url, allow_fragments = 1):
    """Resolve a possibly relative URL against a base URL.

    base -- the URL that relative references are interpreted against.
    url -- the URL to resolve; may be absolute or relative.
    allow_fragments -- passed through to urlparse() for both URLs.

    Returns the other argument unchanged when either one is empty, and
    returns url unchanged when its scheme differs from the base's or
    does not support relative resolution.
    """
    if not base:
        return url
    if not url:
        return base
    # The base's fragment never takes part in the result.
    bscheme, bnetloc, bpath, bparams, bquery = \
            urlparse(base, '', allow_fragments)[:5]
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if not (scheme == bscheme and scheme in uses_relative):
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own host: nothing to inherit.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc come from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: take the base path, and fall back to the base's
        # params (and then query) only while the reference has none.
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Relative path: replace the last segment of the base path with the
    # reference path, then tidy up '.' and '..' segments.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    # Repeatedly drop the first interior "<name>/.." pair, rescanning
    # from the start after each removal, until none is left.
    removed = 1
    while removed:
        removed = 0
        for idx in range(1, len(segments) - 1):
            if (segments[idx] == '..'
                and segments[idx-1] not in ('', '..')):
                del segments[idx-1:idx+1]
                removed = 1
                break
    # A '..' left in final position is handled separately.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))

def urldefrag(url):
    """Split a URL into its fragment-free form and its fragment.

    Returns a 2-tuple (defragmented_url, fragment).  When urlparse()
    finds no fragment in the URL, the second element is the empty
    string.
    """
    parts = urlparse(url)
    fragment = parts[5]
    # Reassemble the first five components with an empty fragment.
    stripped = urlunparse(parts[:5] + ('',))
    return stripped, fragment


# Built-in data for test().  Each non-blank line starts with a URL; the
# first one becomes the base for all the following lines.  A line of the
# form "<url> = <URL:expected>" also states what joining <url> to the
# base is expected to produce.
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""
# XXX The result for //g is actually http://g/; is this a problem?

def test():
    """Run the urlparse()/urljoin() self-test and print the results.

    Input lines come from the file named by the first command-line
    argument ('-' means standard input), or from the built-in
    test_input when no argument is given.  The first word of each
    non-blank line is a URL; the first URL seen becomes the base for
    the rest.  For a line of the form "<url> = <expected>", a mismatch
    between the joined URL and <expected> is reported with an EXPECTED
    line.
    """
    import sys
    base = ''
    opened = None  # set only for a file opened here, which must be closed
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        # The StringIO class lives in different modules depending on
        # the interpreter version.
        try:
            from StringIO import StringIO
        except ImportError:
            from io import StringIO
        fp = StringIO(test_input)
    try:
        while 1:
            line = fp.readline()
            if not line: break
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                # The first URL is joined against '' and so comes back
                # unchanged; it serves as the base from here on.
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print(' '.join(['EXPECTED', words[2], '!!!!!!!!!!']))
    finally:
        if opened is not None:
            opened.close()

# Run the self-test when this file is executed as a script.
if __name__ == '__main__':
    test()
The third and final doc-string sweep by Ka-Ping Yee. The attached patches update the standard library so that all modules have docstrings beginning with one-line summaries. A new docstring was added to formatter. The docstring for os.py was updated to mention nt, os2, ce in addition to posix, dos, mac. 2000-02-04 11:28:42 -04:00			`"""Parse (absolute and relative) URLs.`

			`See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,`
			`UC Irvine, June 1995.`
			`"""`
New tty/pty modules by Steen; new urlparser. 1994-09-12 07:36:35 -03:00
			`# A classification of schemes ('' means apply by default)`
			`uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`'https', 'shttp',`
			`'prospero', 'rtsp', 'rtspu', '']`
New tty/pty modules by Steen; new urlparser. 1994-09-12 07:36:35 -03:00			`uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`'file',`
			`'https', 'shttp', 'snews',`
			`'prospero', 'rtsp', 'rtspu', '']`
Added characteristics of shttp, https, and snews. 1997-01-02 14:18:27 -04:00			`non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`'snews', 'sip',`
			`]`
Added characteristics of shttp, https, and snews. 1997-01-02 14:18:27 -04:00			`uses_params = ['ftp', 'hdl', 'prospero', 'http',`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`'https', 'shttp', 'rtsp', 'rtspu', 'sip',`
			`'']`
Added characteristics of shttp, https, and snews. 1997-01-02 14:18:27 -04:00			`uses_query = ['http', 'wais',`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`'https', 'shttp',`
			`'gopher', 'rtsp', 'rtspu', 'sip',`
			`'']`
added hdl protocol properties 1996-05-28 20:10:02 -03:00			`uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`'https', 'shttp', 'snews',`
			`'file', 'prospero', '']`
New tty/pty modules by Steen; new urlparser. 1994-09-12 07:36:35 -03:00
			`# Characters valid in scheme names`
Be explicit about scheme_chars -- string.letters is locale dependent so we can't use it. While I'm at it, got rid of string module use. (Found several new hard special cases for a hypothetical conversion tool: from string import join, find, rfind; and a local assignment "find=string.find".) 2000-12-19 12:48:13 -04:00			`scheme_chars = ('abcdefghijklmnopqrstuvwxyz'`
			`'ABCDEFGHIJKLMNOPQRSTUVWXYZ'`
			`'0123456789'`
			`'+-.')`
New tty/pty modules by Steen; new urlparser. 1994-09-12 07:36:35 -03:00
After some discussion with Jeremy and Fred, decided to limit the default urlparse cache size to 20 instead of 2000. The main use of the cache seems to be to gain some speed in Grail, which is calling urljoin with the same base for each anchor. 2000 is a bit too big for Jeremy, who doesn't need the cache at all. 20 should keep at least 95% of the Grail speedup while wasting an insignificant amount of memory in Jeremy's application. 1997-07-14 16:08:15 -03:00			`MAX_CACHE_SIZE = 20`
optimizations due to Fred Drake; added urldefrag() function 1996-05-28 20:54:24 -03:00			`_parse_cache = {}`

			`def clear_cache():`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`"""Clear the parse cache."""`
			`global _parse_cache`
			`_parse_cache = {}`
optimizations due to Fred Drake; added urldefrag() function 1996-05-28 20:54:24 -03:00

fix typo in keyword argument 'allow_frament' should be 'allow_fragment' 1998-08-25 16:45:24 -03:00			`def urlparse(url, scheme = '', allow_fragments = 1):`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`"""Parse a URL into 6 components:`
			`<scheme>://<netloc>/<path>;<params>?<query>#<fragment>`
			`Return a 6-tuple: (scheme, netloc, path, params, query, fragment).`
			`Note that we don't break the components up in smaller bits`
			`(e.g. netloc is a single string) and we don't expand % escapes."""`
			`key = url, scheme, allow_fragments`
			`cached = _parse_cache.get(key, None)`
			`if cached:`
			`return cached`
			`if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth`
			`clear_cache()`
			`netloc = path = params = query = fragment = ''`
			`i = url.find(':')`
			`if i > 0:`
			`if url[:i] == 'http': # optimize the common case`
			`scheme = url[:i].lower()`
			`url = url[i+1:]`
			`if url[:2] == '//':`
			`i = url.find('/', 2)`
			`if i < 0:`
			`i = len(url)`
			`netloc = url[2:i]`
			`url = url[i:]`
			`if allow_fragments:`
			`i = url.rfind('#')`
			`if i >= 0:`
			`fragment = url[i+1:]`
			`url = url[:i]`
			`i = url.find('?')`
			`if i >= 0:`
			`query = url[i+1:]`
			`url = url[:i]`
			`i = url.find(';')`
			`if i >= 0:`
			`params = url[i+1:]`
			`url = url[:i]`
			`tuple = scheme, netloc, url, params, query, fragment`
			`_parse_cache[key] = tuple`
			`return tuple`
			`for c in url[:i]:`
			`if c not in scheme_chars:`
			`break`
			`else:`
			`scheme, url = url[:i].lower(), url[i+1:]`
			`if scheme in uses_netloc:`
			`if url[:2] == '//':`
			`i = url.find('/', 2)`
			`if i < 0:`
			`i = len(url)`
			`netloc, url = url[2:i], url[i:]`
			`if allow_fragments and scheme in uses_fragment:`
			`i = url.rfind('#')`
			`if i >= 0:`
			`url, fragment = url[:i], url[i+1:]`
			`if scheme in uses_query:`
			`i = url.find('?')`
			`if i >= 0:`
			`url, query = url[:i], url[i+1:]`
			`if scheme in uses_params:`
			`i = url.find(';')`
			`if i >= 0:`
			`url, params = url[:i], url[i+1:]`
			`tuple = scheme, netloc, url, params, query, fragment`
			`_parse_cache[key] = tuple`
			`return tuple`
New tty/pty modules by Steen; new urlparser. 1994-09-12 07:36:35 -03:00
			`def urlunparse((scheme, netloc, url, params, query, fragment)):`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`"""Put a parsed URL back together again. This may result in a`
			`slightly different, but equivalent URL, if the URL that was parsed`
			`originally had redundant delimiters, e.g. a ? with an empty query`
			`(the draft states that these are equivalent)."""`
			`if netloc or (scheme in uses_netloc and url[:2] == '//'):`
			`if url and url[:1] != '/': url = '/' + url`
			`url = '//' + (netloc or '') + url`
			`if scheme:`
			`url = scheme + ':' + url`
			`if params:`
			`url = url + ';' + params`
			`if query:`
			`url = url + '?' + query`
			`if fragment:`
			`url = url + '#' + fragment`
			`return url`
New tty/pty modules by Steen; new urlparser. 1994-09-12 07:36:35 -03:00
fix typo in keyword argument 'allow_frament' should be 'allow_fragment' 1998-08-25 16:45:24 -03:00			`def urljoin(base, url, allow_fragments = 1):`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`"""Join a base URL and a possibly relative URL to form an absolute`
			`interpretation of the latter."""`
			`if not base:`
			`return url`
			`if not url:`
			`return base`
			`bscheme, bnetloc, bpath, bparams, bquery, bfragment = \`
			`urlparse(base, '', allow_fragments)`
			`scheme, netloc, path, params, query, fragment = \`
			`urlparse(url, bscheme, allow_fragments)`
			`if scheme != bscheme or scheme not in uses_relative:`
			`return url`
			`if scheme in uses_netloc:`
			`if netloc:`
			`return urlunparse((scheme, netloc, path,`
			`params, query, fragment))`
			`netloc = bnetloc`
			`if path[:1] == '/':`
			`return urlunparse((scheme, netloc, path,`
			`params, query, fragment))`
			`if not path:`
			`if not params:`
			`params = bparams`
			`if not query:`
			`query = bquery`
			`return urlunparse((scheme, netloc, bpath,`
			`params, query, fragment))`
			`segments = bpath.split('/')[:-1] + path.split('/')`
			`# XXX The stuff below is bogus in various ways...`
			`if segments[-1] == '.':`
			`segments[-1] = ''`
			`while '.' in segments:`
			`segments.remove('.')`
			`while 1:`
			`i = 1`
			`n = len(segments) - 1`
			`while i < n:`
			`if (segments[i] == '..'`
			`and segments[i-1] not in ('', '..')):`
			`del segments[i-1:i+1]`
			`break`
			`i = i+1`
			`else:`
			`break`
			`if segments == ['', '..']:`
			`segments[-1] = ''`
			`elif len(segments) >= 2 and segments[-1] == '..':`
			`segments[-2:] = ['']`
			`return urlunparse((scheme, netloc, '/'.join(segments),`
			`params, query, fragment))`
New tty/pty modules by Steen; new urlparser. 1994-09-12 07:36:35 -03:00
optimizations due to Fred Drake; added urldefrag() function 1996-05-28 20:54:24 -03:00			`def urldefrag(url):`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`"""Removes any existing fragment from URL.`
optimizations due to Fred Drake; added urldefrag() function 1996-05-28 20:54:24 -03:00
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`Returns a tuple of the defragmented URL and the fragment. If`
			`the URL contained no fragments, the second element is the`
			`empty string.`
			`"""`
			`s, n, p, a, q, frag = urlparse(url)`
			`defrag = urlunparse((s, n, p, a, q, ''))`
			`return defrag, frag`
optimizations due to Fred Drake; added urldefrag() function 1996-05-28 20:54:24 -03:00

New tty/pty modules by Steen; new urlparser. 1994-09-12 07:36:35 -03:00			`test_input = """`
			`http://a/b/c/d`

			`g:h = <URL:g:h>`
			`http:g = <URL:http://a/b/c/g>`
			`http: = <URL:http://a/b/c/d>`
			`g = <URL:http://a/b/c/g>`
			`./g = <URL:http://a/b/c/g>`
			`g/ = <URL:http://a/b/c/g/>`
			`/g = <URL:http://a/g>`
			`//g = <URL:http://g>`
			`?y = <URL:http://a/b/c/d?y>`
			`g?y = <URL:http://a/b/c/g?y>`
			`g?y/./x = <URL:http://a/b/c/g?y/./x>`
			`. = <URL:http://a/b/c/>`
			`./ = <URL:http://a/b/c/>`
			`.. = <URL:http://a/b/>`
			`../ = <URL:http://a/b/>`
			`../g = <URL:http://a/b/g>`
			`../.. = <URL:http://a/>`
			`../../g = <URL:http://a/g>`
			`../../../g = <URL:http://a/../g>`
			`./../g = <URL:http://a/b/g>`
			`./g/. = <URL:http://a/b/c/g/>`
			`/./g = <URL:http://a/./g>`
			`g/./h = <URL:http://a/b/c/g/h>`
			`g/../h = <URL:http://a/b/c/h>`
			`http:g = <URL:http://a/b/c/g>`
			`http: = <URL:http://a/b/c/d>`
Fixed bug in the common-case code for HTTP URLs; it would lose the query, fragment, and/or parameter information. 3 cases added to the test suite to check for this bug. 1999-01-06 18:13:09 -04:00			`http:?y = <URL:http://a/b/c/d?y>`
			`http:g?y = <URL:http://a/b/c/g?y>`
			`http:g?y/./x = <URL:http://a/b/c/g?y/./x>`
New tty/pty modules by Steen; new urlparser. 1994-09-12 07:36:35 -03:00			`"""`
Add XXX comment about a test that doesn't seem right -- no time to explore this now. 1998-12-21 14:24:09 -04:00			`# XXX The result for //g is actually http://g/; is this a problem?`
New tty/pty modules by Steen; new urlparser. 1994-09-12 07:36:35 -03:00
			`def test():`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`import sys`
			`base = ''`
			`if sys.argv[1:]:`
			`fn = sys.argv[1]`
			`if fn == '-':`
			`fp = sys.stdin`
			`else:`
			`fp = open(fn)`
			`else:`
			`import StringIO`
			`fp = StringIO.StringIO(test_input)`
			`while 1:`
			`line = fp.readline()`
			`if not line: break`
			`words = line.split()`
			`if not words:`
			`continue`
			`url = words[0]`
			`parts = urlparse(url)`
			`print '%-10s : %s' % (url, parts)`
			`abs = urljoin(base, url)`
			`if not base:`
			`base = abs`
			`wrapped = '<URL:%s>' % abs`
			`print '%-10s = %s' % (url, wrapped)`
			`if len(words) == 3 and words[1] == '=':`
			`if wrapped != words[2]:`
			`print 'EXPECTED', words[2], '!!!!!!!!!!'`
New tty/pty modules by Steen; new urlparser. 1994-09-12 07:36:35 -03:00
			`if __name__ == '__main__':`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`test()`