cpython/Lib/urlparse.py

"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""

# Names exported by "from urlparse import *".
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit"]

# A classification of schemes ('' means apply by default)
#
# Each list names the schemes for which one piece of URL syntax is
# recognized; the empty string is the entry used when no scheme is known.
# Note that urlsplit() special-cases URLs starting with the literal
# 'http:' and does not consult uses_netloc, uses_query or uses_fragment
# for them.

# Schemes whose references urljoin() resolves against a base URL; for any
# other scheme urljoin() returns the reference unchanged.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                               'wais', 'file', 'https', 'shttp', 'mms',
                               'prospero', 'rtsp', 'rtspu', '']
# Schemes for which urlsplit() splits off a leading '//netloc', for which
# urlunsplit() emits '//' even when netloc is empty, and for which
# urljoin() inherits the base's netloc.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
                             'imap', 'wais', 'file', 'mms', 'https', 'shttp',
                             'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '']
# Not referenced by any function visible in this module.
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                                  'telnet', 'wais', 'imap', 'snews', 'sip']
# Schemes for which urlparse() splits ';params' off the path.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
                             'https', 'shttp', 'rtsp', 'rtspu', 'sip',
                             'mms', '']
# Schemes for which urlsplit() splits off '?query'.
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
                            'gopher', 'rtsp', 'rtspu', 'sip', '']
# Schemes for which urlsplit() splits off '#fragment' (when fragments
# are allowed by the caller).
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                               'nntp', 'wais', 'https', 'shttp', 'snews',
                               'file', 'prospero', '']

# Characters valid in scheme names.  urlsplit() only treats the non-empty
# text before the first ':' as a scheme when every character of it is in
# this set.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# Upper bound on the number of memoized urlsplit() results: once the cache
# holds this many entries, urlsplit() empties it completely before storing
# the next one.
MAX_CACHE_SIZE = 20
# Maps (url, scheme, allow_fragments) to the 5-tuple returned by urlsplit().
_parse_cache = {}

def clear_cache():
    """Forget every memoized parse result.

    The module-level cache name is rebound to a brand-new empty dict; the
    previous dict object itself is left unmodified."""
    global _parse_cache
    _parse_cache = dict()


def urlparse(url, scheme='', allow_fragments=1):
    """Split a URL into its 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    The result is the 6-tuple (scheme, netloc, path, params, query,
    fragment).  The heavy lifting is done by urlsplit(); this function
    only adds the ';params' split, and only for schemes listed in
    uses_params.  Components are not subdivided any further (netloc stays
    a single string) and % escapes are left as they are."""
    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)
    params = ''
    if ';' in path and scheme in uses_params:
        path, params = _splitparams(path)
    return scheme, netloc, path, params, query, fragment

def _splitparams(url):
    if '/'  in url:
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]

def _splitnetloc(url, start=0):
    for c in '/?#': # the order is important!
        delim = url.find(c, start)
        if delim >= 0:
            break
    else:
        delim = len(url)
    return url[start:delim], url[delim:]

def urlsplit(url, scheme='', allow_fragments=1):
    """Split a URL into its 5 top-level components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    The result is the 5-tuple (scheme, netloc, path, query, fragment).
    The *scheme* argument is used when the URL itself does not start
    with one; when *allow_fragments* is false no '#fragment' is split
    off.  Components are not subdivided any further (netloc stays a
    single string) and % escapes are left as they are.  Results are
    memoized in the module-level parse cache."""
    cache_key = (url, scheme, allow_fragments)
    hit = _parse_cache.get(cache_key)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:
        # Crude eviction: throw everything away to avoid runaway growth.
        clear_cache()

    netloc = ''
    query = ''
    fragment = ''
    # URLs that start with the literal 'http:' are the common case; for
    # them the scheme classification lists are not consulted at all.
    is_http = False
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        is_http = (prefix == 'http')
        # Otherwise the prefix is a scheme only if all its characters
        # are legal scheme characters.
        if is_http or not [ch for ch in prefix if ch not in scheme_chars]:
            scheme, url = prefix.lower(), url[colon+1:]

    if url[:2] == '//' and (is_http or scheme in uses_netloc):
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and '#' in url and (is_http or scheme in uses_fragment):
        url, fragment = url.split('#', 1)
    if '?' in url and (is_http or scheme in uses_query):
        url, query = url.split('?', 1)

    result = (scheme, netloc, url, query, fragment)
    # Look the cache up by name again: clear_cache() may have rebound it.
    _parse_cache[cache_key] = result
    return result

def urlunparse((scheme, netloc, url, params, query, fragment)):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))

def urlunsplit((scheme, netloc, url, query, fragment)):
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url

def urljoin(base, url, allow_fragments = 1):
    """Resolve *url* relative to *base* and return the combined URL.

    An empty *base* yields *url* and an empty *url* yields *base*.  A
    reference whose scheme differs from the base's, or whose scheme is
    not listed in uses_relative, is returned untouched."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)
    if not (scheme == bscheme and scheme in uses_relative):
        return url

    # A reference with its own netloc, or with an absolute path, replaces
    # the base path entirely; otherwise the base's netloc is inherited.
    replaces_path = False
    if scheme in uses_netloc:
        if netloc:
            replaces_path = True
        else:
            netloc = bnetloc
    if replaces_path or path.startswith('/'):
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))

    # Nothing but (possibly) a fragment: keep the base's path, params
    # and query and attach the reference's fragment.
    if not (path or params or query):
        return urlunparse((scheme, netloc, bpath,
                           bparams, bquery, fragment))

    # Replace the last segment of the base path with the reference path.
    segments = bpath.split('/')
    segments.pop()
    segments.extend(path.split('/'))
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' means "this directory": keep the trailing slash.
    if segments[-1] == '.':
        segments[-1] = ''
    segments = [seg for seg in segments if seg != '.']
    # Repeatedly cancel the first interior '<name>/..' pair, where <name>
    # is neither empty nor itself '..', until none is left.
    collapsed = True
    while collapsed:
        collapsed = False
        for idx in range(1, len(segments) - 1):
            if segments[idx] == '..' and segments[idx-1] not in ('', '..'):
                del segments[idx-1:idx+1]
                collapsed = True
                break
    # A '..' left in final position is handled separately.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))

def urldefrag(url):
    """Strip the fragment, if any, from a URL.

    Returns the pair (url_without_fragment, fragment).  A URL that
    contains no '#' is returned exactly as given, paired with the empty
    string; otherwise the URL is parsed and reassembled without its
    fragment.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment


# Built-in input for test().  Blank lines are skipped.  The first word of
# the first non-blank line becomes the base URL; on every line the first
# word is parsed and joined against the base, and a line of the form
# "<reference> = <URL:expected>" also has its joined result checked
# against the expected text.
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""

def test():
    import sys
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        fp = StringIO(test_input)
    while 1:
        line = fp.readline()
        if not line: break
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print '%-10s : %s' % (url, parts)
        abs = urljoin(base, url)
        if not base:
            base = abs
        wrapped = '<URL:%s>' % abs
        print '%-10s = %s' % (url, wrapped)
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print 'EXPECTED', words[2], '!!!!!!!!!!'

# Run the self-test when this module is executed as a script.
if __name__ == '__main__':
    test()
The third and final doc-string sweep by Ka-Ping Yee. The attached patches update the standard library so that all modules have docstrings beginning with one-line summaries. A new docstring was added to formatter. The docstring for os.py was updated to mention nt, os2, ce in addition to posix, dos, mac. 2000-02-04 11:28:42 -04:00			`"""Parse (absolute and relative) URLs.`

			`See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,`
			`UC Irvine, June 1995.`
			`"""`
New tty/pty modules by Steen; new urlparser. 1994-09-12 07:36:35 -03:00
Added missing entries to __all__. 2002-10-16 18:21:39 -03:00			`__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",`
			`"urlsplit", "urlunsplit"]`
final round of __all__ lists (I hope) - skipped urllib2 because Moshe may be giving it a slight facelift 2001-03-01 00:27:19 -04:00
New tty/pty modules by Steen; new urlparser. 1994-09-12 07:36:35 -03:00			`# A classification of schemes ('' means apply by default)`
Revert last change. 2004-05-07 02:50:35 -03:00			`uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',`
* add mms (windows media) as another scheme * reformat schemes to 80 columns 2003-01-06 16:27:03 -04:00			`'wais', 'file', 'https', 'shttp', 'mms',`
Revert last change. 2004-05-07 02:50:35 -03:00			`'prospero', 'rtsp', 'rtspu', '']`
			`uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',`
* add mms (windows media) as another scheme * reformat schemes to 80 columns 2003-01-06 16:27:03 -04:00			`'imap', 'wais', 'file', 'mms', 'https', 'shttp',`
rsync is now a recognized protocol that uses "netloc" (i.e. specifies a network location) in its addressing. Closes bug #981299. 2004-06-29 01:02:40 -03:00			`'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '']`
Revert last change. 2004-05-07 02:50:35 -03:00			`non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',`
			`'telnet', 'wais', 'imap', 'snews', 'sip']`
			`uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',`
* add mms (windows media) as another scheme * reformat schemes to 80 columns 2003-01-06 16:27:03 -04:00			`'https', 'shttp', 'rtsp', 'rtspu', 'sip',`
Revert last change. 2004-05-07 02:50:35 -03:00			`'mms', '']`
			`uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',`
			`'gopher', 'rtsp', 'rtspu', 'sip', '']`
			`uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',`
* add mms (windows media) as another scheme * reformat schemes to 80 columns 2003-01-06 16:27:03 -04:00			`'nntp', 'wais', 'https', 'shttp', 'snews',`
Revert last change. 2004-05-07 02:50:35 -03:00			`'file', 'prospero', '']`
New tty/pty modules by Steen; new urlparser. 1994-09-12 07:36:35 -03:00
			`# Characters valid in scheme names`
Be explicit about scheme_chars -- string.letters is locale dependent so we can't use it. While I'm at it, got rid of string module use. (Found several new hard special cases for a hypothetical conversion tool: from string import join, find, rfind; and a local assignment "find=string.find".) 2000-12-19 12:48:13 -04:00			`scheme_chars = ('abcdefghijklmnopqrstuvwxyz'`
			`'ABCDEFGHIJKLMNOPQRSTUVWXYZ'`
			`'0123456789'`
			`'+-.')`
New tty/pty modules by Steen; new urlparser. 1994-09-12 07:36:35 -03:00
After some discussion with Jeremy and Fred, decided to limit the default urlparse cache size to 20 instead of 2000. The main use of the cache seems to be to gain some speed in Grail, which is calling urljoin with the same base for each anchor. 2000 is a bit too big for Jeremy, who doesn't need the cache at all. 20 should keep at least 95% of the Grail speedup while wasting an insignificant amount of memory in Jeremy's application. 1997-07-14 16:08:15 -03:00			`MAX_CACHE_SIZE = 20`
optimizations due to Fred Drake; added urldefrag() function 1996-05-28 20:54:24 -03:00			`_parse_cache = {}`

			`def clear_cache():`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`"""Clear the parse cache."""`
			`global _parse_cache`
			`_parse_cache = {}`
optimizations due to Fred Drake; added urldefrag() function 1996-05-28 20:54:24 -03:00

Fix parsing of parameters from a URL; urlparse() did not check that it only split parameters from the last path segment. Introduces two new functions, urlsplit() and urlunsplit(), that do the simpler job of splitting the URL without monkeying around with the parameters field, since that was not being handled properly. This closes bug #478038. 2001-11-15 22:52:57 -04:00			`def urlparse(url, scheme='', allow_fragments=1):`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`"""Parse a URL into 6 components:`
			`<scheme>://<netloc>/<path>;<params>?<query>#<fragment>`
			`Return a 6-tuple: (scheme, netloc, path, params, query, fragment).`
			`Note that we don't break the components up in smaller bits`
			`(e.g. netloc is a single string) and we don't expand % escapes."""`
Fix parsing of parameters from a URL; urlparse() did not check that it only split parameters from the last path segment. Introduces two new functions, urlsplit() and urlunsplit(), that do the simpler job of splitting the URL without monkeying around with the parameters field, since that was not being handled properly. This closes bug #478038. 2001-11-15 22:52:57 -04:00			`tuple = urlsplit(url, scheme, allow_fragments)`
			`scheme, netloc, url, query, fragment = tuple`
			`if scheme in uses_params and ';' in url:`
			`url, params = _splitparams(url)`
			`else:`
			`params = ''`
			`return scheme, netloc, url, params, query, fragment`

			`def _splitparams(url):`
			`if '/' in url:`
			`i = url.find(';', url.rfind('/'))`
			`if i < 0:`
			`return url, ''`
			`else:`
			`i = url.find(';')`
			`return url[:i], url[i+1:]`

Patch #712317: In URLs such as http://www.example.com?query=spam, treat '?' as a delimiter. Previously, the 'network location' (<authority> in RFC 2396) would become 'www.example.com?query=spam', while RFC 2396 does not allow a '?' in <authority>. See bug #548176 for further discussion. 2005-01-09 11:29:10 -04:00			`def _splitnetloc(url, start=0):`
			`for c in '/?#': # the order is important!`
			`delim = url.find(c, start)`
			`if delim >= 0:`
			`break`
			`else:`
			`delim = len(url)`
			`return url[start:delim], url[delim:]`

Fix parsing of parameters from a URL; urlparse() did not check that it only split parameters from the last path segment. Introduces two new functions, urlsplit() and urlunsplit(), that do the simpler job of splitting the URL without monkeying around with the parameters field, since that was not being handled properly. This closes bug #478038. 2001-11-15 22:52:57 -04:00			`def urlsplit(url, scheme='', allow_fragments=1):`
			`"""Parse a URL into 5 components:`
			`<scheme>://<netloc>/<path>?<query>#<fragment>`
			`Return a 5-tuple: (scheme, netloc, path, query, fragment).`
			`Note that we don't break the components up in smaller bits`
			`(e.g. netloc is a single string) and we don't expand % escapes."""`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`key = url, scheme, allow_fragments`
			`cached = _parse_cache.get(key, None)`
			`if cached:`
			`return cached`
			`if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth`
			`clear_cache()`
Fix parsing of parameters from a URL; urlparse() did not check that it only split parameters from the last path segment. Introduces two new functions, urlsplit() and urlunsplit(), that do the simpler job of splitting the URL without monkeying around with the parameters field, since that was not being handled properly. This closes bug #478038. 2001-11-15 22:52:57 -04:00			`netloc = query = fragment = ''`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`i = url.find(':')`
			`if i > 0:`
			`if url[:i] == 'http': # optimize the common case`
			`scheme = url[:i].lower()`
			`url = url[i+1:]`
			`if url[:2] == '//':`
Patch #712317: In URLs such as http://www.example.com?query=spam, treat '?' as a delimiter. Previously, the 'network location' (<authority> in RFC 2396) would become 'www.example.com?query=spam', while RFC 2396 does not allow a '?' in <authority>. See bug #548176 for further discussion. 2005-01-09 11:29:10 -04:00			`netloc, url = _splitnetloc(url, 2)`
Fix parsing of parameters from a URL; urlparse() did not check that it only split parameters from the last path segment. Introduces two new functions, urlsplit() and urlunsplit(), that do the simpler job of splitting the URL without monkeying around with the parameters field, since that was not being handled properly. This closes bug #478038. 2001-11-15 22:52:57 -04:00			`if allow_fragments and '#' in url:`
			`url, fragment = url.split('#', 1)`
			`if '?' in url:`
			`url, query = url.split('?', 1)`
			`tuple = scheme, netloc, url, query, fragment`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`_parse_cache[key] = tuple`
			`return tuple`
			`for c in url[:i]:`
			`if c not in scheme_chars:`
			`break`
			`else:`
			`scheme, url = url[:i].lower(), url[i+1:]`
Patch #712317: In URLs such as http://www.example.com?query=spam, treat '?' as a delimiter. Previously, the 'network location' (<authority> in RFC 2396) would become 'www.example.com?query=spam', while RFC 2396 does not allow a '?' in <authority>. See bug #548176 for further discussion. 2005-01-09 11:29:10 -04:00			`if scheme in uses_netloc and url[:2] == '//':`
			`netloc, url = _splitnetloc(url, 2)`
Fix parsing of parameters from a URL; urlparse() did not check that it only split parameters from the last path segment. Introduces two new functions, urlsplit() and urlunsplit(), that do the simpler job of splitting the URL without monkeying around with the parameters field, since that was not being handled properly. This closes bug #478038. 2001-11-15 22:52:57 -04:00			`if allow_fragments and scheme in uses_fragment and '#' in url:`
			`url, fragment = url.split('#', 1)`
			`if scheme in uses_query and '?' in url:`
			`url, query = url.split('?', 1)`
			`tuple = scheme, netloc, url, query, fragment`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`_parse_cache[key] = tuple`
			`return tuple`
New tty/pty modules by Steen; new urlparser. 1994-09-12 07:36:35 -03:00
			`def urlunparse((scheme, netloc, url, params, query, fragment)):`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`"""Put a parsed URL back together again. This may result in a`
			`slightly different, but equivalent URL, if the URL that was parsed`
			`originally had redundant delimiters, e.g. a ? with an empty query`
			`(the draft states that these are equivalent)."""`
Fix parsing of parameters from a URL; urlparse() did not check that it only split parameters from the last path segment. Introduces two new functions, urlsplit() and urlunsplit(), that do the simpler job of splitting the URL without monkeying around with the parameters field, since that was not being handled properly. This closes bug #478038. 2001-11-15 22:52:57 -04:00			`if params:`
			`url = "%s;%s" % (url, params)`
			`return urlunsplit((scheme, netloc, url, query, fragment))`

			`def urlunsplit((scheme, netloc, url, query, fragment)):`
Fix for 1.33: urlsplit() should only add '//' if scheme != ''. Will add test and backport. 2002-10-14 16:59:54 -03:00			`if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`if url and url[:1] != '/': url = '/' + url`
			`url = '//' + (netloc or '') + url`
			`if scheme:`
			`url = scheme + ':' + url`
			`if query:`
			`url = url + '?' + query`
			`if fragment:`
			`url = url + '#' + fragment`
			`return url`
New tty/pty modules by Steen; new urlparser. 1994-09-12 07:36:35 -03:00
fix typo in keyword argument 'allow_frament' should be 'allow_fragment' 1998-08-25 16:45:24 -03:00			`def urljoin(base, url, allow_fragments = 1):`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`"""Join a base URL and a possibly relative URL to form an absolute`
			`interpretation of the latter."""`
			`if not base:`
			`return url`
			`if not url:`
			`return base`
			`bscheme, bnetloc, bpath, bparams, bquery, bfragment = \`
			`urlparse(base, '', allow_fragments)`
			`scheme, netloc, path, params, query, fragment = \`
			`urlparse(url, bscheme, allow_fragments)`
			`if scheme != bscheme or scheme not in uses_relative:`
			`return url`
			`if scheme in uses_netloc:`
			`if netloc:`
			`return urlunparse((scheme, netloc, path,`
			`params, query, fragment))`
			`netloc = bnetloc`
			`if path[:1] == '/':`
			`return urlunparse((scheme, netloc, path,`
			`params, query, fragment))`
See rev. 1.42 for log message 2003-10-12 01:29:10 -03:00			`if not (path or params or query):`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`return urlunparse((scheme, netloc, bpath,`
See rev. 1.42 for log message 2003-10-12 01:29:10 -03:00			`bparams, bquery, fragment))`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`segments = bpath.split('/')[:-1] + path.split('/')`
			`# XXX The stuff below is bogus in various ways...`
			`if segments[-1] == '.':`
			`segments[-1] = ''`
			`while '.' in segments:`
			`segments.remove('.')`
			`while 1:`
			`i = 1`
			`n = len(segments) - 1`
			`while i < n:`
			`if (segments[i] == '..'`
			`and segments[i-1] not in ('', '..')):`
			`del segments[i-1:i+1]`
			`break`
			`i = i+1`
			`else:`
			`break`
			`if segments == ['', '..']:`
			`segments[-1] = ''`
			`elif len(segments) >= 2 and segments[-1] == '..':`
			`segments[-2:] = ['']`
			`return urlunparse((scheme, netloc, '/'.join(segments),`
			`params, query, fragment))`
New tty/pty modules by Steen; new urlparser. 1994-09-12 07:36:35 -03:00
optimizations due to Fred Drake; added urldefrag() function 1996-05-28 20:54:24 -03:00			`def urldefrag(url):`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`"""Removes any existing fragment from URL.`
optimizations due to Fred Drake; added urldefrag() function 1996-05-28 20:54:24 -03:00
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`Returns a tuple of the defragmented URL and the fragment. If`
			`the URL contained no fragments, the second element is the`
			`empty string.`
			`"""`
Fix parsing of parameters from a URL; urlparse() did not check that it only split parameters from the last path segment. Introduces two new functions, urlsplit() and urlunsplit(), that do the simpler job of splitting the URL without monkeying around with the parameters field, since that was not being handled properly. This closes bug #478038. 2001-11-15 22:52:57 -04:00			`if '#' in url:`
			`s, n, p, a, q, frag = urlparse(url)`
			`defrag = urlunparse((s, n, p, a, q, ''))`
			`return defrag, frag`
			`else:`
			`return url, ''`
optimizations due to Fred Drake; added urldefrag() function 1996-05-28 20:54:24 -03:00

New tty/pty modules by Steen; new urlparser. 1994-09-12 07:36:35 -03:00			`test_input = """`
			`http://a/b/c/d`

			`g:h = <URL:g:h>`
			`http:g = <URL:http://a/b/c/g>`
			`http: = <URL:http://a/b/c/d>`
			`g = <URL:http://a/b/c/g>`
			`./g = <URL:http://a/b/c/g>`
			`g/ = <URL:http://a/b/c/g/>`
			`/g = <URL:http://a/g>`
			`//g = <URL:http://g>`
			`?y = <URL:http://a/b/c/d?y>`
			`g?y = <URL:http://a/b/c/g?y>`
			`g?y/./x = <URL:http://a/b/c/g?y/./x>`
			`. = <URL:http://a/b/c/>`
			`./ = <URL:http://a/b/c/>`
			`.. = <URL:http://a/b/>`
			`../ = <URL:http://a/b/>`
			`../g = <URL:http://a/b/g>`
			`../.. = <URL:http://a/>`
			`../../g = <URL:http://a/g>`
			`../../../g = <URL:http://a/../g>`
			`./../g = <URL:http://a/b/g>`
			`./g/. = <URL:http://a/b/c/g/>`
			`/./g = <URL:http://a/./g>`
			`g/./h = <URL:http://a/b/c/g/h>`
			`g/../h = <URL:http://a/b/c/h>`
			`http:g = <URL:http://a/b/c/g>`
			`http: = <URL:http://a/b/c/d>`
Fixed bug in the common-case code for HTTP URLs; it would lose the query, fragment, and/or parameter information. 3 cases added to the test suite to check for this bug. 1999-01-06 18:13:09 -04:00			`http:?y = <URL:http://a/b/c/d?y>`
			`http:g?y = <URL:http://a/b/c/g?y>`
			`http:g?y/./x = <URL:http://a/b/c/g?y/./x>`
New tty/pty modules by Steen; new urlparser. 1994-09-12 07:36:35 -03:00			`"""`

			`def test():`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`import sys`
			`base = ''`
			`if sys.argv[1:]:`
			`fn = sys.argv[1]`
			`if fn == '-':`
			`fp = sys.stdin`
			`else:`
			`fp = open(fn)`
			`else:`
Use cStringIO where available. 2004-12-31 15:15:26 -04:00			`try:`
			`from cStringIO import StringIO`
			`except ImportError:`
			`from StringIO import StringIO`
			`fp = StringIO(test_input)`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`while 1:`
			`line = fp.readline()`
			`if not line: break`
			`words = line.split()`
			`if not words:`
			`continue`
			`url = words[0]`
			`parts = urlparse(url)`
			`print '%-10s : %s' % (url, parts)`
			`abs = urljoin(base, url)`
			`if not base:`
			`base = abs`
			`wrapped = '<URL:%s>' % abs`
			`print '%-10s = %s' % (url, wrapped)`
			`if len(words) == 3 and words[1] == '=':`
			`if wrapped != words[2]:`
			`print 'EXPECTED', words[2], '!!!!!!!!!!'`
New tty/pty modules by Steen; new urlparser. 1994-09-12 07:36:35 -03:00
			`if __name__ == '__main__':`
Whitespace normalization. Top level of Lib now fixed-point for reindent.py! 2001-01-14 23:34:38 -04:00			`test()`