diff --git a/Lib/test/test_urllib2.py b/Lib/test/test_urllib2.py index 7e0bbf064b0..c79a733e1a2 100644 --- a/Lib/test/test_urllib2.py +++ b/Lib/test/test_urllib2.py @@ -13,8 +13,7 @@ from urllib2 import Request, OpenerDirector # parse_keqv_list, parse_http_list (I'm leaving this for Anthony Baxter # and Greg Stein, since they're doing Digest Authentication) # Authentication stuff (ditto) -# ProxyHandler, CustomProxy, CustomProxyHandler (I don't use a proxy) -# GopherHandler (haven't used gopher for a decade or so...) +# CustomProxy, CustomProxyHandler class TrivialTests(unittest.TestCase): def test_trivial(self): @@ -90,6 +89,7 @@ class FakeMethod: return self.handle(self.meth_name, self.action, *args) class MockHandler: + handler_order = 500 def __init__(self, methods): self._define_methods(methods) def _define_methods(self, methods): @@ -154,7 +154,7 @@ def add_ordered_mock_handlers(opener, meth_spec): for meths in meth_spec: class MockHandlerSubclass(MockHandler): pass h = MockHandlerSubclass(meths) - h.handler_order = count + h.handler_order += count h.add_parent(opener) count = count + 1 handlers.append(h) @@ -642,6 +642,23 @@ class HandlerTests(unittest.TestCase): o.open("http://www.example.com/") self.assert_(not hh.req.has_header("Cookie")) + def test_proxy(self): + o = OpenerDirector() + ph = urllib2.ProxyHandler(dict(http="proxy.example.com:3128")) + o.add_handler(ph) + meth_spec = [ + [("http_open", "return response")] + ] + handlers = add_ordered_mock_handlers(o, meth_spec) + + req = Request("http://acme.example.com/") + self.assertEqual(req.get_host(), "acme.example.com") + r = o.open(req) + self.assertEqual(req.get_host(), "proxy.example.com:3128") + + self.assertEqual([(handlers[0], "http_open")], + [tup[0:2] for tup in o.calls]) + class MiscTests(unittest.TestCase): @@ -827,6 +844,7 @@ class NetworkTests(unittest.TestCase): def test_main(verbose=None): + test_support.run_doctest(urllib2, verbose) tests = (TrivialTests, OpenerDirectorTests, HandlerTests, diff --git a/Lib/urllib2.py b/Lib/urllib2.py index 21f916ce568..91bcc2bb7af 100644 --- a/Lib/urllib2.py +++ b/Lib/urllib2.py @@ -119,7 +119,8 @@ from urllib import (unwrap, unquote, splittype, splithost, quote, # support for FileHandler, proxies via environment variables from urllib import localhost, url2pathname, getproxies -__version__ = "2.5" +# used in User-Agent header sent +__version__ = sys.version[:3] _opener = None def urlopen(url, data=None): @@ -563,6 +564,80 @@ class HTTPRedirectHandler(BaseHandler): "lead to an infinite loop.\n" \ "The last 30x error message was:\n" + +def _parse_proxy(proxy): + """Return (scheme, user, password, host/port) given a URL or an authority. + + If a URL is supplied, it must have an authority (host:port) component. + According to RFC 3986, having an authority component means the URL must + have two slashes after the scheme: + + >>> _parse_proxy('file:/ftp.example.com/') + Traceback (most recent call last): + ValueError: proxy URL with no authority: 'file:/ftp.example.com/' + + The first three items of the returned tuple may be None. + + Examples of authority parsing: + + >>> _parse_proxy('proxy.example.com') + (None, None, None, 'proxy.example.com') + >>> _parse_proxy('proxy.example.com:3128') + (None, None, None, 'proxy.example.com:3128') + + The authority component may optionally include userinfo (assumed to be + username:password): + + >>> _parse_proxy('joe:password@proxy.example.com') + (None, 'joe', 'password', 'proxy.example.com') + >>> _parse_proxy('joe:password@proxy.example.com:3128') + (None, 'joe', 'password', 'proxy.example.com:3128') + + Same examples, but with URLs instead: + + >>> _parse_proxy('http://proxy.example.com/') + ('http', None, None, 'proxy.example.com') + >>> _parse_proxy('http://proxy.example.com:3128/') + ('http', None, None, 'proxy.example.com:3128') + >>> _parse_proxy('http://joe:password@proxy.example.com/') + ('http', 'joe', 'password', 'proxy.example.com') + >>> _parse_proxy('http://joe:password@proxy.example.com:3128') + ('http', 'joe', 'password', 'proxy.example.com:3128') + + Everything after the authority is ignored: + + >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128') + ('ftp', 'joe', 'password', 'proxy.example.com') + + Test for no trailing '/' case: + + >>> _parse_proxy('http://joe:password@proxy.example.com') + ('http', 'joe', 'password', 'proxy.example.com') + + """ + from urlparse import _splitnetloc + scheme, r_scheme = splittype(proxy) + if not r_scheme.startswith("/"): + # authority + scheme = None + authority = proxy + else: + # URL + if not r_scheme.startswith("//"): + raise ValueError("proxy URL with no authority: %r" % proxy) + # We have an authority, so for RFC 3986-compliant URLs (by ss 3. + # and 3.3.), path is empty or starts with '/' + end = r_scheme.find("/", 2) + if end == -1: + end = None + authority = r_scheme[2:end] + userinfo, hostport = splituser(authority) + if userinfo is not None: + user, password = splitpasswd(userinfo) + else: + user = password = None + return scheme, user, password, hostport + class ProxyHandler(BaseHandler): # Proxies must be in front handler_order = 100 @@ -579,30 +654,25 @@ class ProxyHandler(BaseHandler): def proxy_open(self, req, proxy, type): orig_type = req.get_type() - type, r_type = splittype(proxy) - if not type or r_type.isdigit(): - # proxy is specified without protocol - type = orig_type - host = proxy - else: - host, r_host = splithost(r_type) - user_pass, host = splituser(host) - user, password = splitpasswd(user_pass) + proxy_type, user, password, hostport = _parse_proxy(proxy) + if proxy_type is None: + proxy_type = orig_type if user and password: - user, password = user_pass.split(':', 1) - user_pass = base64.encodestring('%s:%s' % (unquote(user), - unquote(password))).strip() - req.add_header('Proxy-authorization', 'Basic ' + user_pass) - host = unquote(host) - req.set_proxy(host, type) - if orig_type == type: + user_pass = '%s:%s' % (unquote(user), unquote(password)) + creds = base64.encodestring(user_pass).strip() + req.add_header('Proxy-authorization', 'Basic ' + creds) + hostport = unquote(hostport) + req.set_proxy(hostport, proxy_type) + if orig_type == proxy_type: # let other handlers take care of it - # XXX this only makes sense if the proxy is before the - # other handlers return None else: # need to start over, because the other handlers don't # grok the proxy's URL type + # e.g. if we have a constructor arg proxies like so: + # {'http': 'ftp://proxy.example.com'}, we may end up turning + # a request for http://acme.example.com/a into one for + # ftp://proxy.example.com/a return self.parent.open(req) # feature suggested by Duncan Booth