Patch #1462790: fix urllib2 ProxyHandler for host:port proxies

2006-04-02 20:45:34 +00:00 · 2006-04-02 20:45:34 +00:00 · 720096a6bf
parent 4eb521e595
commit 720096a6bf
2 changed files with 110 additions and 22 deletions
--- a/Lib/test/test_urllib2.py
+++ b/Lib/test/test_urllib2.py
@ -13,8 +13,7 @@ from urllib2 import Request, OpenerDirector
 # parse_keqv_list, parse_http_list (I'm leaving this for Anthony Baxter
 #  and Greg Stein, since they're doing Digest Authentication)
 # Authentication stuff (ditto)
-# ProxyHandler, CustomProxy, CustomProxyHandler (I don't use a proxy)
+# CustomProxy, CustomProxyHandler
 # GopherHandler (haven't used gopher for a decade or so...)
 class TrivialTests(unittest.TestCase):
    def test_trivial(self):
@ -90,6 +89,7 @@ class FakeMethod:
        return self.handle(self.meth_name, self.action, *args)
 class MockHandler:
    handler_order = 500
    def __init__(self, methods):
        self._define_methods(methods)
    def _define_methods(self, methods):
@ -154,7 +154,7 @@ def add_ordered_mock_handlers(opener, meth_spec):
    for meths in meth_spec:
        class MockHandlerSubclass(MockHandler): pass
        h = MockHandlerSubclass(meths)
-        h.handler_order = count
+        h.handler_order += count
        h.add_parent(opener)
        count = count + 1
        handlers.append(h)
@ -642,6 +642,23 @@ class HandlerTests(unittest.TestCase):
        o.open("http://www.example.com/")
        self.assert_(not hh.req.has_header("Cookie"))
    def test_proxy(self):
        # A ProxyHandler configured with a bare "host:port" proxy should
        # redirect the request to the proxy and then leave the actual
        # opening to the http_open handler registered on the opener.
        opener = OpenerDirector()
        opener.add_handler(
            urllib2.ProxyHandler({"http": "proxy.example.com:3128"}))
        mock_handlers = add_ordered_mock_handlers(
            opener, [[("http_open", "return response")]])

        request = Request("http://acme.example.com/")
        # Before opening, the request still names the origin server.
        self.assertEqual(request.get_host(), "acme.example.com")
        opener.open(request)
        # After opening, the request's host is the proxy's host:port.
        self.assertEqual(request.get_host(), "proxy.example.com:3128")

        # Only the first two items of each recorded call are compared:
        # the handler object and the name of the method that was invoked.
        observed = [call[0:2] for call in opener.calls]
        self.assertEqual([(mock_handlers[0], "http_open")], observed)
 class MiscTests(unittest.TestCase):
@ -827,6 +844,7 @@ class NetworkTests(unittest.TestCase):
 def test_main(verbose=None):
    test_support.run_doctest(urllib2, verbose)
    tests = (TrivialTests,
             OpenerDirectorTests,
             HandlerTests,
--- a/Lib/urllib2.py
+++ b/Lib/urllib2.py
@ -119,7 +119,8 @@ from urllib import (unwrap, unquote, splittype, splithost, quote,
 # support for FileHandler, proxies via environment variables
 from urllib import localhost, url2pathname, getproxies
-__version__ = "2.5"
+# used in User-Agent header sent
 __version__ = sys.version[:3]
 _opener = None
 def urlopen(url, data=None):
@ -563,6 +564,80 @@ class HTTPRedirectHandler(BaseHandler):
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
 def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.
    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:
    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
    The first three items of the returned tuple may be None.
    Examples of authority parsing:
    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')
    The authority component may optionally include userinfo (assumed to be
    username:password):
    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')
    Same examples, but with URLs instead:
    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')
    Everything after the authority is ignored:
    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')
    Test for no trailing '/' case:
    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')
    """
    from urlparse import _splitnetloc
    scheme, r_scheme = splittype(proxy)
    if not r_scheme.startswith("/"):
        # authority
        scheme = None
        authority = proxy
    else:
        # URL
        if not r_scheme.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
        # and 3.3.), path is empty or starts with '/'
        end = r_scheme.find("/", 2)
        if end == -1:
            end = None
        authority = r_scheme[2:end]
    userinfo, hostport = splituser(authority)
    if userinfo is not None:
        user, password = splitpasswd(userinfo)
    else:
        user = password = None
    return scheme, user, password, hostport
 class ProxyHandler(BaseHandler):
    # Proxies must be in front
    handler_order = 100
@ -579,30 +654,25 @@ class ProxyHandler(BaseHandler):
    def proxy_open(self, req, proxy, type):
        orig_type = req.get_type()
-        type, r_type = splittype(proxy)
+        proxy_type, user, password, hostport = _parse_proxy(proxy)
-        if not type or r_type.isdigit():
+        if proxy_type is None:
-            # proxy is specified without protocol
+            proxy_type = orig_type
            type = orig_type
            host = proxy
        else:
            host, r_host = splithost(r_type)
        user_pass, host = splituser(host)
        user, password = splitpasswd(user_pass)
        if user and password:
-            user, password = user_pass.split(':', 1)
+            user_pass = '%s:%s' % (unquote(user), unquote(password))
-            user_pass = base64.encodestring('%s:%s' % (unquote(user),
+            creds = base64.encodestring(user_pass).strip()
-                                            unquote(password))).strip()
+            req.add_header('Proxy-authorization', 'Basic ' + creds)
-            req.add_header('Proxy-authorization', 'Basic ' + user_pass)
+        hostport = unquote(hostport)
-        host = unquote(host)
+        req.set_proxy(hostport, proxy_type)
-        req.set_proxy(host, type)
+        if orig_type == proxy_type:
        if orig_type == type:
            # let other handlers take care of it
            # XXX this only makes sense if the proxy is before the
            # other handlers
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req)
 # feature suggested by Duncan Booth