diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 245efb3133e..484acea0ba1 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -12,3 +12,21 @@ expected = 'abcdefghijklmnopqrstuvwxyz%df%e0%e1%e2%e3%e4%e5%e6%e7%e8%e9%ea%eb%ec test = urllib.quote(chars) assert test == expected, "urllib.quote problem" +test2 = urllib.unquote(expected) +assert test2 == chars + +in1 = "abc/def" +out1_1 = "abc/def" +out1_2 = "abc%2fdef" + +assert urllib.quote(in1) == out1_1, "urllib.quote problem" +assert urllib.quote(in1, '') == out1_2, "urllib.quote problem" + +in2 = "abc?def" +out2_1 = "abc%3fdef" +out2_2 = "abc?def" + +assert urllib.quote(in2) == out2_1, "urllib.quote problem" +assert urllib.quote(in2, '?') == out2_2, "urllib.quote problem" + + diff --git a/Lib/urllib.py b/Lib/urllib.py index 4492e992a56..8e11ba98ae4 100644 --- a/Lib/urllib.py +++ b/Lib/urllib.py @@ -426,7 +426,7 @@ class URLopener: dirs, file = dirs[:-1], dirs[-1] if dirs and not dirs[0]: dirs = dirs[1:] if dirs and not dirs[0]: dirs[0] = '/' - key = (user, host, port, string.joinfields(dirs, '/')) + key = user, host, port, string.join(dirs, '/') # XXX thread unsafe! if len(self.ftpcache) > MAXFTPCACHE: # Prune the cache, rather arbitrarily @@ -1013,22 +1013,58 @@ def unquote_plus(s): always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' - '0123456789' '_,.-') + '0123456789' '_.-') + +_fast_safe_test = always_safe + '/' +_fast_safe = None + +def _fast_quote(s): + global _fast_safe + if _fast_safe is None: + _fast_safe = {} + for c in _fast_safe_test: + _fast_safe[c] = c + res = list(s) + for i in range(len(res)): + c = res[i] + if not _fast_safe.has_key(c): + res[i] = '%%%02x' % ord(c) + return string.join(res, '') + def quote(s, safe = '/'): - """quote('abc def') -> 'abc%20def'.""" - # XXX Can speed this up an order of magnitude + """quote('abc def') -> 'abc%20def' + + Each part of a URL, e.g. the path info, the query, etc., has a + different set of reserved characters that must be quoted. + + RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists + the following reserved characters. + + reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | + "$" | "," + + Each of these characters is reserved in some component of a URL, + but not necessarily in all of them. + + By default, the quote function is intended for quoting the path + section of a URL. Thus, it will not encode '/'. This character + is reserved, but in typical usage the quote function is being + called on a path where the existing slash characters are used as + reserved characters. + """ safe = always_safe + safe + if _fast_safe_test == safe: + return _fast_quote(s) res = list(s) for i in range(len(res)): c = res[i] if c not in safe: res[i] = '%%%02x' % ord(c) - return string.joinfields(res, '') + return string.join(res, '') -def quote_plus(s, safe = '/'): - # XXX Can speed this up an order of magnitude +def quote_plus(s, safe = ''): + """Quote the query fragment of a URL; replacing ' ' with '+'""" if ' ' in s: - # replace ' ' with '+' l = string.split(s, ' ') for i in range(len(l)): l[i] = quote(l[i], safe)