Fix Issue #1712522: make urllib.quote support Unicode. The defaults are

encoding='utf-8' and errors='strict'.
This commit is contained in:
Senthil Kumaran 2010-07-18 02:27:10 +00:00
parent 5d10d33cd5
commit 5dba6dfe6a
4 changed files with 152 additions and 7 deletions

View File

@ -202,24 +202,40 @@ High-level interface
Utility functions
-----------------
.. function:: quote(string[, safe])
.. function:: quote(string[, safe[, encoding[, errors]]])
Replace special characters in *string* using the ``%xx`` escape. Letters,
digits, and the characters ``'_.-'`` are never quoted. By default, this
function is intended for quoting the path section of the URL.The optional
function is intended for quoting the path section of the URL. The optional
*safe* parameter specifies additional characters that should not be quoted
--- its default value is ``'/'``.
*string* may be either a :class:`str` or a :class:`unicode`.
The optional *encoding* and *errors* parameters specify how to deal with
non-ASCII characters, as accepted by the :meth:`unicode.encode` method.
*encoding* defaults to ``'utf-8'``.
*errors* defaults to ``'strict'``, meaning unsupported characters raise a
:class:`UnicodeEncodeError`.
Non-Unicode strings are not encoded by default, and all bytes are allowed.
Example: ``quote('/~connolly/')`` yields ``'/%7Econnolly/'``.
Example: ``quote(u'/El Niño/')`` yields ``'/El%20Ni%C3%B1o/'``.
.. function:: quote_plus(string[, safe])
.. versionchanged:: 2.7.1
Added *encoding* and *errors* parameters.
.. function:: quote_plus(string[, safe[, encoding[, errors]]])
Like :func:`quote`, but also replaces spaces by plus signs, as required for
quoting HTML form values when building up a query string to go into a URL.
Plus signs in the original string are escaped unless they are included in
*safe*. Unlike :func:`quote`, *safe* defaults to the empty string rather than ``'/'``.
Example: ``quote_plus(u'/El Niño/')`` yields ``'%2FEl+Ni%C3%B1o%2F'``.
.. function:: unquote(string)

View File

@ -355,6 +355,38 @@ class QuotingTests(unittest.TestCase):
self.assertEqual(quote_by_default, result,
"using quote_plus(): %s != %s" %
(quote_by_default, result))
# Safe expressed as unicode rather than str
result = urllib.quote(quote_by_default, safe=u"<>")
self.assertEqual(quote_by_default, result,
"using quote(): %r != %r" % (quote_by_default, result))
# "Safe" non-ASCII bytes should still work
# (Technically disallowed by the URI standard, but allowed for
# backwards compatibility with previous versions of Python)
result = urllib.quote(b"a\xfcb", safe=b"\xfc")
expect = b"a\xfcb"
self.assertEqual(expect, result,
"using quote(): %r != %r" %
(expect, result))
# Same as above, but with 'safe' as a unicode rather than str
# "Safe" non-ASCII unicode characters should have no effect
# (Since URIs are not allowed to have non-ASCII characters)
result = urllib.quote(b"a\xfcb", safe=u"\xfc")
expect = urllib.quote(b"a\xfcb", safe="")
self.assertEqual(expect, result,
"using quote(): %r != %r" %
(expect, result))
# Same as above, but quoting a unicode rather than a str
result = urllib.quote(u"a\xfcb", encoding="latin-1", safe=b"\xfc")
expect = b"a\xfcb"
self.assertEqual(expect, result,
"using quote(): %r != %r" %
(expect, result))
# Same as above, but with both the quoted value and 'safe' as unicode
result = urllib.quote(u"a\xfcb", encoding="latin-1", safe=u"\xfc")
expect = urllib.quote(u"a\xfcb", encoding="latin-1", safe="")
self.assertEqual(expect, result,
"using quote(): %r != %r" %
(expect, result))
def test_default_quoting(self):
# Make sure all characters that should be quoted are by default sans
@ -406,6 +438,81 @@ class QuotingTests(unittest.TestCase):
'alpha%2Bbeta+gamma')
self.assertEqual(urllib.quote_plus('alpha+beta gamma', '+'),
'alpha+beta+gamma')
# Test with unicode
self.assertEqual(urllib.quote_plus(u'alpha+beta gamma'),
'alpha%2Bbeta+gamma')
# Test with safe unicode
self.assertEqual(urllib.quote_plus('alpha+beta gamma', u'+'),
'alpha+beta+gamma')
def test_quote_bytes(self):
    # A byte string holding non-ASCII bytes is percent-encoded byte by byte.
    raw = b"\xa2\xd8ab\xff"
    wanted = "%A2%D8ab%FF"
    quoted = urllib.quote(raw)
    failure_msg = "using quote(): %r != %r" % (wanted, quoted)
    self.assertEqual(wanted, quoted, failure_msg)
    # Supplying an encoding for bytes input that contains non-ASCII
    # characters must raise UnicodeDecodeError (just as str.encode does).
    with self.assertRaises(UnicodeDecodeError):
        urllib.quote(raw, encoding="latin-1")
def test_quote_with_unicode(self):
    # Exercises quote() on unicode input under several encoding/errors
    # combinations.
    def check(text, wanted, **quote_kwargs):
        # Quote *text* with the given keyword arguments and compare the
        # outcome with *wanted*.
        quoted = urllib.quote(text, **quote_kwargs)
        self.assertEqual(wanted, quoted,
                         "using quote(): %r != %r" % (wanted, quoted))

    latin1_chars = u"\xa2\xd8ab\xff"
    bmp_chars = u"\u6f22\u5b57"  # "Kanji"

    # Characters in Latin-1 range, encoded by default in UTF-8
    check(latin1_chars, "%C2%A2%C3%98ab%C3%BF")
    # Characters in Latin-1 range, with encoding and errors passed
    # explicitly as None (same result as the default)
    check(latin1_chars, "%C2%A2%C3%98ab%C3%BF", encoding=None, errors=None)
    # Characters in Latin-1 range, encoded with Latin-1
    check(latin1_chars, "%A2%D8ab%FF", encoding="latin-1")
    # Characters in BMP, encoded by default in UTF-8
    check(bmp_chars, "%E6%BC%A2%E5%AD%97")
    # Characters in BMP, encoded with Latin-1: must raise
    with self.assertRaises(UnicodeEncodeError):
        urllib.quote(bmp_chars, encoding="latin-1")
    # Characters in BMP, Latin-1, with "replace" error handling;
    # the expected value is the quoted form of "??"
    check(bmp_chars, "%3F%3F", encoding="latin-1", errors="replace")
    # Characters in BMP, Latin-1, with "xmlcharrefreplace" error handling;
    # the expected value is the quoted form of "&#28450;&#23383;"
    check(bmp_chars, "%26%2328450%3B%26%2323383%3B",
          encoding="latin-1", errors="xmlcharrefreplace")
def test_quote_plus_with_unicode(self):
    # Each case is (input text, expected result, keyword arguments).
    cases = (
        # Encoding (latin-1) test for quote_plus
        (u"\xa2\xd8 \xff", "%A2%D8+%FF",
         {"encoding": "latin-1"}),
        # Errors ("replace") test for quote_plus
        (u"ab\u6f22\u5b57 cd", "ab%3F%3F+cd",
         {"encoding": "latin-1", "errors": "replace"}),
    )
    for text, wanted, quote_kwargs in cases:
        quoted = urllib.quote_plus(text, **quote_kwargs)
        self.assertEqual(wanted, quoted,
                         "using quote_plus(): %r != %r" % (wanted, quoted))
class UnquotingTests(unittest.TestCase):
"""Tests for unquote() and unquote_plus()

View File

@ -1193,7 +1193,7 @@ for i, c in zip(xrange(256), str(bytearray(xrange(256)))):
_safe_map[c] = c if (i < 128 and c in always_safe) else '%{:02X}'.format(i)
_safe_quoters = {}
def quote(s, safe='/'):
def quote(s, safe='/', encoding=None, errors=None):
"""quote('abc def') -> 'abc%20def'
Each part of a URL, e.g. the path info, the query, etc., has a
@ -1213,10 +1213,28 @@ def quote(s, safe='/'):
is reserved, but in typical usage the quote function is being
called on a path where the existing slash characters are used as
reserved characters.
string and safe may be either str or unicode objects.
The optional encoding and errors parameters specify how to deal with the
non-ASCII characters, as accepted by the unicode.encode method.
By default, encoding='utf-8' (characters are encoded with UTF-8), and
errors='strict' (unsupported characters raise a UnicodeEncodeError).
"""
# fastpath
if not s:
return s
if encoding is not None or isinstance(s, unicode):
if encoding is None:
encoding = 'utf-8'
if errors is None:
errors = 'strict'
s = s.encode(encoding, errors)
if isinstance(safe, unicode):
# Normalize 'safe' by converting to str and removing non-ASCII chars
safe = safe.encode('ascii', 'ignore')
cachekey = (safe, always_safe)
try:
(quoter, safe) = _safe_quoters[cachekey]
@ -1230,12 +1248,12 @@ def quote(s, safe='/'):
return s
return ''.join(map(quoter, s))
# NOTE(review): this span comes from a diff rendering whose +/- markers were
# lost; each changed line appears twice, presumably the pre-patch form first
# and the post-patch form (with encoding/errors) second -- confirm against
# the real file before editing the code.
def quote_plus(s, safe=''):
def quote_plus(s, safe='', encoding=None, errors=None):
"""Quote the query fragment of a URL; replacing ' ' with '+'

s is the string to quote.  safe lists additional characters that
quote() must leave unquoted; it defaults to the empty string.
encoding and errors are handed to quote() unchanged.

When s contains a space, the space is added to the safe set so that
quote() leaves it alone, and every space in the quoted result is then
replaced by '+'.  Otherwise the result is simply that of quote().
"""
if ' ' in s:
s = quote(s, safe + ' ')
s = quote(s, safe + ' ', encoding, errors)
return s.replace(' ', '+')
return quote(s, safe)
return quote(s, safe, encoding, errors)
def urlencode(query, doseq=0):
"""Encode a sequence of two-element tuples or dictionary into a URL query string.

View File

@ -18,6 +18,10 @@ Core and Builtins
Library
-------
- Issue #1712522: urllib.quote now supports Unicode strings through new
encoding and errors parameters. The encoding parameter defaults to utf-8
and errors to strict. Patch by Matt Giuca.
- Issue #7646: The fnmatch pattern cache no longer grows without bound.
- Issue #9136: Fix 'dictionary changed size during iteration'