From fe1ad15b4bcc923bfba384cad4c647ece8944b83 Mon Sep 17 00:00:00 2001 From: Senthil Kumaran Date: Sat, 3 Jul 2010 17:55:41 +0000 Subject: [PATCH] Merged revisions 82510 via svnmerge from svn+ssh://pythondev@svn.python.org/python/branches/py3k ........ r82510 | senthil.kumaran | 2010-07-03 23:18:22 +0530 (Sat, 03 Jul 2010) | 4 lines Fix Issue5468 - urlencode to handle bytes and other alternate encodings. (Extensive tests provided). Patch by Dan Mahn. ........ --- Doc/library/urllib.parse.rst | 38 +++++++----- Lib/test/test_urllib.py | 110 +++++++++++++++++++++++++++++++++++ Lib/urllib/parse.py | 38 +++++++++--- Misc/NEWS | 3 + 4 files changed, 165 insertions(+), 24 deletions(-) diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst index d9776be4bfb..cfd995d4362 100644 --- a/Doc/library/urllib.parse.rst +++ b/Doc/library/urllib.parse.rst @@ -307,23 +307,29 @@ The :mod:`urllib.parse` module defines the following functions: ``b'a&\xef'``. -.. function:: urlencode(query, doseq=False) +.. function:: urlencode(query, doseq=False, safe='', encoding=None, errors=None) - Convert a mapping object or a sequence of two-element tuples to a - "url-encoded" string, suitable to pass to :func:`urlopen` above as the - optional *data* argument. This is useful to pass a dictionary of form - fields to a ``POST`` request. The resulting string is a series of - ``key=value`` pairs separated by ``'&'`` characters, where both *key* and - *value* are quoted using :func:`quote_plus` above. When a sequence of - two-element tuples is used as the *query* argument, the first element of - each tuple is a key and the second is a value. The value element in itself - can be a sequence and in that case, if the optional parameter *doseq* is - evaluates to *True*, individual ``key=value`` pairs separated by ``'&'``are - generated for each element of the value sequence for the key. 
The order of - parameters in the encoded string will match the order of parameter tuples in - the sequence. This module provides the functions :func:`parse_qs` and - :func:`parse_qsl` which are used to parse query strings into Python data - structures. + Convert a mapping object or a sequence of two-element tuples, which may contain + :class:`str` or :class:`bytes` objects, to a "url-encoded" string, + suitable to pass to :func:`urlopen` above as the optional *data* argument. + This is useful to pass a dictionary of form fields to a ``POST`` request. + The resulting string is a series of ``key=value`` pairs separated by ``'&'`` + characters, where both *key* and *value* are quoted using :func:`quote_plus` + above. When a sequence of two-element tuples is used as the *query* + argument, the first element of each tuple is a key and the second is a + value. The value element in itself can be a sequence and in that case, if + the optional parameter *doseq* evaluates to *True*, individual + ``key=value`` pairs separated by ``'&'`` are generated for each element of + the value sequence for the key. The order of parameters in the encoded + string will match the order of parameter tuples in the sequence. This module + provides the functions :func:`parse_qs` and :func:`parse_qsl` which are used + to parse query strings into Python data structures. + + When a key or value in *query* is a :class:`str`, the *safe*, *encoding* and *errors* + parameters are passed to :func:`quote_plus` for encoding. + + .. versionchanged:: 3.2 + query parameter supports bytes and string. .. seealso:: diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index f4b3766375c..acd55778248 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -797,6 +797,116 @@ class urlencode_Tests(unittest.TestCase): self.assertEqual("a=a&a=b", urllib.parse.urlencode({"a": {"a": 1, "b": 1}}, True)) + def test_urlencode_encoding(self): + # ASCII encoding. 
Expect %3F with errors="replace' + given = (('\u00a0', '\u00c1'),) + expect = '%3F=%3F' + result = urllib.parse.urlencode(given, encoding="ASCII", errors="replace") + self.assertEqual(expect, result) + + # Default is UTF-8 encoding. + given = (('\u00a0', '\u00c1'),) + expect = '%C2%A0=%C3%81' + result = urllib.parse.urlencode(given) + self.assertEqual(expect, result) + + # Latin-1 encoding. + given = (('\u00a0', '\u00c1'),) + expect = '%A0=%C1' + result = urllib.parse.urlencode(given, encoding="latin-1") + self.assertEqual(expect, result) + + def test_urlencode_encoding_doseq(self): + # ASCII Encoding. Expect %3F with errors="replace' + given = (('\u00a0', '\u00c1'),) + expect = '%3F=%3F' + result = urllib.parse.urlencode(given, doseq=True, + encoding="ASCII", errors="replace") + self.assertEqual(expect, result) + + # ASCII Encoding. On a sequence of values. + given = (("\u00a0", (1, "\u00c1")),) + expect = '%3F=1&%3F=%3F' + result = urllib.parse.urlencode(given, True, + encoding="ASCII", errors="replace") + self.assertEqual(expect, result) + + # Utf-8 + given = (("\u00a0", "\u00c1"),) + expect = '%C2%A0=%C3%81' + result = urllib.parse.urlencode(given, True) + self.assertEqual(expect, result) + + given = (("\u00a0", (42, "\u00c1")),) + expect = '%C2%A0=42&%C2%A0=%C3%81' + result = urllib.parse.urlencode(given, True) + self.assertEqual(expect, result) + + # latin-1 + given = (("\u00a0", "\u00c1"),) + expect = '%A0=%C1' + result = urllib.parse.urlencode(given, True, encoding="latin-1") + self.assertEqual(expect, result) + + given = (("\u00a0", (42, "\u00c1")),) + expect = '%A0=42&%A0=%C1' + result = urllib.parse.urlencode(given, True, encoding="latin-1") + self.assertEqual(expect, result) + + def test_urlencode_bytes(self): + given = ((b'\xa0\x24', b'\xc1\x24'),) + expect = '%A0%24=%C1%24' + result = urllib.parse.urlencode(given) + self.assertEqual(expect, result) + result = urllib.parse.urlencode(given, True) + self.assertEqual(expect, result) + + # Sequence of 
values + given = ((b'\xa0\x24', (42, b'\xc1\x24')),) + expect = '%A0%24=42&%A0%24=%C1%24' + result = urllib.parse.urlencode(given, True) + self.assertEqual(expect, result) + + def test_urlencode_encoding_safe_parameter(self): + + # Send '$' (\x24) as safe character + # Default utf-8 encoding + + given = ((b'\xa0\x24', b'\xc1\x24'),) + result = urllib.parse.urlencode(given, safe=":$") + expect = '%A0$=%C1$' + self.assertEqual(expect, result) + + given = ((b'\xa0\x24', b'\xc1\x24'),) + result = urllib.parse.urlencode(given, doseq=True, safe=":$") + expect = '%A0$=%C1$' + self.assertEqual(expect, result) + + # Safe parameter in sequence + given = ((b'\xa0\x24', (b'\xc1\x24', 0xd, 42)),) + expect = '%A0$=%C1$&%A0$=13&%A0$=42' + result = urllib.parse.urlencode(given, True, safe=":$") + self.assertEqual(expect, result) + + # Test all above in latin-1 encoding + + given = ((b'\xa0\x24', b'\xc1\x24'),) + result = urllib.parse.urlencode(given, safe=":$", + encoding="latin-1") + expect = '%A0$=%C1$' + self.assertEqual(expect, result) + + given = ((b'\xa0\x24', b'\xc1\x24'),) + expect = '%A0$=%C1$' + result = urllib.parse.urlencode(given, doseq=True, safe=":$", + encoding="latin-1") + + given = ((b'\xa0\x24', (b'\xc1\x24', 0xd, 42)),) + expect = '%A0$=%C1$&%A0$=13&%A0$=42' + result = urllib.parse.urlencode(given, True, safe=":$", + encoding="latin-1") + self.assertEqual(expect, result) + class Pathname_Tests(unittest.TestCase): """Test pathname2url() and url2pathname()""" diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index b7890d84dd9..27b732b78cd 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -533,7 +533,7 @@ def quote_from_bytes(bs, safe='/'): _safe_quoters[cachekey] = quoter return ''.join([quoter[char] for char in bs]) -def urlencode(query, doseq=False): +def urlencode(query, doseq=False, safe='', encoding=None, errors=None): """Encode a sequence of two-element tuples or dictionary into a URL query string. 
If any values in the query arg are sequences and doseq is true, each @@ -542,6 +542,10 @@ def urlencode(query, doseq=False): If the query arg is a sequence of two-element tuples, the order of the parameters in the output will match the order of parameters in the input. + + The query arg may be either a string or a bytes type. When query arg is a + string, the safe, encoding and error parameters are sent the quote_plus for + encoding. """ if hasattr(query, "items"): @@ -566,14 +570,28 @@ def urlencode(query, doseq=False): l = [] if not doseq: for k, v in query: - k = quote_plus(str(k)) - v = quote_plus(str(v)) + if isinstance(k, bytes): + k = quote_plus(k, safe) + else: + k = quote_plus(str(k), safe, encoding, errors) + + if isinstance(v, bytes): + v = quote_plus(v, safe) + else: + v = quote_plus(str(v), safe, encoding, errors) l.append(k + '=' + v) else: for k, v in query: - k = quote_plus(str(k)) - if isinstance(v, str): - v = quote_plus(v) + if isinstance(k, bytes): + k = quote_plus(k, safe) + else: + k = quote_plus(str(k), safe, encoding, errors) + + if isinstance(v, bytes): + v = quote_plus(v, safe) + l.append(k + '=' + v) + elif isinstance(v, str): + v = quote_plus(v, safe, encoding, errors) l.append(k + '=' + v) else: try: @@ -581,12 +599,16 @@ def urlencode(query, doseq=False): x = len(v) except TypeError: # not a sequence - v = quote_plus(str(v)) + v = quote_plus(str(v), safe, encoding, errors) l.append(k + '=' + v) else: # loop over the sequence for elt in v: - l.append(k + '=' + quote_plus(str(elt))) + if isinstance(elt, bytes): + elt = quote_plus(elt, safe) + else: + elt = quote_plus(str(elt), safe, encoding, errors) + l.append(k + '=' + elt) return '&'.join(l) # Utilities to parse URLs (most of these return None for missing parts): diff --git a/Misc/NEWS b/Misc/NEWS index cf9cf74acae..d630751de8d 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -75,6 +75,9 @@ C-API Library ------- +- Issue #5468: urlencode to handle bytes type and other encodings in its query 
+ parameter. Patch by Dan Mahn. + - Issue #7673: Fix security vulnerability (CVE-2010-2089) in the audioop module, ensure that the input string length is a multiple of the frame size