From 21024f06622c4c55b666adb130797a4ee205d005 Mon Sep 17 00:00:00 2001 From: Ratnadeep Debnath Date: Sat, 25 Feb 2017 14:30:28 +0530 Subject: [PATCH] bpo-16285: Update urllib quoting to RFC 3986 (#173) * bpo-16285: Update urllib quoting to RFC 3986 urllib.parse.quote is now based on RFC 3986, and hence includes `'~'` in the set of characters that is not escaped by default. Patch by Christian Theune and Ratnadeep Debnath. --- Doc/library/urllib.parse.rst | 6 +++++- Doc/whatsnew/3.7.rst | 7 +++++++ Lib/test/test_urllib.py | 4 ++-- Lib/urllib/parse.py | 9 ++++++--- Misc/ACKS | 4 +++- Misc/NEWS | 4 ++++ 6 files changed, 27 insertions(+), 7 deletions(-) diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst index 676321b46a2..7a5b56f5da6 100644 --- a/Doc/library/urllib.parse.rst +++ b/Doc/library/urllib.parse.rst @@ -451,13 +451,17 @@ task isn't already covered by the URL parsing functions above. .. function:: quote(string, safe='/', encoding=None, errors=None) Replace special characters in *string* using the ``%xx`` escape. Letters, - digits, and the characters ``'_.-'`` are never quoted. By default, this + digits, and the characters ``'_.-~'`` are never quoted. By default, this function is intended for quoting the path section of URL. The optional *safe* parameter specifies additional ASCII characters that should not be quoted --- its default value is ``'/'``. *string* may be either a :class:`str` or a :class:`bytes`. + .. versionchanged:: 3.7 + Moved from RFC 2396 to RFC 3986 for quoting URL strings. "~" is now + included in the set of unreserved characters. + The optional *encoding* and *errors* parameters specify how to deal with non-ASCII characters, as accepted by the :meth:`str.encode` method. *encoding* defaults to ``'utf-8'``. 
diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst index 861c53728ee..35eea84e0a6 100644 --- a/Doc/whatsnew/3.7.rst +++ b/Doc/whatsnew/3.7.rst @@ -103,6 +103,13 @@ The :const:`~unittest.mock.sentinel` attributes now preserve their identity when they are :mod:`copied <copy>` or :mod:`pickled <pickle>`. (Contributed by Serhiy Storchaka in :issue:`20804`.) +urllib.parse +------------ + +:func:`urllib.parse.quote` has been updated from RFC 2396 to RFC 3986, +adding ``~`` to the set of characters that is never quoted by default. +(Contributed by Christian Theune and Ratnadeep Debnath in :issue:`16285`.) + Optimizations ============= diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 5084486e5ab..bffbb0a8d1e 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -733,7 +733,7 @@ FF class QuotingTests(unittest.TestCase): r"""Tests for urllib.quote() and urllib.quote_plus() - According to RFC 2396 (Uniform Resource Identifiers), to escape a + According to RFC 3986 (Uniform Resource Identifiers), to escape a character you write it as '%' + <2 character US-ASCII hex value>. The Python code of ``'%' + hex(ord(<character>))[2:]`` escapes a character properly. Case does not matter on the hex letters. 
@@ -761,7 +761,7 @@ class QuotingTests(unittest.TestCase): do_not_quote = '' .join(["ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz", "0123456789", - "_.-"]) + "_.-~"]) result = urllib.parse.quote(do_not_quote) self.assertEqual(do_not_quote, result, "using quote(): %r != %r" % (do_not_quote, result)) diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 1d08730a89f..f3a309aacc2 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -704,7 +704,7 @@ def unquote_plus(string, encoding='utf-8', errors='replace'): _ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ' b'abcdefghijklmnopqrstuvwxyz' b'0123456789' - b'_.-') + b'_.-~') _ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE) _safe_quoters = {} @@ -736,15 +736,18 @@ def quote(string, safe='/', encoding=None, errors=None): Each part of a URL, e.g. the path info, the query, etc., has a different set of reserved characters that must be quoted. - RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists + RFC 3986 Uniform Resource Identifiers (URI): Generic Syntax lists the following reserved characters. reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | - "$" | "," + "$" | "," | "~" Each of these characters is reserved in some component of a URL, but not necessarily in all of them. + Python 3.7 updates from using RFC 2396 to RFC 3986 to quote URL strings. + Now, "~" is included in the set of unreserved characters. + By default, the quote function is intended for quoting the path section of a URL. Thus, it will not encode '/'. This character is reserved, but in typical usage the quote function is being diff --git a/Misc/ACKS b/Misc/ACKS index e63a061098e..255318e5724 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -344,6 +344,7 @@ Kushal Das Jonathan Dasteel Pierre-Yves David A. Jesse Jiryu Davis +Ratnadeep Debnath Merlijn van Deen John DeGood Ned Deily @@ -1518,6 +1519,7 @@ Mikhail Terekhov Victor Terrón Richard M. Tew Tobias Thelen +Christian Theune Févry Thibault Lowe Thiderman Nicolas M. 
Thiéry @@ -1528,7 +1530,7 @@ Stephen Thorne Jeremy Thurgood Eric Tiedemann July Tikhonov -Tracy Tims +Tracy Tims Oren Tirosh Tim Tisdall Jason Tishler diff --git a/Misc/NEWS b/Misc/NEWS index e7ab3df8d77..74ec8c3bdf2 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -249,6 +249,10 @@ Extension Modules Library ------- +- Issue #16285: urllib.parse.quote is now based on RFC 3986 and hence includes + '~' in the set of characters that is not quoted by default. Patch by + Christian Theune and Ratnadeep Debnath. + - bpo-29532: Altering a kwarg dictionary passed to functools.partial() no longer affects a partial object after creation.