bpo-44002: Switch to lru_cache in urllib.parse. (GH-25798)

Switch to lru_cache in urllib.parse.

urllib.parse now uses functools.lru_cache for its internal URL splitting and
quoting caches instead of rolling its own like it's the '90s.

The undocumented internal Quoter class API is now deprecated,
as it had no reason to be public and no existing OSS users were found.

The clear_cache() API remains undocumented but gains an explicit test, as it
is used in the test suites of a few projects (Twisted, gevent) as well as in our own regrtest.
Gregory P. Smith 2021-05-11 17:01:44 -07:00 committed by GitHub
parent e9d7f88d56
commit d597fdc5fd
3 changed files with 50 additions and 31 deletions
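
The core of the change is swapping a hand-maintained, size-capped dict for the decorator. A rough before/after sketch of the pattern (illustrative only; the function names and the .lower() stand-in are made up here, the real code is in the parse.py diff below):

import functools

# Old pattern (what the diff removes): a module-level dict with a hard size
# cap that is flushed wholesale once it fills up.
_parse_cache = {}
MAX_CACHE_SIZE = 20

def split_old(url):
    cached = _parse_cache.get(url)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        _parse_cache.clear()
    result = url.lower()  # stand-in for the real splitting work
    _parse_cache[url] = result
    return result

# New pattern (what the diff adds): functools.lru_cache handles keying,
# bounding and eviction.  typed=True keeps bytes and str arguments as
# distinct cache keys, which the diff notes avoids BytesWarnings during
# cache key comparison since the API accepts both input types.
@functools.lru_cache(typed=True)
def split_new(url):
    return url.lower()  # stand-in for the real splitting work

split_new.cache_clear()  # lru_cache wrappers grow cache_clear(), which the rewritten clear_cache() delegates to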

Lib/test/test_urlparse.py

@@ -1044,16 +1044,24 @@ class UrlParseTestCase(unittest.TestCase):
self.assertEqual(p1.params, 'phone-context=+1-914-555')
def test_Quoter_repr(self):
quoter = urllib.parse.Quoter(urllib.parse._ALWAYS_SAFE)
quoter = urllib.parse._Quoter(urllib.parse._ALWAYS_SAFE)
self.assertIn('Quoter', repr(quoter))
def test_clear_cache_for_code_coverage(self):
urllib.parse.clear_cache()
def test_urllib_parse_getattr_failure(self):
"""Test that urllib.parse.__getattr__() fails correctly."""
with self.assertRaises(AttributeError):
unused = urllib.parse.this_does_not_exist
def test_all(self):
expected = []
undocumented = {
'splitattr', 'splithost', 'splitnport', 'splitpasswd',
'splitport', 'splitquery', 'splittag', 'splittype', 'splituser',
'splitvalue',
'Quoter', 'ResultBase', 'clear_cache', 'to_bytes', 'unwrap',
'ResultBase', 'clear_cache', 'to_bytes', 'unwrap',
}
for name in dir(urllib.parse):
if name.startswith('_') or name in undocumented:
@@ -1245,6 +1253,12 @@ class Utility_Tests(unittest.TestCase):
class DeprecationTest(unittest.TestCase):
def test_Quoter_deprecation(self):
with self.assertWarns(DeprecationWarning) as cm:
old_class = urllib.parse.Quoter
self.assertIs(old_class, urllib.parse._Quoter)
self.assertIn('Quoter will be removed', str(cm.warning))
def test_splittype_deprecation(self):
with self.assertWarns(DeprecationWarning) as cm:
urllib.parse.splittype('')
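
The DeprecationTest added above exercises the module-level __getattr__ hook introduced in parse.py (next file). Outside the test suite the same behaviour can be checked roughly like this, assuming an interpreter that already contains this change and predates the planned 3.14 removal:

import warnings
import urllib.parse

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    # Attribute access falls through to urllib.parse.__getattr__,
    # which warns and hands back the renamed _Quoter class.
    old_class = urllib.parse.Quoter

assert old_class is urllib.parse._Quoter
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
assert any('Quoter will be removed' in str(w.message) for w in caught)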

Lib/urllib/parse.py

@@ -27,10 +27,11 @@ parsing quirks from older RFCs are retained. The testcases in
test_urlparse.py provides a good indicator of parsing behavior.
"""
from collections import namedtuple
import functools
import re
import sys
import types
import collections
import warnings
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
@@ -81,15 +82,10 @@ scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
# Unsafe bytes to be removed per WHATWG spec
_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']
# XXX: Consider replacing with functools.lru_cache
MAX_CACHE_SIZE = 20
_parse_cache = {}
def clear_cache():
"""Clear the parse cache and the quoters cache."""
_parse_cache.clear()
_safe_quoters.clear()
"""Clear internal performance caches. Undocumented; some tests want it."""
urlsplit.cache_clear()
_byte_quoter_factory.cache_clear()
# Helpers for bytes handling
# For 3.2, we deliberately require applications that
@@ -243,8 +239,6 @@ class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
return hostname, port
from collections import namedtuple
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
'SplitResult', 'scheme netloc path query fragment')
@@ -434,6 +428,9 @@ def _checknetloc(netloc):
raise ValueError("netloc '" + netloc + "' contains invalid " +
"characters under NFKC normalization")
# typed=True avoids BytesWarnings being emitted during cache key
# comparison since this API supports both bytes and str input.
@functools.lru_cache(typed=True)
def urlsplit(url, scheme='', allow_fragments=True):
"""Parse a URL into 5 components:
<scheme>://<netloc>/<path>?<query>#<fragment>
@@ -462,12 +459,6 @@ def urlsplit(url, scheme='', allow_fragments=True):
scheme = scheme.replace(b, "")
allow_fragments = bool(allow_fragments)
key = url, scheme, allow_fragments, type(url), type(scheme)
cached = _parse_cache.get(key, None)
if cached:
return _coerce_result(cached)
if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
clear_cache()
netloc = query = fragment = ''
i = url.find(':')
if i > 0:
@@ -488,7 +479,6 @@
url, query = url.split('?', 1)
_checknetloc(netloc)
v = SplitResult(scheme, netloc, url, query, fragment)
_parse_cache[key] = v
return _coerce_result(v)
def urlunparse(components):
@@ -791,23 +781,30 @@ _ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
b'0123456789'
b'_.-~')
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
_safe_quoters = {}
class Quoter(collections.defaultdict):
"""A mapping from bytes (in range(0,256)) to strings.
def __getattr__(name):
if name == 'Quoter':
warnings.warn('Deprecated in 3.11. '
'urllib.parse.Quoter will be removed in Python 3.14. '
'It was not intended to be a public API.',
DeprecationWarning, stacklevel=2)
return _Quoter
raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
class _Quoter(dict):
"""A mapping from bytes numbers (in range(0,256)) to strings.
String values are percent-encoded byte values, unless the key < 128, and
in the "safe" set (either the specified safe set, or default set).
in either of the specified safe set, or the always safe set.
"""
# Keeps a cache internally, using defaultdict, for efficiency (lookups
# Keeps a cache internally, via __missing__, for efficiency (lookups
# of cached keys don't call Python code at all).
def __init__(self, safe):
"""safe: bytes object."""
self.safe = _ALWAYS_SAFE.union(safe)
def __repr__(self):
# Without this, will just display as a defaultdict
return "<%s %r>" % (self.__class__.__name__, dict(self))
return f"<Quoter {dict(self)!r}>"
def __missing__(self, b):
# Handle a cache miss. Store quoted string in cache and return.
@@ -886,6 +883,11 @@ def quote_plus(string, safe='', encoding=None, errors=None):
string = quote(string, safe + space, encoding, errors)
return string.replace(' ', '+')
# Expectation: A typical program is unlikely to create more than 5 of these.
@functools.lru_cache
def _byte_quoter_factory(safe):
return _Quoter(safe).__getitem__
def quote_from_bytes(bs, safe='/'):
"""Like quote(), but accepts a bytes object rather than a str, and does
not perform string-to-bytes encoding. It always returns an ASCII string.
@@ -899,13 +901,11 @@ def quote_from_bytes(bs, safe='/'):
# Normalize 'safe' by converting to bytes and removing non-ASCII chars
safe = safe.encode('ascii', 'ignore')
else:
# List comprehensions are faster than generator expressions.
safe = bytes([c for c in safe if c < 128])
if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
return bs.decode()
try:
quoter = _safe_quoters[safe]
except KeyError:
_safe_quoters[safe] = quoter = Quoter(safe).__getitem__
quoter = _byte_quoter_factory(safe)
return ''.join([quoter(char) for char in bs])
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
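
Because the caches are now plain functools.lru_cache wrappers, they expose the standard cache_clear()/cache_info() interface, and the rewritten clear_cache() above simply delegates to it. A rough illustration of what that looks like from the outside (again assuming an interpreter with this change; exact counters depend on what else has touched the cache):

import urllib.parse

urllib.parse.clear_cache()  # start from a known-empty cache
urllib.parse.urlsplit('https://example.com/a?b=c')
urllib.parse.urlsplit('https://example.com/a?b=c')  # second identical call is a cache hit

info = urllib.parse.urlsplit.cache_info()
print(info)  # e.g. CacheInfo(hits=1, misses=1, maxsize=128, currsize=1)

urllib.parse.clear_cache()  # calls cache_clear() on both lru_cache wrappers
print(urllib.parse.urlsplit.cache_info())  # counters and currsize back to 0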

Misc/NEWS.d news entry (new file)

@@ -0,0 +1,5 @@
:mod:`urllib.parse` now uses :func:`functools.lru_cache` for its internal URL
splitting and quoting caches instead of rolling its own like it's the '90s.
The undocumented internal :mod:`urllib.parse` ``Quoter`` class API is now
deprecated, for removal in 3.14.