gh-96346: Use double caching for re._compile() (#96347)

This commit is contained in:
Serhiy Storchaka 2022-10-07 22:21:42 +03:00 committed by GitHub
parent eed80458e8
commit c11b667a1d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 47 additions and 21 deletions

View File

@ -229,6 +229,7 @@ def compile(pattern, flags=0):
def purge():
"Clear the regular expression caches"
_cache.clear()
_cache2.clear()
_compile_repl.cache_clear()
def template(pattern, flags=0):
@ -266,40 +267,64 @@ Match = type(_compiler.compile('', 0).match(''))
# --------------------------------------------------------------------
# internals
_cache = {} # ordered!
# Use the fact that dict keeps the insertion order.
# _cache2 uses the simple FIFO policy which has better latency.
# _cache uses the LRU policy which has better hit rate.
_cache = {} # LRU
_cache2 = {} # FIFO
_MAXCACHE = 512
_MAXCACHE2 = 256
assert _MAXCACHE2 < _MAXCACHE
def _compile(pattern, flags):
# internal: compile pattern
if isinstance(flags, RegexFlag):
flags = flags.value
try:
return _cache[type(pattern), pattern, flags]
return _cache2[type(pattern), pattern, flags]
except KeyError:
pass
if isinstance(pattern, Pattern):
if flags:
raise ValueError(
"cannot process flags argument with a compiled pattern")
return pattern
if not _compiler.isstring(pattern):
raise TypeError("first argument must be string or compiled pattern")
if flags & T:
import warnings
warnings.warn("The re.TEMPLATE/re.T flag is deprecated "
"as it is an undocumented flag "
"without an obvious purpose. "
"Don't use it.",
DeprecationWarning)
p = _compiler.compile(pattern, flags)
if not (flags & DEBUG):
key = (type(pattern), pattern, flags)
# Item in _cache should be moved to the end if found.
p = _cache.pop(key, None)
if p is None:
if isinstance(pattern, Pattern):
if flags:
raise ValueError(
"cannot process flags argument with a compiled pattern")
return pattern
if not _compiler.isstring(pattern):
raise TypeError("first argument must be string or compiled pattern")
if flags & T:
import warnings
warnings.warn("The re.TEMPLATE/re.T flag is deprecated "
"as it is an undocumented flag "
"without an obvious purpose. "
"Don't use it.",
DeprecationWarning)
p = _compiler.compile(pattern, flags)
if flags & DEBUG:
return p
if len(_cache) >= _MAXCACHE:
# Drop the oldest item
# Drop the least recently used item.
# next(iter(_cache)) is known to have linear amortized time,
# but it is used here to avoid a dependency from using OrderedDict.
# For the small _MAXCACHE value it doesn't make much of a difference.
try:
del _cache[next(iter(_cache))]
except (StopIteration, RuntimeError, KeyError):
pass
_cache[type(pattern), pattern, flags] = p
# Append to the end.
_cache[key] = p
if len(_cache2) >= _MAXCACHE2:
# Drop the oldest item.
try:
del _cache2[next(iter(_cache2))]
except (StopIteration, RuntimeError, KeyError):
pass
_cache2[key] = p
return p
@functools.lru_cache(_MAXCACHE)

View File

@ -0,0 +1 @@
Use double caching for compiled RE patterns.