Issue 9396.  Apply functools.lru_cache in the place of the
random flushing cache in the re module.
Raymond Hettinger 2010-08-09 04:24:42 +00:00
parent cca65313c4
commit 4f859ed9c7
3 changed files with 14 additions and 84 deletions
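The pattern this commit applies is general: delete a hand-rolled dict cache and its overflow bookkeeping, and decorate a pure function with functools.lru_cache instead. A minimal sketch of that pattern, with illustrative names rather than the re module's own:

import functools

_MAXCACHE = 500  # the re module's default cache size after this change

@functools.lru_cache(maxsize=_MAXCACHE)
def compile_pattern(pattern, flags=0):
    # Stand-in for an expensive compilation step; the body runs
    # only on a cache miss.
    print("compiling", pattern)
    return (pattern, flags)

compile_pattern("a+")   # miss: prints "compiling a+"
compile_pattern("a+")   # hit: returns the cached result silently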

Lib/re.py

@@ -118,6 +118,7 @@ This module also defines an exception 'error'.
 import sys
 import sre_compile
 import sre_parse
+import functools

 # public symbols
 __all__ = [ "match", "search", "sub", "subn", "split", "findall",
@@ -205,9 +206,9 @@ def compile(pattern, flags=0):
     return _compile(pattern, flags)

 def purge():
-    "Clear the regular expression cache"
-    _cache.clear()
-    _cache_repl.clear()
+    "Clear the regular expression caches"
+    _compile_typed.clear()
+    _compile_repl.clear()

 def template(pattern, flags=0):
     "Compile a template pattern, returning a pattern object"
@@ -289,12 +290,12 @@ def _shrink_cache(cache_dict, max_length, divisor=5):
         # Ignore problems if the cache changed from another thread.
         pass

-def _compile(*key):
+def _compile(*args):
+    return _compile_typed(type(args[0]), *args)
+
+@functools.lru_cache(maxsize=_MAXCACHE)
+def _compile_typed(type, *key):
     # internal: compile pattern
-    cachekey = (type(key[0]),) + key
-    p = _cache.get(cachekey)
-    if p is not None:
-        return p
     pattern, flags = key
     if isinstance(pattern, _pattern_type):
         if flags:
@@ -303,23 +304,14 @@ def _compile(*key):
         return pattern
     if not sre_compile.isstring(pattern):
         raise TypeError("first argument must be string or compiled pattern")
-    p = sre_compile.compile(pattern, flags)
-    if len(_cache) >= _MAXCACHE:
-        _shrink_cache(_cache, _MAXCACHE)
-    _cache[cachekey] = p
-    return p
+    return sre_compile.compile(pattern, flags)

+@functools.lru_cache(maxsize=_MAXCACHE)
 def _compile_repl(*key):
     # internal: compile replacement pattern
-    p = _cache_repl.get(key)
-    if p is not None:
-        return p
     repl, pattern = key
-    p = sre_parse.parse_template(repl, pattern)
-    if len(_cache_repl) >= _MAXCACHE:
-        _shrink_cache(_cache_repl, _MAXCACHE)
-    _cache_repl[key] = p
-    return p
+    return sre_parse.parse_template(repl, pattern)

 def _expand(pattern, match, template):
     # internal: match.expand implementation hook

Lib/test/test_re.py

@@ -875,70 +875,8 @@ def run_re_tests():
             print('=== Fails on unicode-sensitive match', t)

-class ReCacheTests(unittest.TestCase):
-    """These tests are specific to the re._shrink_cache implementation."""
-
-    def setUp(self):
-        self._orig_maxcache = re._MAXCACHE
-
-    def tearDown(self):
-        re._MAXCACHE = self._orig_maxcache
-
-    def test_compile_cache_overflow(self):
-        # NOTE: If a profiler or debugger is tracing code and compiling
-        # regular expressions while tracing through this test... expect
-        # the test to fail.  This test is not concurrency safe.
-        # Explicitly fill the caches.
-        re._MAXCACHE = 20
-        max_cache = re._MAXCACHE
-        unique_chars = tuple(chr(char_num) for char_num in
-                             range(b'a'[0], b'a'[0]+max_cache))
-        re._cache.clear()
-        for char in unique_chars:
-            re._compile(char, 0)
-        self.assertEqual(max_cache, len(re._cache))
-        re._cache_repl.clear()
-        for char in unique_chars:
-            re._compile_repl(char*2, char)
-        self.assertEqual(max_cache, len(re._cache_repl))
-        # Overflow both caches and make sure they have extra room left
-        # afterwards as well as having more than a single entry.
-        re._compile('A', 0)
-        self.assertLess(len(re._cache), max_cache)
-        self.assertGreater(len(re._cache), 1)
-        re._compile_repl('A', 'A')
-        self.assertLess(len(re._cache_repl), max_cache)
-        self.assertGreater(len(re._cache_repl), 1)
-
-    def test_shrink_cache_at_limit(self):
-        cache = dict(zip(range(6), range(6)))
-        re._shrink_cache(cache, 6, divisor=3)
-        self.assertEqual(4, len(cache))
-
-    def test_shrink_cache_empty(self):
-        cache = {}
-        re._shrink_cache(cache, 6, divisor=3)
-        # Cache was empty, make sure we didn't raise an exception.
-        self.assertEqual(0, len(cache))
-
-    def test_shrink_cache_overflowing(self):
-        cache = dict(zip(range(6), range(6)))
-        re._shrink_cache(cache, 4, divisor=2)
-        # Cache was larger than the maximum, be sure we shrunk to smaller.
-        self.assertEqual(2, len(cache))
-
-    def test_shrink_cache_underflow(self):
-        cache = dict(zip(range(6), range(6)))
-        # No shrinking to do.
-        re._shrink_cache(cache, 9, divisor=3)
-        self.assertEqual(6, len(cache))
-
 def test_main():
     run_unittest(ReTests)
-    run_unittest(ReCacheTests)
     run_re_tests()

 if __name__ == "__main__":
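ReCacheTests is removed wholesale because it exercised the private _shrink_cache machinery that no longer drives eviction. lru_cache brings its own introspection, so equivalent coverage can be written against the wrapper itself. A sketch using the cache_info()/cache_clear() names from released Pythons (this commit's wrapper still used .clear(), as purge() shows):

import functools
import unittest

@functools.lru_cache(maxsize=4)
def cached(x):
    return x * 2

class LRUBehaviorTest(unittest.TestCase):
    def test_eviction_keeps_recent_entries(self):
        cached.cache_clear()
        for i in range(6):          # two more calls than maxsize
            cached(i)
        info = cached.cache_info()
        self.assertEqual(info.currsize, 4)   # never exceeds maxsize
        self.assertEqual(info.misses, 6)
        cached(5)                            # most recent entry survived
        self.assertEqual(cached.cache_info().hits, 1)

if __name__ == "__main__":
    unittest.main()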

Misc/NEWS

@@ -697,8 +697,8 @@ Library

 - The default size of the re module's compiled regular expression cache has been
   increased from 100 to 500 and the cache replacement policy has changed from
-  simply clearing the entire cache on overflow to randomly forgetting 20% of the
-  existing cached compiled regular expressions.  This is a performance win for
+  simply clearing the entire cache on overflow to forgetting the least recently
+  used cached compiled regular expressions.  This is a performance win for
   applications that use a lot of regular expressions and limits the impact of
   the performance hit anytime the cache is exceeded.
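The policy difference the NEWS entry describes is easy to observe directly: on overflow, an LRU cache discards only the least recently used entry, instead of clearing everything or dropping a random 20%. A small self-contained demonstration (not the re module):

import functools

calls = []

@functools.lru_cache(maxsize=3)
def f(x):
    calls.append(x)   # records every cache miss
    return x

for x in "abc":
    f(x)                    # fills the cache: a, b, c
f("a")                      # refreshes 'a'; 'b' is now least recently used
f("d")                      # overflow: only 'b' is evicted
f("a"); f("c"); f("d")      # all hits -- nothing recompiled
f("b")                      # the one evicted entry is recomputed
print(calls)                # ['a', 'b', 'c', 'd', 'b']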