diff --git a/Lib/re.py b/Lib/re.py index 269eaef8554..2f1a76eaa52 100644 --- a/Lib/re.py +++ b/Lib/re.py @@ -118,7 +118,6 @@ This module also defines an exception 'error'. import sys import sre_compile import sre_parse -import functools # public symbols __all__ = [ "match", "search", "sub", "subn", "split", "findall", @@ -206,9 +205,9 @@ def compile(pattern, flags=0): return _compile(pattern, flags) def purge(): - "Clear the regular expression caches" - _compile_typed.clear() - _compile_repl.clear() + "Clear the regular expression cache" + _cache.clear() + _cache_repl.clear() def template(pattern, flags=0): "Compile a template pattern, returning a pattern object" @@ -290,12 +289,12 @@ def _shrink_cache(cache_dict, max_length, divisor=5): # Ignore problems if the cache changed from another thread. pass -def _compile(*args): - return _compile_typed(type(args[0]), *args) - -@functools.lru_cache(maxsize=_MAXCACHE) -def _compile_typed(type, *key): +def _compile(*key): # internal: compile pattern + cachekey = (type(key[0]),) + key + p = _cache.get(cachekey) + if p is not None: + return p pattern, flags = key if isinstance(pattern, _pattern_type): if flags: @@ -304,14 +303,23 @@ def _compile_typed(type, *key): return pattern if not sre_compile.isstring(pattern): raise TypeError("first argument must be string or compiled pattern") - return sre_compile.compile(pattern, flags) + p = sre_compile.compile(pattern, flags) + if len(_cache) >= _MAXCACHE: + _shrink_cache(_cache, _MAXCACHE) + _cache[cachekey] = p return p -@functools.lru_cache(maxsize=_MAXCACHE) def _compile_repl(*key): # internal: compile replacement pattern + p = _cache_repl.get(key) + if p is not None: + return p repl, pattern = key - return sre_parse.parse_template(repl, pattern) + p = sre_parse.parse_template(repl, pattern) + if len(_cache_repl) >= _MAXCACHE: + _shrink_cache(_cache_repl, _MAXCACHE) + _cache_repl[key] = p + return p def _expand(pattern, match, template): # internal: match.expand implementation 
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 96a83b88c18..6b11685e9c8 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -875,8 +875,70 @@ def run_re_tests(): print('=== Fails on unicode-sensitive match', t) +class ReCacheTests(unittest.TestCase): + """These tests are specific to the re._shrink_cache implementation.""" + + def setUp(self): + self._orig_maxcache = re._MAXCACHE + + def tearDown(self): + re._MAXCACHE = self._orig_maxcache + + def test_compile_cache_overflow(self): + # NOTE: If a profiler or debugger is tracing code and compiling + # regular expressions while tracing through this test... expect + # the test to fail. This test is not concurrency safe. + + # Explicitly fill the caches. + re._MAXCACHE = 20 + max_cache = re._MAXCACHE + unique_chars = tuple(chr(char_num) for char_num in + range(b'a'[0], b'a'[0]+max_cache)) + re._cache.clear() + for char in unique_chars: + re._compile(char, 0) + self.assertEqual(max_cache, len(re._cache)) + re._cache_repl.clear() + for char in unique_chars: + re._compile_repl(char*2, char) + self.assertEqual(max_cache, len(re._cache_repl)) + + # Overflow both caches and make sure they have extra room left + # afterwards as well as having more than a single entry. + re._compile('A', 0) + self.assertLess(len(re._cache), max_cache) + self.assertGreater(len(re._cache), 1) + re._compile_repl('A', 'A') + self.assertLess(len(re._cache_repl), max_cache) + self.assertGreater(len(re._cache_repl), 1) + + def test_shrink_cache_at_limit(self): + cache = dict(zip(range(6), range(6))) + re._shrink_cache(cache, 6, divisor=3) + self.assertEqual(4, len(cache)) + + def test_shrink_cache_empty(self): + cache = {} + re._shrink_cache(cache, 6, divisor=3) + # Cache was empty, make sure we didn't raise an exception.
+ self.assertEqual(0, len(cache)) + + def test_shrink_cache_overflowing(self): + cache = dict(zip(range(6), range(6))) + re._shrink_cache(cache, 4, divisor=2) + # Cache was larger than the maximum, be sure we shrunk to smaller. + self.assertEqual(2, len(cache)) + + def test_shrink_cache_underflow(self): + cache = dict(zip(range(6), range(6))) + # No shrinking to do. + re._shrink_cache(cache, 9, divisor=3) + self.assertEqual(6, len(cache)) + + def test_main(): run_unittest(ReTests) + run_unittest(ReCacheTests) run_re_tests() if __name__ == "__main__": diff --git a/Misc/NEWS b/Misc/NEWS index 57bcf8ea5f3..00188d4f946 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -697,8 +697,8 @@ Library - The default size of the re module's compiled regular expression cache has been increased from 100 to 500 and the cache replacement policy has changed from - simply clearing the entire cache on overflow to forgetting the least recently - used cached compiled regular expressions. This is a performance win for + simply clearing the entire cache on overflow to randomly forgetting 20% of the + existing cached compiled regular expressions. This is a performance win for applications that use a lot of regular expressions and limits the impact of the performance hit anytime the cache is exceeded.