diff --git a/Lib/re.py b/Lib/re.py
index 9bd913a68dd..2f1a76eaa52 100644
--- a/Lib/re.py
+++ b/Lib/re.py
@@ -254,7 +254,40 @@ _cache_repl = {}
 
 _pattern_type = type(sre_compile.compile("", 0))
 
-_MAXCACHE = 100
+_MAXCACHE = 500
+
+def _shrink_cache(cache_dict, max_length, divisor=5):
+    """Make room in the given cache.
+
+    Args:
+      cache_dict: The cache dictionary to modify.
+      max_length: Maximum # of entries in cache_dict before it is shrunk.
+      divisor: Cache will shrink to max_length - 1/divisor*max_length items.
+    """
+    # Toss out a fraction of the entries at random to make room for new ones.
+    # A random algorithm was chosen as opposed to simply cache_dict.popitem()
+    # as popitem could penalize the same regular expression repeatedly based
+    # on its internal hash value.  Being random should spread the cache miss
+    # love around.
+    cache_keys = tuple(cache_dict.keys())
+    overage = len(cache_keys) - max_length
+    if overage < 0:
+        # Cache is already within limits.  Normally this should not happen
+        # but it could due to multithreading.
+        return
+    number_to_toss = max_length // divisor + overage
+    # The import is done here to avoid a circular dependency.
+    import random
+    if not hasattr(random, 'sample'):
+        # Do nothing while resolving the circular dependency:
+        #  re->random->warnings->tokenize->string->re
+        return
+    for doomed_key in random.sample(cache_keys, number_to_toss):
+        try:
+            del cache_dict[doomed_key]
+        except KeyError:
+            # Ignore problems if the cache changed from another thread.
+            pass
 
 def _compile(*key):
     # internal: compile pattern
@@ -272,7 +305,7 @@ def _compile(*key):
         raise TypeError("first argument must be string or compiled pattern")
     p = sre_compile.compile(pattern, flags)
     if len(_cache) >= _MAXCACHE:
-        _cache.clear()
+        _shrink_cache(_cache, _MAXCACHE)
     _cache[cachekey] = p
     return p
 
@@ -284,7 +317,7 @@ def _compile_repl(*key):
     repl, pattern = key
     p = sre_parse.parse_template(repl, pattern)
    if len(_cache_repl) >= _MAXCACHE:
-        _cache_repl.clear()
+        _shrink_cache(_cache_repl, _MAXCACHE)
     _cache_repl[key] = p
     return p
 
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 7b0a8dd419c..6b11685e9c8 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -874,8 +874,71 @@ def run_re_tests():
                 if result is None:
                     print('=== Fails on unicode-sensitive match', t)
 
+
+class ReCacheTests(unittest.TestCase):
+    """These tests are specific to the re._shrink_cache implementation."""
+
+    def setUp(self):
+        self._orig_maxcache = re._MAXCACHE
+
+    def tearDown(self):
+        re._MAXCACHE = self._orig_maxcache
+
+    def test_compile_cache_overflow(self):
+        # NOTE: If a profiler or debugger is tracing code and compiling
+        # regular expressions while tracing through this test... expect
+        # the test to fail.  This test is not concurrency safe.
+
+        # Explicitly fill the caches.
+        re._MAXCACHE = 20
+        max_cache = re._MAXCACHE
+        unique_chars = tuple(chr(char_num) for char_num in
+                             range(b'a'[0], b'a'[0]+max_cache))
+        re._cache.clear()
+        for char in unique_chars:
+            re._compile(char, 0)
+        self.assertEqual(max_cache, len(re._cache))
+        re._cache_repl.clear()
+        for char in unique_chars:
+            re._compile_repl(char*2, char)
+        self.assertEqual(max_cache, len(re._cache_repl))
+
+        # Overflow both caches and make sure they have extra room left
+        # afterwards as well as having more than a single entry.
+        re._compile('A', 0)
+        self.assertLess(len(re._cache), max_cache)
+        self.assertGreater(len(re._cache), 1)
+        re._compile_repl('A', 'A')
+        self.assertLess(len(re._cache_repl), max_cache)
+        self.assertGreater(len(re._cache_repl), 1)
+
+    def test_shrink_cache_at_limit(self):
+        cache = dict(zip(range(6), range(6)))
+        re._shrink_cache(cache, 6, divisor=3)
+        self.assertEqual(4, len(cache))
+
+    def test_shrink_cache_empty(self):
+        cache = {}
+        re._shrink_cache(cache, 6, divisor=3)
+        # Cache was empty; make sure we didn't raise an exception.
+        self.assertEqual(0, len(cache))
+
+    def test_shrink_cache_overflowing(self):
+        cache = dict(zip(range(6), range(6)))
+        re._shrink_cache(cache, 4, divisor=2)
+        # Cache was larger than the maximum; make sure it shrank below it.
+        self.assertEqual(2, len(cache))
+
+    def test_shrink_cache_underflow(self):
+        cache = dict(zip(range(6), range(6)))
+        # No shrinking to do.
+        re._shrink_cache(cache, 9, divisor=3)
+        self.assertEqual(6, len(cache))
+
+
 def test_main():
     run_unittest(ReTests)
+    run_unittest(ReCacheTests)
     run_re_tests()
 
 if __name__ == "__main__":
diff --git a/Misc/NEWS b/Misc/NEWS
index 70a646f65d9..a6a7c20f26a 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -473,6 +473,13 @@ C-API
 Library
 -------
 
+- The default size of the re module's compiled regular expression cache has
+  been increased from 100 to 500 and the cache replacement policy has changed
+  from simply clearing the entire cache on overflow to randomly forgetting 20%
+  of the existing cached compiled regular expressions.  This is a performance
+  win for applications that use many regular expressions, and it limits the
+  performance hit whenever the cache overflows.
+
 - Issue #7113: Speed up loading in configparser.  Patch by Ɓukasz Langa.
 
 - Issue #9032: XML-RPC client retries the request on EPIPE error. The EPIPE
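
Note (not part of the patch): a minimal sketch of the new eviction arithmetic, calling the added re._shrink_cache helper directly, much as the new tests do. It assumes a Python with this patch applied (so re._MAXCACHE is 500 and the helper exists); the cache contents below are made-up placeholder entries, not real compiled-pattern keys.

    import re

    # Build a fake cache that is one entry over the new 500-entry limit.
    cache = {('fake pattern %d' % n, 0): None for n in range(re._MAXCACHE + 1)}
    assert len(cache) == 501

    # With the default divisor of 5: overage = 501 - 500 = 1, and
    # number_to_toss = 500 // 5 + 1 = 101, so 400 entries survive
    # instead of the cache being wiped clean.
    re._shrink_cache(cache, re._MAXCACHE)
    print(len(cache))  # -> 400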