gh-96019: Fix caching of decompositions in makeunicodedata (GH-96020)

This commit is contained in:
Carl Friedrich Bolz-Tereick 2022-08-19 11:20:44 +02:00 committed by GitHub
parent ee9f22d346
commit 2d9f252c0c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 1243 additions and 1404 deletions

View File

@ -0,0 +1,3 @@
Fix a bug in the ``makeunicodedata.py`` script. The fix saves about 13 KiB
of space in the ``unicodedata`` module, specifically in the character
decomposition data.

2634
Modules/unicodedata_db.h generated

File diff suppressed because it is too large Load Diff

View File

@ -169,6 +169,7 @@ def makeunicodedata(unicode, trace):
# 2) decomposition data
decomp_data_cache = {}
decomp_data = [0]
decomp_prefix = [""]
decomp_index = [0] * len(unicode.chars)
@ -207,12 +208,15 @@ def makeunicodedata(unicode, trace):
comp_first[l] = 1
comp_last[r] = 1
comp_pairs.append((l,r,char))
try:
i = decomp_data.index(decomp)
except ValueError:
key = tuple(decomp)
i = decomp_data_cache.get(key, -1)
if i == -1:
i = len(decomp_data)
decomp_data.extend(decomp)
decomp_size = decomp_size + len(decomp) * 2
decomp_data_cache[key] = i
else:
assert decomp_data[i:i+len(decomp)] == decomp
else:
i = 0
decomp_index[char] = i