Fiddled w/ /F's cool new splitbins function: documented it, generalized it

a bit, sped it a lot primarily by removing the unused assumption that None was
a legit bin entry (the function doesn't really need to assume that there's
anything special about 0), added an optional "trace" argument, and in __debug__
mode added exhaustive verification that the decomposition is both correct and
doesn't overstep any array bounds (which wasn't obvious to me from staring at the
generated C code -- now I feel safe!).  Did not commit a new unicodedata_db.h, as
the one produced by this version is identical to the one already checked in.
This commit is contained in:
Tim Peters 2000-09-25 07:13:41 +00:00
parent 68ded6e6f1
commit 2101348830
1 changed files with 54 additions and 26 deletions

View File

@ -165,38 +165,66 @@ def getsize(data):
else:
return 4
def splitbins(bins):
# split a sparse integer table into two tables, such as:
# value = t2[(t1[char>>shift]<<shift)+(char&mask)]
# and value == 0 means no data
bytes = sys.maxint
for shift in range(16):
bin1 = []
bin2 = []
def splitbins(t, trace=0):
"""t, trace=0 -> (t1, t2, shift). Split a table to save space.
t is a sequence of ints. This function can be useful to save space if
many of the ints are the same. t1 and t2 are lists of ints, and shift
is an int, chosen to minimize the combined size of t1 and t2 (in C
code), and where for each i in range(len(t)),
t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
where mask is a bitmask isolating the last "shift" bits.
If optional arg trace is true (default false), progress info is
printed to sys.stderr.
"""
import sys
if trace:
def dump(t1, t2, shift, bytes):
print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
len(t1), len(t2), shift, bytes)
print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
"bytes"
n = len(t)-1 # last valid index
maxshift = 0 # the most we can shift n and still have something left
if n > 0:
while n >> 1:
n >>= 1
maxshift += 1
del n
bytes = sys.maxint # smallest total size so far
t = tuple(t) # so slices can be dict keys
for shift in range(maxshift + 1):
t1 = []
t2 = []
size = 2**shift
bincache = {}
for i in range(0, len(bins), size):
bin = bins[i:i+size]
index = bincache.get(tuple(bin))
for i in range(0, len(t), size):
bin = t[i:i+size]
index = bincache.get(bin)
if index is None:
index = len(bin2)
bincache[tuple(bin)] = index
for v in bin:
if v is None:
bin2.append(0)
else:
bin2.append(v)
bin1.append(index>>shift)
index = len(t2)
bincache[bin] = index
t2.extend(bin)
t1.append(index >> shift)
# determine memory size
b = len(bin1)*getsize(bin1) + len(bin2)*getsize(bin2)
b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
if trace:
dump(t1, t2, shift, b)
if b < bytes:
best = shift, bin1, bin2
best = t1, t2, shift
bytes = b
shift, bin1, bin2 = best
## print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
## len(bin1), len(bin2), shift, bytes
## )
return bin1, bin2, shift
t1, t2, shift = best
if trace:
print >>sys.stderr, "Best:",
dump(t1, t2, shift, bytes)
if __debug__:
# exhaustively verify that the decomposition is correct
mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
for i in xrange(len(t)):
assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
return best
if __name__ == "__main__":
maketable()