mirror of https://github.com/python/cpython
bpo-34751: improved hash function for tuples (GH-9471)
commit aeb1be5868
parent 53125a53f4
@@ -62,29 +62,104 @@ class TupleTest(seq_tests.CommonTest):
                 yield i
         self.assertEqual(list(tuple(f())), list(range(1000)))
 
-    def test_hash(self):
-        # See SF bug 942952: Weakness in tuple hash
-        # The hash should:
-        #      be non-commutative
-        #      should spread-out closely spaced values
-        #      should not exhibit cancellation in tuples like (x,(x,y))
-        #      should be distinct from element hashes: hash(x)!=hash((x,))
-        # This test exercises those cases.
-        # For a pure random hash and N=50, the expected number of occupied
-        #      buckets when tossing 252,600 balls into 2**32 buckets
-        #      is 252,592.6, or about 7.4 expected collisions. The
-        #      standard deviation is 2.73. On a box with 64-bit hash
-        #      codes, no collisions are expected. Here we accept no
-        #      more than 15 collisions. Any worse and the hash function
-        #      is sorely suspect.
-
+    # Various tests for hashing of tuples to check that we get few collisions.
+    #
+    # Earlier versions of the tuple hash algorithm had collisions
+    # reported at:
+    # - https://bugs.python.org/issue942952
+    # - https://bugs.python.org/issue34751
+    #
+    # Notes:
+    # - The hash of tuples is deterministic: if the test passes once on a given
+    #   system, it will always pass. So the probabilities mentioned in the
+    #   test_hash functions below should be interpreted assuming that the
+    #   hashes are random.
+    # - Due to the structure in the testsuite inputs, collisions are not
+    #   independent. For example, if hash((a,b)) == hash((c,d)), then also
+    #   hash((a,b,x)) == hash((c,d,x)). But the quoted probabilities assume
+    #   independence anyway.
+    # - We limit the hash to 32 bits in the tests to have a good test on
+    #   64-bit systems too. Furthermore, this is also a sanity check that the
+    #   lower 32 bits of a 64-bit hash are sufficiently random too.
+    def test_hash1(self):
+        # Check for hash collisions between small integers in range(50) and
+        # certain tuples and nested tuples of such integers.
         N=50
         base = list(range(N))
         xp = [(i, j) for i in base for j in base]
         inps = base + [(i, j) for i in base for j in xp] + \
                [(i, j) for i in xp for j in base] + xp + list(zip(base))
-        collisions = len(inps) - len(set(map(hash, inps)))
-        self.assertTrue(collisions <= 15)
+        self.assertEqual(len(inps), 252600)
+        hashes = set(hash(x) % 2**32 for x in inps)
+        collisions = len(inps) - len(hashes)
+
+        # For a pure random 32-bit hash and N = 252,600 test items, the
+        # expected number of collisions equals
+        #
+        #     2**(-32) * N(N-1)/2 = 7.4
+        #
+        # We allow up to 15 collisions, which suffices to make the test
+        # pass with 99.5% confidence.
+        self.assertLessEqual(collisions, 15)
+
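For reference, the expected-collision figure quoted in the comment above follows from the birthday approximation: each of the N(N-1)/2 pairs of items collides in a uniform 32-bit hash with probability 2**(-32). A short Python sketch of the arithmetic (illustrative only, not part of the patch; treating the collision count as Poisson-distributed is an assumption of the sketch):

import math

def expected_collisions(n_items, bits=32):
    # Birthday approximation: N*(N-1)/2 pairs, each colliding with
    # probability 2**-bits.
    return n_items * (n_items - 1) / 2 / 2**bits

def poisson_tail(lam, threshold):
    # P(X > threshold) for X ~ Poisson(lam); rare, roughly independent
    # collisions are approximately Poisson-distributed.
    return 1.0 - sum(math.exp(-lam) * lam**k / math.factorial(k)
                     for k in range(threshold + 1))

lam = expected_collisions(252600)   # ~7.4, as quoted in test_hash1
print(lam)
print(poisson_tail(lam, 15))        # ~0.004, consistent with the quoted 99.5% confidence

The same calculation with N = 345,130 and N = 262,144 reproduces the 13.9 and 8.0 figures quoted in test_hash2 and test_hash3 below.
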
+    def test_hash2(self):
+        # Check for hash collisions between small integers (positive and
+        # negative), tuples and nested tuples of such integers.
+
+        # All numbers in the interval [-n, ..., n] except -1 because
+        # hash(-1) == hash(-2).
+        n = 5
+        A = [x for x in range(-n, n+1) if x != -1]
+
+        B = A + [(a,) for a in A]
+
+        L2 = [(a,b) for a in A for b in A]
+        L3 = L2 + [(a,b,c) for a in A for b in A for c in A]
+        L4 = L3 + [(a,b,c,d) for a in A for b in A for c in A for d in A]
+
+        # T = list of testcases. These consist of all (possibly nested
+        # at most 2 levels deep) tuples containing at most 4 items from
+        # the set A.
+        T = A
+        T += [(a,) for a in B + L4]
+        T += [(a,b) for a in L3 for b in B]
+        T += [(a,b) for a in L2 for b in L2]
+        T += [(a,b) for a in B for b in L3]
+        T += [(a,b,c) for a in B for b in B for c in L2]
+        T += [(a,b,c) for a in B for b in L2 for c in B]
+        T += [(a,b,c) for a in L2 for b in B for c in B]
+        T += [(a,b,c,d) for a in B for b in B for c in B for d in B]
+        self.assertEqual(len(T), 345130)
+        hashes = set(hash(x) % 2**32 for x in T)
+        collisions = len(T) - len(hashes)
+
+        # For a pure random 32-bit hash and N = 345,130 test items, the
+        # expected number of collisions equals
+        #
+        #     2**(-32) * N(N-1)/2 = 13.9
+        #
+        # We allow up to 20 collisions, which suffices to make the test
+        # pass with 95.5% confidence.
+        self.assertLessEqual(collisions, 20)
+
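The size asserted above can be checked from the sizes of the building blocks: with n = 5 we get |A| = 10, |B| = 20, |L2| = 100, |L3| = 1100 and |L4| = 11100. A quick sketch of the arithmetic (illustrative only; the names below stand for the cardinalities of the lists in the test, not the lists themselves):

# Cardinalities of the building blocks in test_hash2 (n = 5, so |A| = 10).
A, B, L2, L3, L4 = 10, 20, 100, 1100, 11100

total = (A              # T = A
         + (B + L4)     # [(a,) for a in B + L4]
         + L3 * B       # [(a,b) for a in L3 for b in B]
         + L2 * L2      # [(a,b) for a in L2 for b in L2]
         + B * L3       # [(a,b) for a in B for b in L3]
         + B * B * L2   # the three (a,b,c) orderings
         + B * L2 * B
         + L2 * B * B
         + B ** 4)      # (a,b,c,d) with all four items from B
print(total)            # 345130, matching the assertEqual in the test
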
+    def test_hash3(self):
+        # Check for hash collisions between tuples containing 0.0 and 0.5.
+        # The hashes of 0.0 and 0.5 themselves differ only in one high bit.
+        # So this implicitly tests propagation of high bits to low bits.
+        from itertools import product
+        T = list(product([0.0, 0.5], repeat=18))
+        self.assertEqual(len(T), 262144)
+        hashes = set(hash(x) % 2**32 for x in T)
+        collisions = len(T) - len(hashes)
+
+        # For a pure random 32-bit hash and N = 262,144 test items, the
+        # expected number of collisions equals
+        #
+        #     2**(-32) * N(N-1)/2 = 8.0
+        #
+        # We allow up to 15 collisions, which suffices to make the test
+        # pass with 99.1% confidence.
+        self.assertLessEqual(collisions, 15)
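To see why the comment in test_hash3 says the two element hashes differ in only one high bit, consider the values on a typical 64-bit CPython build (an illustration; the exact numbers assume the standard float-hash modulus 2**61 - 1):

# hash(0.0) is 0, and hash(0.5) is the modular inverse of 2 modulo
# 2**61 - 1, which is 2**60: a single bit near the top of the hash.
# Good mixing must propagate that difference down into the low 32 bits,
# which is exactly what test_hash3 exercises.
print(hash(0.0))             # 0
print(hash(0.5) == 2 ** 60)  # True on 64-bit builds
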
 
     def test_repr(self):
         l0 = tuple()
@@ -0,0 +1,4 @@
+The hash function for tuples is now based on xxHash
+which gives better collision results on (formerly) pathological cases.
+Additionally, on 64-bit systems it improves tuple hashes in general.
+Patch by Jeroen Demeyer with substantial contributions by Tim Peters.
@@ -333,39 +333,60 @@ error:
     return NULL;
 }
 
-/* The addend 82520, was selected from the range(0, 1000000) for
-   generating the greatest number of prime multipliers for tuples
-   up to length eight:
-
-     1082527, 1165049, 1082531, 1165057, 1247581, 1330103, 1082533,
-     1330111, 1412633, 1165069, 1247599, 1495177, 1577699
+/* Hash for tuples. This is a slightly simplified version of the xxHash
+   non-cryptographic hash:
+   - we do not use any parallelism, there is only 1 accumulator.
+   - we drop the final mixing since this is just a permutation of the
+     output space: it does not help against collisions.
+   - at the end, we mangle the length with a single constant.
+   For the xxHash specification, see
+   https://github.com/Cyan4973/xxHash/blob/master/doc/xxhash_spec.md
 
-   Tests have shown that it's not worth to cache the hash value, see
-   issue #9685.
+   Below are the official constants from the xxHash specification. Optimizing
+   compilers should emit a single "rotate" instruction for the
+   _PyHASH_XXROTATE() expansion. If that doesn't happen for some important
+   platform, the macro could be changed to expand to a platform-specific rotate
+   spelling instead.
 */
-
+#if SIZEOF_PY_UHASH_T > 4
+#define _PyHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL)
+#define _PyHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL)
+#define _PyHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL)
+#define _PyHASH_XXROTATE(x) ((x << 31) | (x >> 33))  /* Rotate left 31 bits */
+#else
+#define _PyHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL)
+#define _PyHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL)
+#define _PyHASH_XXPRIME_5 ((Py_uhash_t)374761393UL)
+#define _PyHASH_XXROTATE(x) ((x << 13) | (x >> 19))  /* Rotate left 13 bits */
+#endif
+
+/* Tests have shown that it's not worth to cache the hash value, see
+   https://bugs.python.org/issue9685 */
 static Py_hash_t
 tuplehash(PyTupleObject *v)
 {
-    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
-    Py_hash_t y;
-    Py_ssize_t len = Py_SIZE(v);
-    PyObject **p;
-    Py_uhash_t mult = _PyHASH_MULTIPLIER;
-    x = 0x345678UL;
-    p = v->ob_item;
-    while (--len >= 0) {
-        y = PyObject_Hash(*p++);
-        if (y == -1)
+    Py_ssize_t i, len = Py_SIZE(v);
+    PyObject **item = v->ob_item;
+
+    Py_uhash_t acc = _PyHASH_XXPRIME_5;
+    for (i = 0; i < len; i++) {
+        Py_uhash_t lane = PyObject_Hash(item[i]);
+        if (lane == (Py_uhash_t)-1) {
             return -1;
-        x = (x ^ y) * mult;
-        /* the cast might truncate len; that doesn't change hash stability */
-        mult += (Py_hash_t)(82520UL + len + len);
-    }
-    x += 97531UL;
-    if (x == (Py_uhash_t)-1)
-        x = -2;
-    return x;
+        }
+        acc += lane * _PyHASH_XXPRIME_2;
+        acc = _PyHASH_XXROTATE(acc);
+        acc *= _PyHASH_XXPRIME_1;
+    }
+
+    /* Add input length, mangled to keep the historical value of hash(()). */
+    acc += len ^ (_PyHASH_XXPRIME_5 ^ 3527539UL);
+
+    if (acc == (Py_uhash_t)-1) {
+        return 1546275796;
+    }
+    return acc;
 }
 
 static Py_ssize_t
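For experimentation outside of C, the new tuplehash above can be modelled in a few lines of Python. This is an illustrative sketch of the 64-bit branch only; the helper names tuple_hash_model and rotate_left are invented here, not part of CPython. On a 64-bit build of a CPython version that includes this change it should agree with hash(t) reduced modulo 2**64, but the authoritative implementation is the C code in this diff:

# Python model of the 64-bit branch of the xxHash-based tuple hash (sketch).
MASK = 2**64 - 1
XXPRIME_1 = 11400714785074694791
XXPRIME_2 = 14029467366897019727
XXPRIME_5 = 2870177450012600261

def rotate_left(x, r):
    # 64-bit rotate left, mirroring _PyHASH_XXROTATE.
    return ((x << r) | (x >> (64 - r))) & MASK

def tuple_hash_model(t):
    acc = XXPRIME_5
    for item in t:
        lane = hash(item) & MASK        # element hash, viewed as unsigned
        acc = (acc + lane * XXPRIME_2) & MASK
        acc = rotate_left(acc, 31)
        acc = (acc * XXPRIME_1) & MASK
    # Mangle in the length; chosen so that hash(()) keeps its historical value.
    acc = (acc + (len(t) ^ (XXPRIME_5 ^ 3527539))) & MASK
    # CPython reinterprets the accumulator as a signed Py_hash_t and maps the
    # forbidden value -1 to 1546275796; mimic that final check here.
    return 1546275796 if acc == MASK else acc

# Example: compare against the interpreter's own tuple hash (unsigned view).
t = (1, 2, (3, 4))
print(tuple_hash_model(t) == (hash(t) & MASK))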