gh-94155: Reduce hash collisions for code objects (#100183)

* Uses a better hashing algorithm to get better dispersion and remove commutativity.

* Incorporates `co_firstlineno`, `Py_SIZE(co)`, and bytecode instructions.

* This is now the entire set of criteria used in `code_richcompare`, except for `_PyCode_ConstantKey` (which would incorporate the types of `co_consts` rather than just their values).
This commit is contained in:
Dennis Sweeney 2022-12-23 13:15:47 -05:00 committed by GitHub
parent 36d358348d
commit a98d9ea56e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 60 additions and 20 deletions

View File

@ -465,6 +465,32 @@ class CodeTest(unittest.TestCase):
self.assertNotEqual(code_b, code_d)
self.assertNotEqual(code_c, code_d)
def test_code_hash_uses_firstlineno(self):
c1 = (lambda: 1).__code__
c2 = (lambda: 1).__code__
self.assertNotEqual(c1, c2)
self.assertNotEqual(hash(c1), hash(c2))
c3 = c1.replace(co_firstlineno=17)
self.assertNotEqual(c1, c3)
self.assertNotEqual(hash(c1), hash(c3))
def test_code_hash_uses_order(self):
# Swapping posonlyargcount and kwonlyargcount should change the hash.
c = (lambda x, y, *, z=1, w=1: 1).__code__
self.assertEqual(c.co_argcount, 2)
self.assertEqual(c.co_posonlyargcount, 0)
self.assertEqual(c.co_kwonlyargcount, 2)
swapped = c.replace(co_posonlyargcount=2, co_kwonlyargcount=0)
self.assertNotEqual(c, swapped)
self.assertNotEqual(hash(c), hash(swapped))
def test_code_hash_uses_bytecode(self):
c = (lambda x, y: x + y).__code__
d = (lambda x, y: x * y).__code__
c1 = c.replace(co_code=d.co_code)
self.assertNotEqual(c, c1)
self.assertNotEqual(hash(c), hash(c1))
def isinterned(s):
return s is sys.intern(('_' + s + '_')[1:-1])

View File

@ -0,0 +1 @@
Improved the hashing algorithm for code objects, mitigating some hash collisions.

View File

@ -1842,28 +1842,41 @@ code_richcompare(PyObject *self, PyObject *other, int op)
static Py_hash_t
code_hash(PyCodeObject *co)
{
Py_hash_t h, h0, h1, h2, h3;
h0 = PyObject_Hash(co->co_name);
if (h0 == -1) return -1;
h1 = PyObject_Hash(co->co_consts);
if (h1 == -1) return -1;
h2 = PyObject_Hash(co->co_names);
if (h2 == -1) return -1;
h3 = PyObject_Hash(co->co_localsplusnames);
if (h3 == -1) return -1;
Py_hash_t h4 = PyObject_Hash(co->co_linetable);
if (h4 == -1) {
return -1;
Py_uhash_t uhash = 20221211;
#define SCRAMBLE_IN(H) do { \
uhash ^= (Py_uhash_t)(H); \
uhash *= _PyHASH_MULTIPLIER; \
} while (0)
#define SCRAMBLE_IN_HASH(EXPR) do { \
Py_hash_t h = PyObject_Hash(EXPR); \
if (h == -1) { \
return -1; \
} \
SCRAMBLE_IN(h); \
} while (0)
SCRAMBLE_IN_HASH(co->co_name);
SCRAMBLE_IN_HASH(co->co_consts);
SCRAMBLE_IN_HASH(co->co_names);
SCRAMBLE_IN_HASH(co->co_localsplusnames);
SCRAMBLE_IN_HASH(co->co_linetable);
SCRAMBLE_IN_HASH(co->co_exceptiontable);
SCRAMBLE_IN(co->co_argcount);
SCRAMBLE_IN(co->co_posonlyargcount);
SCRAMBLE_IN(co->co_kwonlyargcount);
SCRAMBLE_IN(co->co_flags);
SCRAMBLE_IN(co->co_firstlineno);
SCRAMBLE_IN(Py_SIZE(co));
for (int i = 0; i < Py_SIZE(co); i++) {
int deop = _PyOpcode_Deopt[_Py_OPCODE(_PyCode_CODE(co)[i])];
SCRAMBLE_IN(deop);
SCRAMBLE_IN(_Py_OPARG(_PyCode_CODE(co)[i]));
i += _PyOpcode_Caches[deop];
}
Py_hash_t h5 = PyObject_Hash(co->co_exceptiontable);
if (h5 == -1) {
return -1;
if ((Py_hash_t)uhash == -1) {
return -2;
}
h = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^
co->co_argcount ^ co->co_posonlyargcount ^ co->co_kwonlyargcount ^
co->co_flags;
if (h == -1) h = -2;
return h;
return (Py_hash_t)uhash;
}