* Add short-circuit code for in-place operations with self (such as
  s|=s, s&=s, s-=s, or s^=s).  Add related tests.

* Improve names for several variables and functions.

* Provide alternate table access functions (next, contains, add, and discard)
  that work with an entry argument instead of just a key.  This improves
  set-vs-set operations because we already have a hash value for each key
  and can avoid unnecessary calls to PyObject_Hash().  Provides a 5% to 20%
  speed-up for quick hashing elements like strings and integers.  Provides
  much more substantial improvements for slow hashing elements like tuples
  or objects defining a custom __hash__() function.

* Have difference operations resize() when 1/5 of the elements are dummies.
  Formerly, it was 1/6.  The new ratio triggers less frequently and only
  in cases that it can resize quicker and with greater benefit.  The right
  answer is probably either 1/4, 1/5, or 1/6.  Picked the middle value for
  an even trade-off between resize time and the space/time costs of dummy
  entries.
This commit is contained in:
Raymond Hettinger 2005-08-11 07:58:45 +00:00
parent 9f3ae3e69d
commit c991db240c
2 changed files with 168 additions and 92 deletions

View File

@ -370,6 +370,18 @@ class TestSet(TestJointOps):
else:
self.assert_(c not in self.s)
def test_inplace_on_self(self):
t = self.s.copy()
t |= t
self.assertEqual(t, self.s)
t &= t
self.assertEqual(t, self.s)
t -= t
self.assertEqual(t, self.thetype())
t = self.s.copy()
t ^= t
self.assertEqual(t, self.thetype())
def test_weakref(self):
s = self.thetype('gallahad')
p = proxy(s)

View File

@ -1,3 +1,4 @@
/* set object implementation
Written and maintained by Raymond D. Hettinger <python@rcn.com>
Derived from Lib/sets.py and Objects/dictobject.c.
@ -226,7 +227,6 @@ set_insert_key(register PySetObject *so, PyObject *key, long hash)
typedef setentry *(*lookupfunc)(PySetObject *, PyObject *, long);
assert(so->lookup != NULL);
entry = so->lookup(so, key, hash);
if (entry->key == NULL) {
/* UNUSED */
@ -336,18 +336,30 @@ set_table_resize(PySetObject *so, int minused)
return 0;
}
/* CAUTION: set_add_internal() must guarantee that it won't resize the table */
/* CAUTION: set_add_key/entry() must guarantee it won't resize the table */
/* Add the key held in an existing entry to `so`, passing entry->hash along
   so the key does not have to be hashed again.  Returns 0 when no resize is
   triggered, otherwise the result of set_table_resize().
   NOTE(review): this span is a diff rendering -- the set_add_internal
   signature line looks like the superseded header and set_add_entry its
   replacement; confirm before compiling. */
static int
set_add_internal(register PySetObject *so, PyObject *key)
set_add_entry(register PySetObject *so, setentry *entry)
{
register int n_used;
assert(so->fill <= so->mask); /* at least one empty slot */
n_used = so->used;
/* Take a new reference to the key before handing it to set_insert_key(). */
Py_INCREF(entry->key);
set_insert_key(so, entry->key, entry->hash);
/* Resize only when the insert increased `used` and fill has reached 2/3 of
   the table size (mask+1). */
if (!(so->used > n_used && so->fill*3 >= (so->mask+1)*2))
return 0;
/* Requested size: 2x used above 50000 elements, otherwise 4x used. */
return set_table_resize(so, so->used>50000 ? so->used*2 : so->used*4);
}
static int
set_add_key(register PySetObject *so, PyObject *key)
{
register long hash;
register int n_used;
if (PyString_CheckExact(key)) {
hash = ((PyStringObject *)key)->ob_shash;
if (hash == -1)
hash = PyObject_Hash(key);
} else {
if (!PyString_CheckExact(key) ||
(hash = ((PyStringObject *) key)->ob_shash) == -1) {
hash = PyObject_Hash(key);
if (hash == -1)
return -1;
@ -365,7 +377,23 @@ set_add_internal(register PySetObject *so, PyObject *key)
#define DISCARD_FOUND 1
/* Remove from `so` the key held in `oldentry`, looking it up with the hash
   already stored in that entry.  Returns DISCARD_NOTFOUND when the slot found
   is unused or holds the dummy marker, DISCARD_FOUND after a removal.
   NOTE(review): this span is a diff rendering -- the set_discard_internal
   signature line looks like the superseded header and set_discard_entry its
   replacement; confirm before compiling. */
static int
set_discard_internal(PySetObject *so, PyObject *key)
set_discard_entry(PySetObject *so, setentry *oldentry)
{ register setentry *entry;
PyObject *old_key;
entry = (so->lookup)(so, oldentry->key, oldentry->hash);
if (entry->key == NULL || entry->key == dummy)
return DISCARD_NOTFOUND;
old_key = entry->key;
/* The slot is overwritten with a new reference to dummy rather than
   being reset to NULL. */
Py_INCREF(dummy);
entry->key = dummy;
so->used--;
/* Release the removed key only after the slot and count are updated. */
Py_DECREF(old_key);
return DISCARD_FOUND;
}
static int
set_discard_key(PySetObject *so, PyObject *key)
{
register long hash;
register setentry *entry;
@ -457,39 +485,39 @@ set_clear_internal(PySetObject *so)
* Iterate over a set table. Use like so:
*
* int pos;
* PyObject *key;
* setentry *entry;
* pos = 0; # important! pos should not otherwise be changed by you
* while (set_next_internal(yourset, &pos, &key)) {
* Refer to borrowed reference in key.
* while (set_next(yourset, &pos, &entry)) {
* Refer to borrowed reference in entry->key.
* }
*
* CAUTION: In general, it isn't safe to use set_next_internal in a loop that
* CAUTION: In general, it isn't safe to use set_next in a loop that
* mutates the table.
*/
/* Scan forward from the caller's position to the next slot whose key is
   neither NULL nor dummy.  Returns 0 if the incoming position is negative or
   the scan runs past so->mask; otherwise returns 1.  Once the scan finishes,
   the position is stored back as i+1.
   NOTE(review): this span is a diff rendering -- adjacent lines that differ
   only in naming (pos/pos_ptr, entry/table, key/entry_ptr) look like the
   superseded and replacement versions of the same statement. */
static int
set_next_internal(PySetObject *so, int *pos, PyObject **key)
set_next(PySetObject *so, int *pos_ptr, setentry **entry_ptr)
{
register int i, mask;
register setentry *entry;
register setentry *table;
assert (PyAnySet_Check(so));
i = *pos;
i = *pos_ptr;
if (i < 0)
return 0;
entry = so->table;
table = so->table;
mask = so->mask;
while (i <= mask && (entry[i].key == NULL || entry[i].key == dummy))
while (i <= mask && (table[i].key == NULL || table[i].key == dummy))
i++;
*pos = i+1;
*pos_ptr = i+1;
if (i > mask)
return 0;
if (key)
*key = entry[i].key;
/* NOTE(review): the statement above guarded on the out-pointer (`key`).
   The one below tests table[i].key, which the scan loop has already shown
   to be non-NULL, so it gives no protection against a NULL entry_ptr.
   Confirm whether `if (entry_ptr)` was intended. */
if (table[i].key)
*entry_ptr = &table[i];
return 1;
}
static int
set_merge_internal(PySetObject *so, PyObject *otherset)
set_merge(PySetObject *so, PyObject *otherset)
{
PySetObject *other;
register int i;
@ -525,7 +553,7 @@ set_merge_internal(PySetObject *so, PyObject *otherset)
}
static int
set_contains_internal(PySetObject *so, PyObject *key)
set_contains_key(PySetObject *so, PyObject *key)
{
long hash;
@ -539,6 +567,15 @@ set_contains_internal(PySetObject *so, PyObject *key)
return key != NULL && key != dummy;
}
/* Report whether the key held in `entry` is present in `so`.
   The lookup is given entry->hash, so the key is not hashed again here.
   Returns 1 when the slot found holds a live key, 0 when the slot is
   unused (NULL) or holds the dummy marker. */
static int
set_contains_entry(PySetObject *so, setentry *entry)
{
    setentry *slot = (so->lookup)(so, entry->key, entry->hash);
    PyObject *found = slot->key;

    if (found == NULL || found == dummy)
        return 0;
    return 1;
}
/***** Set iterator type ***********************************************/
static PyTypeObject PySetIter_Type; /* Forward */
@ -667,13 +704,13 @@ set_update_internal(PySetObject *so, PyObject *other)
PyObject *key, *it;
if (PyAnySet_Check(other))
return set_merge_internal(so, other);
return set_merge(so, other);
if (PyDict_Check(other)) {
PyObject *key, *value;
int pos = 0;
while (PyDict_Next(other, &pos, &key, &value)) {
if (set_add_internal(so, key) == -1)
if (set_add_key(so, key) == -1)
return -1;
}
return 0;
@ -684,7 +721,7 @@ set_update_internal(PySetObject *so, PyObject *other)
return -1;
while ((key = PyIter_Next(it)) != NULL) {
if (set_add_internal(so, key) == -1) {
if (set_add_key(so, key) == -1) {
Py_DECREF(it);
Py_DECREF(key);
return -1;
@ -833,10 +870,10 @@ static int
set_traverse(PySetObject *so, visitproc visit, void *arg)
{
/* Apply Py_VISIT to the key of every entry the table iterator yields, then
   return 0.
   NOTE(review): this span is a diff rendering -- the `key` declaration and
   the set_next_internal loop look like the superseded version, the `entry`
   declaration and the set_next loop like the replacement; confirm before
   compiling. */
int pos = 0;
PyObject *key;
setentry *entry;
while (set_next_internal(so, &pos, &key))
Py_VISIT(key);
while (set_next(so, &pos, &entry))
Py_VISIT(entry->key);
return 0;
}
@ -897,14 +934,14 @@ set_contains(PySetObject *so, PyObject *key)
PyObject *tmpkey;
int result;
result = set_contains_internal(so, key);
result = set_contains_key(so, key);
if (result == -1 && PyAnySet_Check(key)) {
PyErr_Clear();
tmpkey = make_new_set(&PyFrozenSet_Type, NULL);
if (tmpkey == NULL)
return -1;
set_swap_bodies((PySetObject *)tmpkey, (PySetObject *)key);
result = set_contains_internal(so, tmpkey);
result = set_contains_key(so, tmpkey);
set_swap_bodies((PySetObject *)tmpkey, (PySetObject *)key);
Py_DECREF(tmpkey);
}
@ -942,6 +979,15 @@ frozenset_copy(PySetObject *so)
PyDoc_STRVAR(copy_doc, "Return a shallow copy of a set.");
/* Method wrapper: empty the set through set_clear_internal() and return
   None.  The helper's return value is deliberately not examined. */
static PyObject *
set_clear(PySetObject *so)
{
    (void)set_clear_internal(so);

    Py_RETURN_NONE;
}
PyDoc_STRVAR(clear_doc, "Remove all elements from this set.");
static PyObject *
set_union(PySetObject *so, PyObject *other)
{
@ -991,6 +1037,11 @@ set_intersection(PySetObject *so, PyObject *other)
PySetObject *result;
PyObject *key, *it, *tmp;
if ((PyObject *)so == other) {
Py_INCREF(other);
return other;
}
result = (PySetObject *)make_new_set(so->ob_type, NULL);
if (result == NULL)
return NULL;
@ -1001,11 +1052,12 @@ set_intersection(PySetObject *so, PyObject *other)
other = tmp;
}
if (PyAnySet_Check(other)) {
if (PyAnySet_Check(other)) {
int pos = 0;
while (set_next_internal((PySetObject *)other, &pos, &key)) {
if (set_contains_internal(so, key)) {
if (set_add_internal(result, key) == -1) {
setentry *entry;
while (set_next((PySetObject *)other, &pos, &entry)) {
if (set_contains_entry(so, entry)) {
if (set_add_entry(result, entry) == -1) {
Py_DECREF(result);
return NULL;
}
@ -1021,8 +1073,8 @@ set_intersection(PySetObject *so, PyObject *other)
}
while ((key = PyIter_Next(it)) != NULL) {
if (set_contains_internal(so, key)) {
if (set_add_internal(result, key) == -1) {
if (set_contains_key(so, key)) {
if (set_add_key(result, key) == -1) {
Py_DECREF(it);
Py_DECREF(result);
Py_DECREF(key);
@ -1087,32 +1139,48 @@ set_iand(PySetObject *so, PyObject *other)
return (PyObject *)so;
}
/* Remove from `so` every element that also occurs in `other`.
 *
 * - If `other` is `so` itself, the set is cleared and the result of
 *   set_clear_internal() is returned.
 * - If `other` passes PyAnySet_Check(), its entries are walked with
 *   set_next() and discarded via set_discard_entry(), which reuses the hash
 *   stored in each entry.
 * - Otherwise `other` is iterated and each key goes through
 *   set_discard_key().
 *
 * Returns -1 if the iterator cannot be obtained, if set_discard_key()
 * reports -1, or if PyErr_Occurred() is true after iteration.  Otherwise
 * returns 0, or the result of set_table_resize() when a resize is triggered.
 *
 * Declared static like the other table helpers in this file: it is an
 * implementation detail and should not be exported from the object file.
 */
static int
set_difference_update_internal(PySetObject *so, PyObject *other)
{
    if ((PyObject *)so == other)
        return set_clear_internal(so);

    if (PyAnySet_Check(other)) {
        setentry *entry;
        int pos = 0;

        /* The return value is ignored: a missing key is not an error here. */
        while (set_next((PySetObject *)other, &pos, &entry))
            set_discard_entry(so, entry);
    } else {
        PyObject *key, *it;

        it = PyObject_GetIter(other);
        if (it == NULL)
            return -1;

        while ((key = PyIter_Next(it)) != NULL) {
            if (set_discard_key(so, key) == -1) {
                Py_DECREF(it);
                Py_DECREF(key);
                return -1;
            }
            Py_DECREF(key);
        }
        Py_DECREF(it);
        /* PyIter_Next() returns NULL both at exhaustion and on error. */
        if (PyErr_Occurred())
            return -1;
    }

    /* If more than 1/5 are dummies, then resize them away. */
    if ((so->fill - so->used) * 5 < so->mask)
        return 0;
    return set_table_resize(so, so->used>50000 ? so->used*2 : so->used*4);
}
/* Method wrapper for removing the elements of `other` from `so`; returns
   None on success and NULL on failure.
   NOTE(review): this span is a diff rendering that interleaves two bodies.
   The inline PyObject_GetIter/PyIter_Next loop, the 1/6 dummy test and the
   direct set_table_resize() call look like the superseded implementation.
   The replacement appears to be only: call
   set_difference_update_internal(so, other), return None unless it
   reported -1, else return NULL.  Confirm before compiling. */
static PyObject *
set_difference_update(PySetObject *so, PyObject *other)
{
PyObject *key, *it;
it = PyObject_GetIter(other);
if (it == NULL)
return NULL;
while ((key = PyIter_Next(it)) != NULL) {
if (set_discard_internal(so, key) == -1) {
Py_DECREF(it);
Py_DECREF(key);
return NULL;
}
Py_DECREF(key);
}
Py_DECREF(it);
if (PyErr_Occurred())
return NULL;
/* If more than 1/6 are dummies, then resize them away. */
if ((so->fill - so->used) * 6 < so->mask)
if (set_difference_update_internal(so, other) != -1)
Py_RETURN_NONE;
if (set_table_resize(so, so->used>50000 ? so->used*2 : so->used*4) == -1)
return NULL;
Py_RETURN_NONE;
return NULL;
}
PyDoc_STRVAR(difference_update_doc,
@ -1121,18 +1189,16 @@ PyDoc_STRVAR(difference_update_doc,
static PyObject *
set_difference(PySetObject *so, PyObject *other)
{
PyObject *tmp, *key, *result;
PyObject *result;
setentry *entry;
int pos = 0;
if (!PyAnySet_Check(other) && !PyDict_Check(other)) {
result = set_copy(so);
if (result == NULL)
return NULL;
if (set_difference_update_internal((PySetObject *)result, other) != -1)
return result;
tmp = set_difference_update((PySetObject *)result, other);
if (tmp != NULL) {
Py_DECREF(tmp);
return result;
}
Py_DECREF(result);
return NULL;
}
@ -1142,18 +1208,21 @@ set_difference(PySetObject *so, PyObject *other)
return NULL;
if (PyDict_Check(other)) {
while (set_next_internal(so, &pos, &key)) {
if (!PyDict_Contains(other, key)) {
if (set_add_internal((PySetObject *)result, key) == -1)
while (set_next(so, &pos, &entry)) {
setentry entrycopy;
entrycopy.hash = entry->hash;
entrycopy.key = entry->key;
if (!PyDict_Contains(other, entry->key)) {
if (set_add_entry((PySetObject *)result, &entrycopy) == -1)
return NULL;
}
}
return result;
}
while (set_next_internal(so, &pos, &key)) {
if (!set_contains_internal((PySetObject *)other, key)) {
if (set_add_internal((PySetObject *)result, key) == -1)
while (set_next(so, &pos, &entry)) {
if (!set_contains_entry((PySetObject *)other, entry)) {
if (set_add_entry((PySetObject *)result, entry) == -1)
return NULL;
}
}
@ -1197,16 +1266,20 @@ set_symmetric_difference_update(PySetObject *so, PyObject *other)
PySetObject *otherset;
PyObject *key;
int pos = 0;
setentry *entry;
if ((PyObject *)so == other)
return set_clear(so);
if (PyDict_Check(other)) {
PyObject *value;
int rv;
while (PyDict_Next(other, &pos, &key, &value)) {
rv = set_discard_internal(so, key);
rv = set_discard_key(so, key);
if (rv == -1)
return NULL;
if (rv == DISCARD_NOTFOUND) {
if (set_add_internal(so, key) == -1)
if (set_add_key(so, key) == -1)
return NULL;
}
}
@ -1222,14 +1295,14 @@ set_symmetric_difference_update(PySetObject *so, PyObject *other)
return NULL;
}
while (set_next_internal(otherset, &pos, &key)) {
int rv = set_discard_internal(so, key);
while (set_next(otherset, &pos, &entry)) {
int rv = set_discard_entry(so, entry);
if (rv == -1) {
Py_XDECREF(otherset);
return NULL;
}
if (rv == DISCARD_NOTFOUND) {
if (set_add_internal(so, key) == -1) {
if (set_add_entry(so, entry) == -1) {
Py_XDECREF(otherset);
return NULL;
}
@ -1312,7 +1385,7 @@ set_issubset(PySetObject *so, PyObject *other)
for (i=so->used ; i ; entry++, i--) {
while (entry->key == NULL || entry->key==dummy)
entry++;
if (!set_contains_internal((PySetObject *)other, entry->key))
if (!set_contains_entry((PySetObject *)other, entry))
Py_RETURN_FALSE;
}
Py_RETURN_TRUE;
@ -1448,35 +1521,26 @@ set_repr(PySetObject *so)
/* Write the set to `fp` as "<type name>([k1, k2, ...])", printing each key
   with PyObject_Print().  Returns 0 on success, or -1 as soon as
   PyObject_Print() reports a non-zero result.  `flags` is not used in the
   lines shown.
   NOTE(review): this span is a diff rendering -- the lines using `key` and
   set_next_internal look like the superseded version, the lines using
   `entry` and set_next like the replacement; confirm before compiling. */
static int
set_tp_print(PySetObject *so, FILE *fp, int flags)
{
PyObject *key;
setentry *entry;
int pos=0;
char *emit = ""; /* No separator emitted on first pass */
char *separator = ", ";
fprintf(fp, "%s([", so->ob_type->tp_name);
while (set_next_internal(so, &pos, &key)) {
while (set_next(so, &pos, &entry)) {
fputs(emit, fp);
/* From the second element on, emit ", " before each key. */
emit = separator;
if (PyObject_Print(key, fp, 0) != 0)
if (PyObject_Print(entry->key, fp, 0) != 0)
return -1;
}
fputs("])", fp);
return 0;
}
/* Method wrapper: empty the set through set_clear_internal() and return
   None.  The helper's return value is not examined. */
static PyObject *
set_clear(PySetObject *so)
{
set_clear_internal(so);
Py_RETURN_NONE;
}
PyDoc_STRVAR(clear_doc, "Remove all elements from this set.");
/* Method wrapper: add `key` to the set.  Returns None on success and NULL
   when the underlying add helper reports -1.
   NOTE(review): this span is a diff rendering -- the set_add_internal test
   looks like the superseded line and the set_add_key test its replacement;
   confirm before compiling. */
static PyObject *
set_add(PySetObject *so, PyObject *key)
{
if (set_add_internal(so, key) == -1)
if (set_add_key(so, key) == -1)
return NULL;
Py_RETURN_NONE;
}
@ -1503,7 +1567,7 @@ set_remove(PySetObject *so, PyObject *key)
return result;
}
rv = set_discard_internal(so, key);
rv = set_discard_key(so, key);
if (rv == -1)
return NULL;
else if (rv == DISCARD_NOTFOUND) {
@ -1534,7 +1598,7 @@ set_discard(PySetObject *so, PyObject *key)
return result;
}
if (set_discard_internal(so, key) == -1)
if (set_discard_key(so, key) == -1)
return NULL;
Py_RETURN_NONE;
}