gh-112529: Make the GC scheduling thread-safe (#114880)

The GC keeps track of the number of allocations (less deallocations) since the last GC. This buffers the count in thread-local state and uses atomic operations to modify the per-interpreter count. The thread-local buffering avoids contention on shared state. A consequence is that the GC scheduling is not as precise, so "test_sneaky_frame_object" is skipped because it requires that the GC be run exactly after allocating a frame object.
2024-02-16 11:22:27 -05:00 · 2024-02-16 11:22:27 -05:00 · b24c9161a6
parent f92857a930
commit b24c9161a6
7 changed files with 71 additions and 16 deletions
--- a/Include/internal/pycore_gc.h
+++ b/Include/internal/pycore_gc.h
@ -260,6 +260,13 @@ struct _gc_runtime_state {
    Py_ssize_t long_lived_pending;
 };

+#ifdef Py_GIL_DISABLED
+struct _gc_thread_state {
+    /* Thread-local allocation count. */
+    Py_ssize_t alloc_count;
+};
+#endif
+

 extern void _PyGC_InitState(struct _gc_runtime_state *);

--- a/Include/internal/pycore_tstate.h
+++ b/Include/internal/pycore_tstate.h
@ -28,6 +28,7 @@ typedef struct _PyThreadStateImpl {
    PyThreadState base;

 #ifdef Py_GIL_DISABLED
+    struct _gc_thread_state gc;
    struct _mimalloc_thread_state mimalloc;
    struct _Py_object_freelists freelists;
    struct _brc_thread_state brc;
--- a/Lib/test/test_frame.py
+++ b/Lib/test/test_frame.py
@ -13,7 +13,7 @@ except ImportError:
    _testcapi = None

 from test import support
-from test.support import threading_helper
+from test.support import threading_helper, Py_GIL_DISABLED
 from test.support.script_helper import assert_python_ok


@ -294,6 +294,7 @@ class TestIncompleteFrameAreInvisible(unittest.TestCase):
        assert_python_ok("-c", code)

    @support.cpython_only
+    @unittest.skipIf(Py_GIL_DISABLED, "test requires precise GC scheduling")
    def test_sneaky_frame_object(self):

        def trace(frame, event, arg):
--- a/Lib/test/test_gc.py
+++ b/Lib/test/test_gc.py
@ -363,6 +363,7 @@ class GCTests(unittest.TestCase):
    # To minimize variations, though, we first store the get_count() results
    # and check them at the end.
    @refcount_test
+    @unittest.skipIf(Py_GIL_DISABLED, 'needs precise allocation counts')
    def test_get_count(self):
        gc.collect()
        a, b, c = gc.get_count()
--- a/Modules/gcmodule.c
+++ b/Modules/gcmodule.c
@ -201,6 +201,16 @@ gc_get_count_impl(PyObject *module)
 /*[clinic end generated code: output=354012e67b16398f input=a392794a08251751]*/
 {
    GCState *gcstate = get_gc_state();
+
+#ifdef Py_GIL_DISABLED
+    _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET();
+    struct _gc_thread_state *gc = &tstate->gc;
+
+    // Flush the local allocation count to the global count
+    _Py_atomic_add_int(&gcstate->generations[0].count, (int)gc->alloc_count);
+    gc->alloc_count = 0;
+#endif
+
    return Py_BuildValue("(iii)",
                         gcstate->generations[0].count,
                         gcstate->generations[1].count,
--- a/Objects/typeobject.c
+++ b/Objects/typeobject.c
@ -1835,6 +1835,8 @@ _PyType_AllocNoTrack(PyTypeObject *type, Py_ssize_t nitems)
    if (presize) {
        ((PyObject **)alloc)[0] = NULL;
        ((PyObject **)alloc)[1] = NULL;
+    }
+    if (PyType_IS_GC(type)) {
        _PyObject_GC_Link(obj);
    }
    memset(obj, '\0', size);
--- a/Python/gc_free_threading.c
+++ b/Python/gc_free_threading.c
@ -23,6 +23,11 @@ typedef struct _gc_runtime_state GCState;
 #  define GC_DEBUG
 #endif

+// Each thread buffers the count of allocated objects in a thread-local
+// variable up to +/- this amount to reduce the overhead of updating
+// the global count.
+#define LOCAL_ALLOC_COUNT_THRESHOLD 512
+
 // Automatically choose the generation that needs collecting.
 #define GENERATION_AUTO (-1)

@ -959,6 +964,41 @@ gc_should_collect(GCState *gcstate)
            gcstate->generations[1].threshold == 0);
 }

+static void
+record_allocation(PyThreadState *tstate)
+{
+    struct _gc_thread_state *gc = &((_PyThreadStateImpl *)tstate)->gc;
+
+    // We buffer the allocation count to avoid the overhead of atomic
+    // operations for every allocation.
+    gc->alloc_count++;
+    if (gc->alloc_count >= LOCAL_ALLOC_COUNT_THRESHOLD) {
+        // TODO: Use Py_ssize_t for the generation count.
+        GCState *gcstate = &tstate->interp->gc;
+        _Py_atomic_add_int(&gcstate->generations[0].count, (int)gc->alloc_count);
+        gc->alloc_count = 0;
+
+        if (gc_should_collect(gcstate) &&
+            !_Py_atomic_load_int_relaxed(&gcstate->collecting))
+        {
+            _Py_ScheduleGC(tstate->interp);
+        }
+    }
+}
+
+static void
+record_deallocation(PyThreadState *tstate)
+{
+    struct _gc_thread_state *gc = &((_PyThreadStateImpl *)tstate)->gc;
+
+    gc->alloc_count--;
+    if (gc->alloc_count <= -LOCAL_ALLOC_COUNT_THRESHOLD) {
+        GCState *gcstate = &tstate->interp->gc;
+        _Py_atomic_add_int(&gcstate->generations[0].count, (int)gc->alloc_count);
+        gc->alloc_count = 0;
+    }
+}
+
 static void
 gc_collect_internal(PyInterpreterState *interp, struct collection_state *state)
 {
@ -981,6 +1021,9 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state)
        }
    }

+    // Record the number of live GC objects
+    interp->gc.long_lived_total = state->long_lived_total;
+
    // Clear weakrefs and enqueue callbacks (but do not call them).
    clear_weakrefs(state);
    _PyEval_StartTheWorld(interp);
@ -1090,7 +1133,6 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason)

    m = state.collected;
    n = state.uncollectable;
-    gcstate->long_lived_total = state.long_lived_total;

    if (gcstate->debug & _PyGC_DEBUG_STATS) {
        double d = _PyTime_AsSecondsDouble(_PyTime_GetPerfCounter() - t1);
@ -1530,15 +1572,7 @@ _Py_ScheduleGC(PyInterpreterState *interp)
 void
 _PyObject_GC_Link(PyObject *op)
 {
-    PyThreadState *tstate = _PyThreadState_GET();
-    GCState *gcstate = &tstate->interp->gc;
-    gcstate->generations[0].count++;
-
-    if (gc_should_collect(gcstate) &&
-        !_Py_atomic_load_int_relaxed(&gcstate->collecting))
-    {
-        _Py_ScheduleGC(tstate->interp);
-    }
+    record_allocation(_PyThreadState_GET());
 }

 void
@ -1564,7 +1598,7 @@ gc_alloc(PyTypeObject *tp, size_t basicsize, size_t presize)
        ((PyObject **)mem)[1] = NULL;
    }
    PyObject *op = (PyObject *)(mem + presize);
-    _PyObject_GC_Link(op);
+    record_allocation(tstate);
    return op;
 }

@ -1646,10 +1680,9 @@ PyObject_GC_Del(void *op)
        PyErr_SetRaisedException(exc);
 #endif
    }
-    GCState *gcstate = get_gc_state();
-    if (gcstate->generations[0].count > 0) {
-        gcstate->generations[0].count--;
-    }
+
+    record_deallocation(_PyThreadState_GET());
+
    PyObject_Free(((char *)op)-presize);
 }