From acf3bcc8861983dcd6896682283a480450f9a1e3 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Tue, 26 Dec 2023 11:53:20 -0500 Subject: [PATCH] gh-112532: Use separate mimalloc heaps for GC objects (gh-113263) * gh-112532: Use separate mimalloc heaps for GC objects In `--disable-gil` builds, we now use four separate heaps in anticipation of using mimalloc to find GC objects when the GIL is disabled. To support this, we also make a few changes to mimalloc: * `mi_heap_t` and `mi_tld_t` initialization is split from allocation. This allows us to have a `mi_tld_t` per-`PyThreadState`, which is important to keep interpreter isolation, since the same OS thread may run in multiple interpreters (using different PyThreadStates.) * Heap abandoning (mi_heap_collect_ex) can now be called from a different thread than the one that created the heap. This is necessary because we may clear and delete the containing PyThreadStates from a different thread during finalization and after fork(). * Use enum instead of defines and guard mimalloc includes. * The enum typedef will be convenient for future PRs that use the type. * Guarding the mimalloc includes allows us to unconditionally include pycore_mimalloc.h from other header files that rely on things like `struct _mimalloc_thread_state`. * Only define _mimalloc_thread_state in Py_GIL_DISABLED builds --- Include/internal/mimalloc/mimalloc/internal.h | 2 + Include/internal/pycore_mimalloc.h | 26 +++++++++ Include/internal/pycore_pystate.h | 1 + Include/internal/pycore_tstate.h | 7 ++- Objects/mimalloc/heap.c | 33 +++++++---- Objects/mimalloc/init.c | 24 ++++---- Objects/obmalloc.c | 36 ++++++++++++ Python/pylifecycle.c | 4 ++ Python/pystate.c | 55 +++++++++++++++++++ 9 files changed, 163 insertions(+), 25 deletions(-) diff --git a/Include/internal/mimalloc/mimalloc/internal.h b/Include/internal/mimalloc/mimalloc/internal.h index f076bc6a40f..cb6e211de5b 100644 --- a/Include/internal/mimalloc/mimalloc/internal.h +++ b/Include/internal/mimalloc/mimalloc/internal.h @@ -85,6 +85,7 @@ mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap void _mi_thread_done(mi_heap_t* heap); void _mi_thread_data_collect(void); +void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap); // os.c void _mi_os_init(void); // called from process init @@ -170,6 +171,7 @@ size_t _mi_bin_size(uint8_t bin); // for stats uint8_t _mi_bin(size_t size); // for stats // "heap.c" +void _mi_heap_init_ex(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id); void _mi_heap_destroy_pages(mi_heap_t* heap); void _mi_heap_collect_abandon(mi_heap_t* heap); void _mi_heap_set_default_direct(mi_heap_t* heap); diff --git a/Include/internal/pycore_mimalloc.h b/Include/internal/pycore_mimalloc.h index c29dc82a427..adebb559dae 100644 --- a/Include/internal/pycore_mimalloc.h +++ b/Include/internal/pycore_mimalloc.h @@ -9,11 +9,37 @@ # error "pycore_mimalloc.h must be included before mimalloc.h" #endif +typedef enum { + _Py_MIMALLOC_HEAP_MEM = 0, // PyMem_Malloc() and friends + _Py_MIMALLOC_HEAP_OBJECT = 1, // non-GC objects + _Py_MIMALLOC_HEAP_GC = 2, // GC objects without pre-header + _Py_MIMALLOC_HEAP_GC_PRE = 3, // GC objects with pre-header + _Py_MIMALLOC_HEAP_COUNT +} _Py_mimalloc_heap_id; + #include "pycore_pymem.h" + +#ifdef WITH_MIMALLOC #define MI_DEBUG_UNINIT PYMEM_CLEANBYTE #define MI_DEBUG_FREED PYMEM_DEADBYTE #define MI_DEBUG_PADDING PYMEM_FORBIDDENBYTE +#ifdef Py_DEBUG +# define MI_DEBUG 1 +#else +# define MI_DEBUG 0 +#endif #include "mimalloc.h" +#include "mimalloc/types.h" +#include "mimalloc/internal.h" +#endif + +#ifdef Py_GIL_DISABLED +struct _mimalloc_thread_state { + mi_heap_t *current_object_heap; + mi_heap_t heaps[_Py_MIMALLOC_HEAP_COUNT]; + mi_tld_t tld; +}; +#endif #endif // Py_INTERNAL_MIMALLOC_H diff --git a/Include/internal/pycore_pystate.h b/Include/internal/pycore_pystate.h index c031a38cd6b..37b45faf8a7 100644 --- a/Include/internal/pycore_pystate.h +++ b/Include/internal/pycore_pystate.h @@ -187,6 +187,7 @@ extern PyThreadState * _PyThreadState_New( int whence); extern void _PyThreadState_Bind(PyThreadState *tstate); extern void _PyThreadState_DeleteExcept(PyThreadState *tstate); +extern void _PyThreadState_ClearMimallocHeaps(PyThreadState *tstate); // Export for '_testinternalcapi' shared extension PyAPI_FUNC(PyObject*) _PyThreadState_GetDict(PyThreadState *tstate); diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index 17f3e865641..856ddd5e7e5 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -8,6 +8,8 @@ extern "C" { # error "this header requires Py_BUILD_CORE define" #endif +#include "pycore_mimalloc.h" // struct _mimalloc_thread_state + // Every PyThreadState is actually allocated as a _PyThreadStateImpl. The // PyThreadState fields are exposed as part of the C API, although most fields @@ -16,7 +18,10 @@ typedef struct _PyThreadStateImpl { // semi-public fields are in PyThreadState. PyThreadState base; - // TODO: add private fields here +#ifdef Py_GIL_DISABLED + struct _mimalloc_thread_state mimalloc; +#endif + } _PyThreadStateImpl; diff --git a/Objects/mimalloc/heap.c b/Objects/mimalloc/heap.c index 4eb622ed4ba..c50e3b05590 100644 --- a/Objects/mimalloc/heap.c +++ b/Objects/mimalloc/heap.c @@ -123,6 +123,9 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) const bool force = collect >= MI_FORCE; _mi_deferred_free(heap, force); + // gh-112532: we may be called from a thread that is not the owner of the heap + bool is_main_thread = _mi_is_main_thread() && heap->thread_id == _mi_thread_id(); + // note: never reclaim on collect but leave it to threads that need storage to reclaim const bool force_main = #ifdef NDEBUG @@ -130,7 +133,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) #else collect >= MI_FORCE #endif - && _mi_is_main_thread() && mi_heap_is_backing(heap) && !heap->no_reclaim; + && is_main_thread && mi_heap_is_backing(heap) && !heap->no_reclaim; if (force_main) { // the main thread is abandoned (end-of-program), try to reclaim all abandoned segments. @@ -164,7 +167,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) } // collect regions on program-exit (or shared library unload) - if (force && _mi_is_main_thread() && mi_heap_is_backing(heap)) { + if (force && is_main_thread && mi_heap_is_backing(heap)) { _mi_thread_data_collect(); // collect thread data cache _mi_arena_collect(true /* force purge */, &heap->tld->stats); } @@ -206,18 +209,28 @@ mi_heap_t* mi_heap_get_backing(void) { return bheap; } +void _mi_heap_init_ex(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id) +{ + _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t)); + heap->tld = tld; + heap->thread_id = _mi_thread_id(); + heap->arena_id = arena_id; + if (heap == tld->heap_backing) { + _mi_random_init(&heap->random); + } + else { + _mi_random_split(&tld->heap_backing->random, &heap->random); + } + heap->cookie = _mi_heap_random_next(heap) | 1; + heap->keys[0] = _mi_heap_random_next(heap); + heap->keys[1] = _mi_heap_random_next(heap); +} + mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) { mi_heap_t* bheap = mi_heap_get_backing(); mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode? if (heap == NULL) return NULL; - _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t)); - heap->tld = bheap->tld; - heap->thread_id = _mi_thread_id(); - heap->arena_id = arena_id; - _mi_random_split(&bheap->random, &heap->random); - heap->cookie = _mi_heap_random_next(heap) | 1; - heap->keys[0] = _mi_heap_random_next(heap); - heap->keys[1] = _mi_heap_random_next(heap); + _mi_heap_init_ex(heap, bheap->tld, arena_id); heap->no_reclaim = true; // don't reclaim abandoned pages or otherwise destroy is unsafe // push on the thread local heaps list heap->next = heap->tld->heaps; diff --git a/Objects/mimalloc/init.c b/Objects/mimalloc/init.c index 7dfa7657737..376e14b49b7 100644 --- a/Objects/mimalloc/init.c +++ b/Objects/mimalloc/init.c @@ -297,24 +297,20 @@ static bool _mi_heap_init(void) { mi_thread_data_t* td = mi_thread_data_zalloc(); if (td == NULL) return false; - mi_tld_t* tld = &td->tld; - mi_heap_t* heap = &td->heap; + _mi_tld_init(&td->tld, &td->heap); + _mi_heap_init_ex(&td->heap, &td->tld, _mi_arena_id_none()); + _mi_heap_set_default_direct(&td->heap); + } + return false; +} + +void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { _mi_memcpy_aligned(tld, &tld_empty, sizeof(*tld)); - _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(*heap)); - heap->thread_id = _mi_thread_id(); - _mi_random_init(&heap->random); - heap->cookie = _mi_heap_random_next(heap) | 1; - heap->keys[0] = _mi_heap_random_next(heap); - heap->keys[1] = _mi_heap_random_next(heap); - heap->tld = tld; - tld->heap_backing = heap; - tld->heaps = heap; tld->segments.stats = &tld->stats; tld->segments.os = &tld->os; tld->os.stats = &tld->stats; - _mi_heap_set_default_direct(heap); - } - return false; + tld->heap_backing = bheap; + tld->heaps = bheap; } // Free the thread local default heap (called from `mi_thread_done`) diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c index 99c95d90658..883adcb1c19 100644 --- a/Objects/obmalloc.c +++ b/Objects/obmalloc.c @@ -88,19 +88,37 @@ _PyMem_RawFree(void *Py_UNUSED(ctx), void *ptr) void * _PyMem_MiMalloc(void *ctx, size_t size) { +#ifdef Py_GIL_DISABLED + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + mi_heap_t *heap = &tstate->mimalloc.heaps[_Py_MIMALLOC_HEAP_MEM]; + return mi_heap_malloc(heap, size); +#else return mi_malloc(size); +#endif } void * _PyMem_MiCalloc(void *ctx, size_t nelem, size_t elsize) { +#ifdef Py_GIL_DISABLED + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + mi_heap_t *heap = &tstate->mimalloc.heaps[_Py_MIMALLOC_HEAP_MEM]; + return mi_heap_calloc(heap, nelem, elsize); +#else return mi_calloc(nelem, elsize); +#endif } void * _PyMem_MiRealloc(void *ctx, void *ptr, size_t size) { +#ifdef Py_GIL_DISABLED + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + mi_heap_t *heap = &tstate->mimalloc.heaps[_Py_MIMALLOC_HEAP_MEM]; + return mi_heap_realloc(heap, ptr, size); +#else return mi_realloc(ptr, size); +#endif } void @@ -112,20 +130,38 @@ _PyMem_MiFree(void *ctx, void *ptr) void * _PyObject_MiMalloc(void *ctx, size_t nbytes) { +#ifdef Py_GIL_DISABLED + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + mi_heap_t *heap = tstate->mimalloc.current_object_heap; + return mi_heap_malloc(heap, nbytes); +#else return mi_malloc(nbytes); +#endif } void * _PyObject_MiCalloc(void *ctx, size_t nelem, size_t elsize) { +#ifdef Py_GIL_DISABLED + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + mi_heap_t *heap = tstate->mimalloc.current_object_heap; + return mi_heap_calloc(heap, nelem, elsize); +#else return mi_calloc(nelem, elsize); +#endif } void * _PyObject_MiRealloc(void *ctx, void *ptr, size_t nbytes) { +#ifdef Py_GIL_DISABLED + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + mi_heap_t *heap = tstate->mimalloc.current_object_heap; + return mi_heap_realloc(heap, ptr, nbytes); +#else return mi_realloc(ptr, nbytes); +#endif } void diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 0ec29846b08..1d8af26e4a1 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -1794,6 +1794,10 @@ finalize_interp_clear(PyThreadState *tstate) } finalize_interp_types(tstate->interp); + + /* finalize_interp_types may allocate Python objects so we may need to + abandon mimalloc segments again */ + _PyThreadState_ClearMimallocHeaps(tstate); } diff --git a/Python/pystate.c b/Python/pystate.c index 632a119ea6d..84e2d6ea172 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -236,6 +236,8 @@ tstate_is_bound(PyThreadState *tstate) static void bind_gilstate_tstate(PyThreadState *); static void unbind_gilstate_tstate(PyThreadState *); +static void tstate_mimalloc_bind(PyThreadState *); + static void bind_tstate(PyThreadState *tstate) { @@ -256,6 +258,9 @@ bind_tstate(PyThreadState *tstate) tstate->native_thread_id = PyThread_get_thread_native_id(); #endif + // mimalloc state needs to be initialized from the active thread. + tstate_mimalloc_bind(tstate); + tstate->_status.bound = 1; } @@ -1533,6 +1538,8 @@ PyThreadState_Clear(PyThreadState *tstate) tstate->on_delete(tstate->on_delete_data); } + _PyThreadState_ClearMimallocHeaps(tstate); + tstate->_status.cleared = 1; // XXX Call _PyThreadStateSwap(runtime, NULL) here if "current". @@ -2509,3 +2516,51 @@ _PyThreadState_MustExit(PyThreadState *tstate) } return 1; } + +/********************/ +/* mimalloc support */ +/********************/ + +static void +tstate_mimalloc_bind(PyThreadState *tstate) +{ +#ifdef Py_GIL_DISABLED + struct _mimalloc_thread_state *mts = &((_PyThreadStateImpl*)tstate)->mimalloc; + + // Initialize the mimalloc thread state. This must be called from the + // same thread that will use the thread state. The "mem" heap doubles as + // the "backing" heap. + mi_tld_t *tld = &mts->tld; + _mi_tld_init(tld, &mts->heaps[_Py_MIMALLOC_HEAP_MEM]); + + // Initialize each heap + for (Py_ssize_t i = 0; i < _Py_MIMALLOC_HEAP_COUNT; i++) { + _mi_heap_init_ex(&mts->heaps[i], tld, _mi_arena_id_none()); + } + + // By default, object allocations use _Py_MIMALLOC_HEAP_OBJECT. + // _PyObject_GC_New() and similar functions temporarily override this to + // use one of the GC heaps. + mts->current_object_heap = &mts->heaps[_Py_MIMALLOC_HEAP_OBJECT]; +#endif +} + +void +_PyThreadState_ClearMimallocHeaps(PyThreadState *tstate) +{ +#ifdef Py_GIL_DISABLED + if (!tstate->_status.bound) { + // The mimalloc heaps are only initialized when the thread is bound. + return; + } + + _PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate; + for (Py_ssize_t i = 0; i < _Py_MIMALLOC_HEAP_COUNT; i++) { + // Abandon all segments in use by this thread. This pushes them to + // a shared pool to later be reclaimed by other threads. It's important + // to do this before the thread state is destroyed so that objects + // remain visible to the GC. + _mi_heap_collect_abandon(&tstate_impl->mimalloc.heaps[i]); + } +#endif +}