bpo-37448: Use radix tree for pymalloc address_in_range(). (GH-14474)

The radix tree approach is a relatively simple and memory sanitary
alternative to the old (slightly) unsanitary address_in_range().
To disable the radix tree map, set a preprocessor flag as follows:
-DWITH_PYMALLOC_RADIX_TREE=0.

Co-authored-by: Tim Peters <tim.peters@gmail.com>
Neil Schemenauer 2021-03-29 19:51:15 -07:00 committed by GitHub
parent a54fc683f2
commit 85b6b70589
2 changed files with 351 additions and 5 deletions

Misc/NEWS.d entry (new file):

@@ -0,0 +1,15 @@
Add a radix tree based memory map to track in-use obmalloc arenas. It is used
to replace the old implementation of address_in_range(). The radix tree
approach makes it easy to increase pool sizes beyond the OS page size.
Boosting the pool and arena size allows obmalloc to handle a significantly
higher percentage of requests from its ultra-fast paths.
It also has the advantage of eliminating the memory unsanitary behavior of
the previous address_in_range(). The old address_in_range() was marked with
the annotations _Py_NO_SANITIZE_ADDRESS, _Py_NO_SANITIZE_THREAD, and
_Py_NO_SANITIZE_MEMORY. Those annotations are no longer needed.
To disable the radix tree map, set a preprocessor flag as follows:
`-DWITH_PYMALLOC_RADIX_TREE=0`.
Co-authored-by: Tim Peters <tim.peters@gmail.com>

Objects/obmalloc.c:

@@ -894,6 +894,22 @@ static int running_on_valgrind = -1;
#endif
#endif
#if !defined(WITH_PYMALLOC_RADIX_TREE)
/* Use radix-tree to track arena memory regions, for address_in_range().
* Enabled by default since it allows larger pool sizes. Can be disabled
* using -DWITH_PYMALLOC_RADIX_TREE=0 */
#define WITH_PYMALLOC_RADIX_TREE 1
#endif
#if SIZEOF_VOID_P > 4
/* on 64-bit platforms use larger pools and arenas if we can */
#define USE_LARGE_ARENAS
#if WITH_PYMALLOC_RADIX_TREE
/* large pools only supported if radix-tree is enabled */
#define USE_LARGE_POOLS
#endif
#endif
/*
* The allocator sub-allocates <Big> blocks of memory (called arenas) aligned
* on a page boundary. This is a reserved virtual address space for the
@@ -907,18 +923,34 @@ static int running_on_valgrind = -1;
* Arenas are allocated with mmap() on systems supporting anonymous memory
* mappings to reduce heap fragmentation.
*/
#define ARENA_SIZE (256 << 10) /* 256KB */
#ifdef USE_LARGE_ARENAS
#define ARENA_BITS 20 /* 1 MiB */
#else
#define ARENA_BITS 18 /* 256 KiB */
#endif
#define ARENA_SIZE (1 << ARENA_BITS)
#define ARENA_SIZE_MASK (ARENA_SIZE - 1)
#ifdef WITH_MEMORY_LIMITS
#define MAX_ARENAS (SMALL_MEMORY_LIMIT / ARENA_SIZE)
#endif
/*
* Size of the pools used for small blocks. Should be a power of 2,
* between 1K and SYSTEM_PAGE_SIZE, that is: 1k, 2k, 4k.
* Size of the pools used for small blocks. Must be a power of 2.
*/
#define POOL_SIZE SYSTEM_PAGE_SIZE /* must be 2^N */
#define POOL_SIZE_MASK SYSTEM_PAGE_SIZE_MASK
#ifdef USE_LARGE_POOLS
#define POOL_BITS 14 /* 16 KiB */
#else
#define POOL_BITS 12 /* 4 KiB */
#endif
#define POOL_SIZE (1 << POOL_BITS)
#define POOL_SIZE_MASK (POOL_SIZE - 1)
#if !WITH_PYMALLOC_RADIX_TREE
#if POOL_SIZE != SYSTEM_PAGE_SIZE
# error "pool size must be equal to system page size"
#endif
#endif
#define MAX_POOLS_IN_ARENA (ARENA_SIZE / POOL_SIZE)
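/* For example, with the defaults above: a 64-bit build with the radix tree
* enabled uses ARENA_BITS == 20 and POOL_BITS == 14, i.e. 1 MiB arenas
* holding 1 MiB / 16 KiB == 64 pools each, while a 32-bit build uses
* 256 KiB arenas holding 256 KiB / 4 KiB == 64 page-sized pools. */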
#if MAX_POOLS_IN_ARENA * POOL_SIZE != ARENA_SIZE
@@ -1233,6 +1265,264 @@ _Py_GetAllocatedBlocks(void)
return n;
}
#if WITH_PYMALLOC_RADIX_TREE
/*==========================================================================*/
/* radix tree for tracking arena usage
bit allocation for keys
64-bit pointers and 2^20 arena size:
16 -> ignored (POINTER_BITS - ADDRESS_BITS)
10 -> MAP_TOP
10 -> MAP_MID
8 -> MAP_BOT
20 -> ideal aligned arena
----
64
32-bit pointers and 2^18 arena size:
14 -> MAP_BOT
18 -> ideal aligned arena
----
32
*/
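/* A worked sketch of where those numbers come from, for the 64-bit case
* (using the shift and mask macros defined below): INTERIOR_BITS is
* (48 - 20 + 2) / 3 == 10, so MAP_TOP and MAP_MID each consume 10 bits and
* MAP_BOT gets 48 - 20 - 2*10 == 8 bits. For a pointer p the indices then
* work out to:
*
*    i1 = (p >> 38) & 0x3ff     MAP_TOP index (10 bits)
*    i2 = (p >> 28) & 0x3ff     MAP_MID index (10 bits)
*    i3 = (p >> 20) & 0xff      MAP_BOT index (8 bits)
*    tail = p & 0xfffff         offset within the ideal 1 MiB arena
*/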
#if SIZEOF_VOID_P == 8
/* number of bits in a pointer */
#define POINTER_BITS 64
/* Current 64-bit processors are limited to 48-bit physical addresses. For
* now, the top 17 bits of addresses will all be equal to bit 2**47. If that
* changes in the future, this must be adjusted upwards.
*/
#define ADDRESS_BITS 48
/* use the top and mid layers of the radix tree */
#define USE_INTERIOR_NODES
#elif SIZEOF_VOID_P == 4
#define POINTER_BITS 32
#define ADDRESS_BITS 32
#else
/* Currently this code works for 64-bit or 32-bit pointers only. */
#error "obmalloc radix tree requires 64-bit or 32-bit pointers."
#endif /* SIZEOF_VOID_P */
/* arena_coverage_t members require this to be true */
#if ARENA_BITS >= 32
# error "arena size must be < 2^32"
#endif
#ifdef USE_INTERIOR_NODES
/* number of bits used for MAP_TOP and MAP_MID nodes */
#define INTERIOR_BITS ((ADDRESS_BITS - ARENA_BITS + 2) / 3)
#else
#define INTERIOR_BITS 0
#endif
#define MAP_TOP_BITS INTERIOR_BITS
#define MAP_TOP_LENGTH (1 << MAP_TOP_BITS)
#define MAP_TOP_MASK (MAP_TOP_LENGTH - 1)
#define MAP_MID_BITS INTERIOR_BITS
#define MAP_MID_LENGTH (1 << MAP_MID_BITS)
#define MAP_MID_MASK (MAP_MID_LENGTH - 1)
#define MAP_BOT_BITS (ADDRESS_BITS - ARENA_BITS - 2*INTERIOR_BITS)
#define MAP_BOT_LENGTH (1 << MAP_BOT_BITS)
#define MAP_BOT_MASK (MAP_BOT_LENGTH - 1)
#define MAP_BOT_SHIFT ARENA_BITS
#define MAP_MID_SHIFT (MAP_BOT_BITS + MAP_BOT_SHIFT)
#define MAP_TOP_SHIFT (MAP_MID_BITS + MAP_MID_SHIFT)
#define AS_UINT(p) ((uintptr_t)(p))
#define MAP_BOT_INDEX(p) ((AS_UINT(p) >> MAP_BOT_SHIFT) & MAP_BOT_MASK)
#define MAP_MID_INDEX(p) ((AS_UINT(p) >> MAP_MID_SHIFT) & MAP_MID_MASK)
#define MAP_TOP_INDEX(p) ((AS_UINT(p) >> MAP_TOP_SHIFT) & MAP_TOP_MASK)
#if POINTER_BITS > ADDRESS_BITS
/* Return the high-order bits of a pointer, above the supported address
* range. Those bits should be the same for all valid pointers if
* ADDRESS_BITS is set correctly. Linux supports a 57-bit address space
* (Intel 5-level paging) but does not currently give those addresses to
* user space.
*/
#define HIGH_BITS(p) (AS_UINT(p) >> ADDRESS_BITS)
#else
#define HIGH_BITS(p) 0
#endif
/* This is the leaf of the radix tree. See arena_map_mark_used() for the
* meaning of these members. */
typedef struct {
int32_t tail_hi;
int32_t tail_lo;
} arena_coverage_t;
typedef struct arena_map_bot {
/* The members tail_hi and tail_lo are accessed together, so it is
* better to have them as an array of structs rather than as two
* arrays.
*/
arena_coverage_t arenas[MAP_BOT_LENGTH];
} arena_map_bot_t;
#ifdef USE_INTERIOR_NODES
typedef struct arena_map_mid {
struct arena_map_bot *ptrs[MAP_MID_LENGTH];
} arena_map_mid_t;
typedef struct arena_map_top {
struct arena_map_mid *ptrs[MAP_TOP_LENGTH];
} arena_map_top_t;
#endif
/* The root of the radix tree. Note that by initializing it like this, the
* memory should be in the BSS. The OS will only map in pages of the root
* as its entries are actually touched (OS pages are demand loaded as
* needed), so the mostly empty array costs little real memory.
*/
#ifdef USE_INTERIOR_NODES
static arena_map_top_t arena_map_root;
/* accounting for number of used interior nodes */
static int arena_map_mid_count;
static int arena_map_bot_count;
#else
static arena_map_bot_t arena_map_root;
#endif
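/* A rough footprint sketch, assuming the 64-bit defaults above: the static
* root holds MAP_TOP_LENGTH (1024) pointers, i.e. 8 KiB of BSS; each mid
* node is likewise 1024 pointers (8 KiB) and each bottom node is
* MAP_BOT_LENGTH (256) arena_coverage_t entries of 8 bytes, i.e. 2 KiB.
* Interior nodes are only allocated for address ranges that actually
* contain arenas, so a typical process creates only a handful of them (the
* counts show up in _PyObject_DebugMallocStats() further below). */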
/* Return a pointer to a bottom tree node; return NULL if it doesn't exist
* or cannot be created. */
static arena_map_bot_t *
arena_map_get(block *p, int create)
{
#ifdef USE_INTERIOR_NODES
/* sanity check that ADDRESS_BITS is correct */
assert(HIGH_BITS(p) == HIGH_BITS(&arena_map_root));
int i1 = MAP_TOP_INDEX(p);
if (arena_map_root.ptrs[i1] == NULL) {
if (!create) {
return NULL;
}
arena_map_mid_t *n = PyMem_RawCalloc(1, sizeof(arena_map_mid_t));
if (n == NULL) {
return NULL;
}
arena_map_root.ptrs[i1] = n;
arena_map_mid_count++;
}
int i2 = MAP_MID_INDEX(p);
if (arena_map_root.ptrs[i1]->ptrs[i2] == NULL) {
if (!create) {
return NULL;
}
arena_map_bot_t *n = PyMem_RawCalloc(1, sizeof(arena_map_bot_t));
if (n == NULL) {
return NULL;
}
arena_map_root.ptrs[i1]->ptrs[i2] = n;
arena_map_bot_count++;
}
return arena_map_root.ptrs[i1]->ptrs[i2];
#else
return &arena_map_root;
#endif
}
/* The radix tree only tracks arenas. So, for 16 MiB arenas, we throw
* away 24 bits of the address. That reduces the space requirement of
* the tree compared to similar radix tree page-map schemes. In
* exchange for slashing the space requirement, it needs more
* computation to check an address.
*
* Tracking coverage is done by "ideal" arena address. It is easier to
* explain in decimal so let's say that the arena size is 100 bytes.
* Then, ideal addresses are 100, 200, 300, etc. For checking if a
* pointer address is inside an actual arena, we have to check two ideal
* arena addresses. E.g. if pointer is 357, we need to check 200 and
* 300. In the rare case that an arena is aligned in the ideal way
* (e.g. base address of arena is 200) then we only have to check one
* ideal address.
*
* The tree nodes for 200 and 300 both store the tail (the low bits) of the
* arena's base address. There are two cases: the arena starts at a lower
* ideal address and extends into this block, or the arena starts within
* this block and extends toward the next ideal address. The tail_lo and
* tail_hi members correspond to these two cases, respectively.
*/
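/* To make the analogy concrete: suppose an arena of size 100 starts at
* address 250, covering [250, 350); its tail is 250 % 100 == 50. Marking
* it stores tail_hi = 50 in the node for ideal block [200, 300) and
* tail_lo = 50 in the node for [300, 400). Looking up pointer 280
* (tail 80 >= tail_hi 50) or 312 (tail 12 < tail_lo 50) then reports
* "inside an arena", while 357 (tail 57, not < 50, and tail_hi of its
* block is 0) is correctly reported as outside. */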
/* mark or unmark addresses covered by arena */
static int
arena_map_mark_used(uintptr_t arena_base, int is_used)
{
/* sanity check that ADDRESS_BITS is correct */
assert(HIGH_BITS(arena_base) == HIGH_BITS(&arena_map_root));
arena_map_bot_t *n_hi = arena_map_get((block *)arena_base, is_used);
if (n_hi == NULL) {
assert(is_used); /* otherwise node should already exist */
return 0; /* failed to allocate space for node */
}
int i3 = MAP_BOT_INDEX((block *)arena_base);
int32_t tail = (int32_t)(arena_base & ARENA_SIZE_MASK);
if (tail == 0) {
/* is ideal arena address */
n_hi->arenas[i3].tail_hi = is_used ? -1 : 0;
}
else {
/* arena_base address is not ideal (aligned to arena size) and
* so it potentially covers two MAP_BOT nodes. Get the MAP_BOT node
* for the next arena. Note that it might be in different MAP_TOP
* and MAP_MID nodes as well so we need to call arena_map_get()
* again (do the full tree traversal).
*/
n_hi->arenas[i3].tail_hi = is_used ? tail : 0;
uintptr_t arena_base_next = arena_base + ARENA_SIZE;
/* If arena_base is a legit arena address, so is arena_base_next - 1
* (last address in arena). If arena_base_next overflows then it
* must overflow to 0. However, that would mean arena_base was
* "ideal" and we should not be in this case. */
assert(arena_base < arena_base_next);
arena_map_bot_t *n_lo = arena_map_get((block *)arena_base_next, is_used);
if (n_lo == NULL) {
assert(is_used); /* otherwise should already exist */
n_hi->arenas[i3].tail_hi = 0;
return 0; /* failed to allocate space for node */
}
int i3_next = MAP_BOT_INDEX(arena_base_next);
n_lo->arenas[i3_next].tail_lo = is_used ? tail : 0;
}
return 1;
}
/* Return true if 'p' is a pointer inside an obmalloc arena.
* _PyObject_Free() calls this so it needs to be very fast. */
static int
arena_map_is_used(block *p)
{
arena_map_bot_t *n = arena_map_get(p, 0);
if (n == NULL) {
return 0;
}
int i3 = MAP_BOT_INDEX(p);
/* ARENA_BITS must be < 32 so that the tail is a non-negative int32_t. */
int32_t hi = n->arenas[i3].tail_hi;
int32_t lo = n->arenas[i3].tail_lo;
int32_t tail = (int32_t)(AS_UINT(p) & ARENA_SIZE_MASK);
return (tail < lo) || (tail >= hi && hi != 0);
}
/* end of radix tree logic */
/*==========================================================================*/
#endif /* WITH_PYMALLOC_RADIX_TREE */
/* Allocate a new arena. If we run out of memory, return NULL. Else
* allocate a new arena, and return the address of an arena_object
@@ -1302,6 +1592,15 @@ new_arena(void)
unused_arena_objects = arenaobj->nextarena;
assert(arenaobj->address == 0);
address = _PyObject_Arena.alloc(_PyObject_Arena.ctx, ARENA_SIZE);
#if WITH_PYMALLOC_RADIX_TREE
if (address != NULL) {
if (!arena_map_mark_used((uintptr_t)address, 1)) {
/* marking arena in radix tree failed, abort */
_PyObject_Arena.free(_PyObject_Arena.ctx, address, ARENA_SIZE);
address = NULL;
}
}
#endif
if (address == NULL) {
/* The allocation failed: return NULL after putting the
* arenaobj back.
@@ -1332,6 +1631,17 @@ new_arena(void)
}
#if WITH_PYMALLOC_RADIX_TREE
/* Return true if and only if P is an address that was allocated by
pymalloc. When the radix tree is used, the 'pool' argument is unused.
*/
static bool
address_in_range(void *p, poolp pool)
{
return arena_map_is_used(p);
}
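/* For context, the caller in pymalloc_free() looks roughly like this
* (a sketch of the surrounding code, not shown in this diff):
*
*    poolp pool = POOL_ADDR(p);
*    if (!address_in_range(p, pool)) {
*        return 0;   // not a pymalloc block; defer to the raw allocator
*    }
*
* The unused 'pool' argument is kept so that the radix-tree version and
* the original arena-index version below share one call signature. */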
#else
/*
address_in_range(P, POOL)
@@ -1423,6 +1733,7 @@ address_in_range(void *p, poolp pool)
arenas[arenaindex].address != 0;
}
#endif /* !WITH_PYMALLOC_RADIX_TREE */
/*==========================================================================*/
@@ -1768,6 +2079,11 @@ insert_to_freepool(poolp pool)
ao->nextarena = unused_arena_objects;
unused_arena_objects = ao;
#if WITH_PYMALLOC_RADIX_TREE
/* mark arena region as not under control of obmalloc */
arena_map_mark_used(ao->address, 0);
#endif
/* Free the entire arena. */
_PyObject_Arena.free(_PyObject_Arena.ctx,
(void *)ao->address, ARENA_SIZE);
@@ -2711,6 +3027,12 @@ _PyObject_DebugMallocStats(FILE *out)
(void)printone(out, "# arenas reclaimed", ntimes_arena_allocated - narenas);
(void)printone(out, "# arenas highwater mark", narenas_highwater);
(void)printone(out, "# arenas allocated current", narenas);
#ifdef USE_INTERIOR_NODES
(void)printone(out, "# arena map mid nodes", arena_map_mid_count);
(void)printone(out, "# arena map bot nodes", arena_map_bot_count);
fputc('\n', out);
#endif
PyOS_snprintf(buf, sizeof(buf),
"%zu arenas * %d bytes/arena",
@@ -2729,6 +3051,15 @@ _PyObject_DebugMallocStats(FILE *out)
total += printone(out, "# bytes lost to pool headers", pool_header_bytes);
total += printone(out, "# bytes lost to quantization", quantization);
total += printone(out, "# bytes lost to arena alignment", arena_alignment);
#if WITH_PYMALLOC_RADIX_TREE
total += printone(out, "# bytes lost to arena map root", sizeof(arena_map_root));
#endif
#ifdef USE_INTERIOR_NODES
total += printone(out, "# bytes lost to arena map mid",
sizeof(arena_map_mid_t) * arena_map_mid_count);
total += printone(out, "# bytes lost to arena map bot",
sizeof(arena_map_bot_t) * arena_map_bot_count);
#endif
(void)printone(out, "Total", total);
return 1;
}