bpo-37543: optimize pymalloc (#14674)

PyObject_Malloc() and PyObject_Free() partially inline pymalloc_alloc and
pymalloc_free.
But when PGO is not used, the compiler doesn't know which parts of
pymalloc_alloc and pymalloc_free are hot.
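The fix has two visible ingredients in the diff below: LIKELY()/UNLIKELY()
branch hints, and splitting the allocator into small `static inline` hot
paths that call out-of-line helpers for the rare cases. A minimal sketch of
the pattern, with hypothetical names (fast_alloc, refill_and_alloc and
freelist stand in for pymalloc_alloc, allocate_from_new_pool and the
per-pool free lists):

#include <stdlib.h>

#if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__)
#  define UNLIKELY(value) __builtin_expect((value), 0)
#  define LIKELY(value)   __builtin_expect((value), 1)
#else
#  define UNLIKELY(value) (value)
#  define LIKELY(value)   (value)
#endif

typedef struct block { struct block *next; } block;

static block *freelist = NULL;    /* hypothetical per-size-class free list */

/* Cold path: a separate, non-inline function, so it does not bloat
   every caller of fast_alloc(). */
static void *
refill_and_alloc(void)
{
    return malloc(sizeof(block)); /* stand-in for carving a new pool */
}

/* Hot path: `static inline` plus LIKELY() let the compiler inline the
   function into callers and lay out the common branch as the
   fall-through -- the layout PGO would otherwise have to discover. */
static inline void *
fast_alloc(void)
{
    block *b = freelist;
    if (LIKELY(b != NULL)) {
        freelist = b->next;       /* pop the head of the free list */
        return b;
    }
    return refill_and_alloc();    /* rare: refill from the system */
}

int main(void)
{
    void *p = fast_alloc();       /* first call takes the slow path */
    free(p);
    return 0;
}

With PGO the compiler learns the branch bias from profiles; the hints and
the inline/out-of-line split encode the same information by hand.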
Inada Naoki 2019-07-17 21:23:57 +09:00 committed by GitHub
parent 7036e1de3a
commit fb26504d14
2 changed files with 227 additions and 218 deletions


@@ -0,0 +1 @@
Optimized pymalloc for non-PGO build.


@@ -710,19 +710,21 @@ PyObject_Free(void *ptr)
}
#ifdef WITH_PYMALLOC
#ifdef WITH_VALGRIND
#include <valgrind/valgrind.h>
/* If we're using GCC, use __builtin_expect() to reduce overhead of
the valgrind checks */
#if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__)
# define UNLIKELY(value) __builtin_expect((value), 0)
# define LIKELY(value) __builtin_expect((value), 1)
#else
# define UNLIKELY(value) (value)
# define LIKELY(value) (value)
#endif
#ifdef WITH_PYMALLOC
#ifdef WITH_VALGRIND
#include <valgrind/valgrind.h>
/* -1 indicates that we haven't checked that we're running on valgrind yet. */
static int running_on_valgrind = -1;
#endif
@@ -1424,96 +1426,48 @@ address_in_range(void *p, poolp pool)
/*==========================================================================*/
/* pymalloc allocator
The basic blocks are ordered by decreasing execution frequency,
which minimizes the number of jumps in the most common cases,
improves branching prediction and instruction scheduling (small
block allocations typically result in a couple of instructions).
Unless the optimizer reorders everything, being too smart...
Return 1 if pymalloc allocated memory and wrote the pointer into *ptr_p.
Return 0 if pymalloc failed to allocate the memory block: on bigger
requests, on error in the code below (as a last chance to serve the request)
or when the max memory limit has been reached. */
static int
pymalloc_alloc(void *ctx, void **ptr_p, size_t nbytes)
// Called when freelist is exhausted. Extend the freelist if there is
// space for a block. Otherwise, remove this pool from usedpools.
static void
pymalloc_pool_extend(poolp pool, uint size)
{
block *bp;
poolp pool;
if (UNLIKELY(pool->nextoffset <= pool->maxnextoffset)) {
/* There is room for another block. */
pool->freeblock = (block*)pool + pool->nextoffset;
pool->nextoffset += INDEX2SIZE(size);
*(block **)(pool->freeblock) = NULL;
return;
}
/* Pool is full, unlink from used pools. */
poolp next;
uint size;
#ifdef WITH_VALGRIND
if (UNLIKELY(running_on_valgrind == -1)) {
running_on_valgrind = RUNNING_ON_VALGRIND;
}
if (UNLIKELY(running_on_valgrind)) {
return 0;
}
#endif
if (nbytes == 0) {
return 0;
}
if (nbytes > SMALL_REQUEST_THRESHOLD) {
return 0;
}
/*
* Most frequent paths first
*/
size = (uint)(nbytes - 1) >> ALIGNMENT_SHIFT;
pool = usedpools[size + size];
if (pool != pool->nextpool) {
/*
* There is a used pool for this size class.
* Pick up the head block of its free list.
*/
++pool->ref.count;
bp = pool->freeblock;
assert(bp != NULL);
if ((pool->freeblock = *(block **)bp) != NULL) {
goto success;
}
/*
* Reached the end of the free list, try to extend it.
*/
if (pool->nextoffset <= pool->maxnextoffset) {
/* There is room for another block. */
pool->freeblock = (block*)pool +
pool->nextoffset;
pool->nextoffset += INDEX2SIZE(size);
*(block **)(pool->freeblock) = NULL;
goto success;
}
/* Pool is full, unlink from used pools. */
next = pool->nextpool;
pool = pool->prevpool;
next->prevpool = pool;
pool->nextpool = next;
goto success;
}
next = pool->nextpool;
pool = pool->prevpool;
next->prevpool = pool;
pool->nextpool = next;
}
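For context, usedpools is a circular doubly-linked list, and the four
pointer assignments above are the standard splice-out. A self-contained
sketch of that discipline (pool_t, frontlink and unlink_pool are
illustrative names only; the real usedpools heads are pairs of pointers
packed into a single flat array, so the actual frontlink code reads
slightly differently):

#include <assert.h>

typedef struct pool { struct pool *nextpool, *prevpool; } pool_t;

/* Splice `pool` in right after `head`. */
static void
frontlink(pool_t *head, pool_t *pool)
{
    pool->nextpool = head->nextpool;
    pool->prevpool = head;
    head->nextpool->prevpool = pool;
    head->nextpool = pool;
}

/* Splice `pool` out, as in the "Pool is full" branch above. */
static void
unlink_pool(pool_t *pool)
{
    pool_t *next = pool->nextpool;
    pool_t *prev = pool->prevpool;
    next->prevpool = prev;
    prev->nextpool = next;
}

int main(void)
{
    pool_t head = { &head, &head };   /* empty circular list: head only */
    pool_t a;
    frontlink(&head, &a);
    assert(head.nextpool == &a && a.nextpool == &head);
    unlink_pool(&a);
    assert(head.nextpool == &head);   /* empty again */
    return 0;
}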
/* Called when pymalloc_alloc can not allocate a block from usedpools.
* This function takes a new pool and allocates a block from it.
*/
static void*
allocate_from_new_pool(uint size)
{
/* There isn't a pool of the right size class immediately
* available: use a free pool.
*/
if (usable_arenas == NULL) {
if (UNLIKELY(usable_arenas == NULL)) {
/* No arena has a free pool: allocate a new arena. */
#ifdef WITH_MEMORY_LIMITS
if (narenas_currently_allocated >= MAX_ARENAS) {
goto failed;
return NULL;
}
#endif
usable_arenas = new_arena();
if (usable_arenas == NULL) {
goto failed;
return NULL;
}
usable_arenas->nextarena =
usable_arenas->prevarena = NULL;
usable_arenas->nextarena = usable_arenas->prevarena = NULL;
assert(nfp2lasta[usable_arenas->nfreepools] == NULL);
nfp2lasta[usable_arenas->nfreepools] = usable_arenas;
}
@@ -1536,12 +1490,12 @@ pymalloc_alloc(void *ctx, void **ptr_p, size_t nbytes)
}
/* Try to get a cached free pool. */
pool = usable_arenas->freepools;
if (pool != NULL) {
poolp pool = usable_arenas->freepools;
if (LIKELY(pool != NULL)) {
/* Unlink from cached pools. */
usable_arenas->freepools = pool->nextpool;
--usable_arenas->nfreepools;
if (usable_arenas->nfreepools == 0) {
usable_arenas->nfreepools--;
if (UNLIKELY(usable_arenas->nfreepools == 0)) {
/* Wholly allocated: remove. */
assert(usable_arenas->freepools == NULL);
assert(usable_arenas->nextarena == NULL ||
@@ -1564,73 +1518,123 @@ pymalloc_alloc(void *ctx, void **ptr_p, size_t nbytes)
(block*)usable_arenas->address +
ARENA_SIZE - POOL_SIZE);
}
}
else {
/* Carve off a new pool. */
assert(usable_arenas->nfreepools > 0);
assert(usable_arenas->freepools == NULL);
pool = (poolp)usable_arenas->pool_address;
assert((block*)pool <= (block*)usable_arenas->address +
ARENA_SIZE - POOL_SIZE);
pool->arenaindex = (uint)(usable_arenas - arenas);
assert(&arenas[pool->arenaindex] == usable_arenas);
pool->szidx = DUMMY_SIZE_IDX;
usable_arenas->pool_address += POOL_SIZE;
--usable_arenas->nfreepools;
init_pool:
/* Frontlink to used pools. */
next = usedpools[size + size]; /* == prev */
pool->nextpool = next;
pool->prevpool = next;
next->nextpool = pool;
next->prevpool = pool;
pool->ref.count = 1;
if (pool->szidx == size) {
/* Luckily, this pool last contained blocks
* of the same size class, so its header
* and free list are already initialized.
*/
bp = pool->freeblock;
assert(bp != NULL);
pool->freeblock = *(block **)bp;
goto success;
if (usable_arenas->nfreepools == 0) {
assert(usable_arenas->nextarena == NULL ||
usable_arenas->nextarena->prevarena ==
usable_arenas);
/* Unlink the arena: it is completely allocated. */
usable_arenas = usable_arenas->nextarena;
if (usable_arenas != NULL) {
usable_arenas->prevarena = NULL;
assert(usable_arenas->address != 0);
}
}
/*
* Initialize the pool header, set up the free list to
* contain just the second block, and return the first
* block.
}
/* Frontlink to used pools. */
block *bp;
poolp next = usedpools[size + size]; /* == prev */
pool->nextpool = next;
pool->prevpool = next;
next->nextpool = pool;
next->prevpool = pool;
pool->ref.count = 1;
if (pool->szidx == size) {
/* Luckily, this pool last contained blocks
* of the same size class, so its header
* and free list are already initialized.
*/
pool->szidx = size;
size = INDEX2SIZE(size);
bp = (block *)pool + POOL_OVERHEAD;
pool->nextoffset = POOL_OVERHEAD + (size << 1);
pool->maxnextoffset = POOL_SIZE - size;
pool->freeblock = bp + size;
*(block **)(pool->freeblock) = NULL;
goto success;
bp = pool->freeblock;
assert(bp != NULL);
pool->freeblock = *(block **)bp;
return bp;
}
/*
* Initialize the pool header, set up the free list to
* contain just the second block, and return the first
* block.
*/
pool->szidx = size;
size = INDEX2SIZE(size);
bp = (block *)pool + POOL_OVERHEAD;
pool->nextoffset = POOL_OVERHEAD + (size << 1);
pool->maxnextoffset = POOL_SIZE - size;
pool->freeblock = bp + size;
*(block **)(pool->freeblock) = NULL;
return bp;
}
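The header arithmetic at the end of allocate_from_new_pool is easier to
follow with concrete numbers. A worked example, assuming 4 KiB pools, a
48-byte rounded-up pool header, and a 32-byte size class (the real
POOL_SIZE and POOL_OVERHEAD depend on the build):

#include <stdio.h>

#define POOL_SIZE     4096   /* assumed: 4 KiB pools */
#define POOL_OVERHEAD 48     /* assumed: rounded-up pool_header size */

int main(void)
{
    unsigned size = 32;                            /* block size after INDEX2SIZE */
    unsigned first     = POOL_OVERHEAD;            /* block handed out now */
    unsigned freeblock = POOL_OVERHEAD + size;     /* head of the free list */
    unsigned nextoffset    = POOL_OVERHEAD + (size << 1);  /* next fresh block */
    unsigned maxnextoffset = POOL_SIZE - size;     /* last offset that fits */

    printf("first=%u freeblock=%u nextoffset=%u maxnextoffset=%u\n",
           first, freeblock, nextoffset, maxnextoffset);
    /* pymalloc_pool_extend() keeps bumping nextoffset by `size` until it
       passes maxnextoffset, at which point the pool is full. */
    return 0;
}

With these numbers, (4096 - 48) / 32 rounds down to 126 blocks per pool.
Bumping nextoffset one block at a time is what lets a fresh pool hand out
its first block after initializing only the header and a single free-list
link, instead of threading a free list through the whole pool up front.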
/* pymalloc allocator
Return 1 if pymalloc allocated memory and wrote the pointer into *ptr_p.
Return 0 if pymalloc failed to allocate the memory block: on bigger
requests, on error in the code below (as a last chance to serve the request)
or when the max memory limit has been reached.
*/
static inline int
pymalloc_alloc(void *ctx, void **ptr_p, size_t nbytes)
{
#ifdef WITH_VALGRIND
if (UNLIKELY(running_on_valgrind == -1)) {
running_on_valgrind = RUNNING_ON_VALGRIND;
}
if (UNLIKELY(running_on_valgrind)) {
return 0;
}
#endif
if (UNLIKELY(nbytes == 0)) {
return 0;
}
if (UNLIKELY(nbytes > SMALL_REQUEST_THRESHOLD)) {
return 0;
}
/* Carve off a new pool. */
assert(usable_arenas->nfreepools > 0);
assert(usable_arenas->freepools == NULL);
pool = (poolp)usable_arenas->pool_address;
assert((block*)pool <= (block*)usable_arenas->address +
ARENA_SIZE - POOL_SIZE);
pool->arenaindex = (uint)(usable_arenas - arenas);
assert(&arenas[pool->arenaindex] == usable_arenas);
pool->szidx = DUMMY_SIZE_IDX;
usable_arenas->pool_address += POOL_SIZE;
--usable_arenas->nfreepools;
uint size = (uint)(nbytes - 1) >> ALIGNMENT_SHIFT;
poolp pool = usedpools[size + size];
block *bp;
if (LIKELY(pool != pool->nextpool)) {
/*
* There is a used pool for this size class.
* Pick up the head block of its free list.
*/
++pool->ref.count;
bp = pool->freeblock;
if (usable_arenas->nfreepools == 0) {
assert(usable_arenas->nextarena == NULL ||
usable_arenas->nextarena->prevarena ==
usable_arenas);
/* Unlink the arena: it is completely allocated. */
usable_arenas = usable_arenas->nextarena;
if (usable_arenas != NULL) {
usable_arenas->prevarena = NULL;
assert(usable_arenas->address != 0);
if (UNLIKELY((pool->freeblock = *(block **)bp) == NULL)) {
// Reached the end of the free list, try to extend it.
pymalloc_pool_extend(pool, size);
}
}
else {
/* There isn't a pool of the right size class immediately
* available: use a free pool.
*/
bp = allocate_from_new_pool(size);
if (UNLIKELY(bp == NULL)) {
return 0;
}
}
goto init_pool;
success:
assert(bp != NULL);
*ptr_p = (void *)bp;
return 1;
failed:
return 0;
}
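The fast path above starts from the size-class computation
`size = (uint)(nbytes - 1) >> ALIGNMENT_SHIFT`. A small sketch of that
mapping, assuming 8-byte alignment (ALIGNMENT_SHIFT of 3, the usual
pymalloc configuration at the time):

#include <stdio.h>

#define ALIGNMENT_SHIFT 3   /* assumed: 8-byte alignment */

int main(void)
{
    for (size_t nbytes = 1; nbytes <= 24; nbytes++) {
        /* 1..8 bytes -> class 0, 9..16 -> class 1, 17..24 -> class 2 */
        unsigned size = (unsigned)(nbytes - 1) >> ALIGNMENT_SHIFT;
        printf("request %2zu -> size class %u (%u-byte blocks)\n",
               nbytes, size, (size + 1) << ALIGNMENT_SHIFT);
    }
    /* usedpools is indexed with `size + size` because each list head is
       a two-pointer pair packed into the flat usedpools array; requests
       above SMALL_REQUEST_THRESHOLD skip pymalloc entirely. */
    return 0;
}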
@@ -1638,7 +1642,7 @@ static void *
_PyObject_Malloc(void *ctx, size_t nbytes)
{
void* ptr;
if (pymalloc_alloc(ctx, &ptr, nbytes)) {
if (LIKELY(pymalloc_alloc(ctx, &ptr, nbytes))) {
return ptr;
}
@@ -1658,7 +1662,7 @@ _PyObject_Calloc(void *ctx, size_t nelem, size_t elsize)
assert(elsize == 0 || nelem <= (size_t)PY_SSIZE_T_MAX / elsize);
size_t nbytes = nelem * elsize;
if (pymalloc_alloc(ctx, &ptr, nbytes)) {
if (LIKELY(pymalloc_alloc(ctx, &ptr, nbytes))) {
memset(ptr, 0, nbytes);
return ptr;
}
@@ -1671,88 +1675,37 @@ _PyObject_Calloc(void *ctx, size_t nelem, size_t elsize)
}
/* Free a memory block allocated by pymalloc_alloc().
Return 1 if it was freed.
Return 0 if the block was not allocated by pymalloc_alloc(). */
static int
pymalloc_free(void *ctx, void *p)
static void
insert_to_usedpool(poolp pool)
{
poolp pool;
block *lastfree;
poolp next, prev;
uint size;
assert(pool->ref.count > 0); /* else the pool is empty */
assert(p != NULL);
uint size = pool->szidx;
poolp next = usedpools[size + size];
poolp prev = next->prevpool;
#ifdef WITH_VALGRIND
if (UNLIKELY(running_on_valgrind > 0)) {
return 0;
}
#endif
/* insert pool before next: prev <-> pool <-> next */
pool->nextpool = next;
pool->prevpool = prev;
next->prevpool = pool;
prev->nextpool = pool;
}
pool = POOL_ADDR(p);
if (!address_in_range(p, pool)) {
return 0;
}
/* We allocated this address. */
/* Link p to the start of the pool's freeblock list. Since
* the pool had at least the p block outstanding, the pool
* wasn't empty (so it's already in a usedpools[] list, or
* was full and is in no list -- it's not in the freeblocks
* list in any case).
*/
assert(pool->ref.count > 0); /* else it was empty */
*(block **)p = lastfree = pool->freeblock;
pool->freeblock = (block *)p;
if (!lastfree) {
/* Pool was full, so doesn't currently live in any list:
* link it to the front of the appropriate usedpools[] list.
* This mimics LRU pool usage for new allocations and
* targets optimal filling when several pools contain
* blocks of the same size class.
*/
--pool->ref.count;
assert(pool->ref.count > 0); /* else the pool is empty */
size = pool->szidx;
next = usedpools[size + size];
prev = next->prevpool;
/* insert pool before next: prev <-> pool <-> next */
pool->nextpool = next;
pool->prevpool = prev;
next->prevpool = pool;
prev->nextpool = pool;
goto success;
}
struct arena_object* ao;
uint nf; /* ao->nfreepools */
/* freeblock wasn't NULL, so the pool wasn't full,
* and the pool is in a usedpools[] list.
*/
if (--pool->ref.count != 0) {
/* pool isn't empty: leave it in usedpools */
goto success;
}
/* Pool is now empty: unlink from usedpools, and
* link to the front of freepools. This ensures that
* previously freed pools will be allocated later
* (being not referenced, they are perhaps paged out).
*/
next = pool->nextpool;
prev = pool->prevpool;
static void
insert_to_freepool(poolp pool)
{
poolp next = pool->nextpool;
poolp prev = pool->prevpool;
next->prevpool = prev;
prev->nextpool = next;
/* Link the pool to freepools. This is a singly-linked
* list, and pool->prevpool isn't used there.
*/
ao = &arenas[pool->arenaindex];
struct arena_object *ao = &arenas[pool->arenaindex];
pool->nextpool = ao->freepools;
ao->freepools = pool;
nf = ao->nfreepools;
uint nf = ao->nfreepools;
/* If this is the rightmost arena with this number of free pools,
* nfp2lasta[nf] needs to change. Caution: if nf is 0, there
* are no arenas in usable_arenas with that value.
@@ -1826,7 +1779,7 @@ pymalloc_free(void *ctx, void *p)
ao->address = 0; /* mark unassociated */
--narenas_currently_allocated;
goto success;
return;
}
if (nf == 1) {
@@ -1845,7 +1798,7 @@ pymalloc_free(void *ctx, void *p)
nfp2lasta[1] = ao;
}
goto success;
return;
}
/* If this arena is now out of order, we need to keep
@@ -1862,7 +1815,7 @@ pymalloc_free(void *ctx, void *p)
/* If this was the rightmost of the old size, it remains in place. */
if (ao == lastnf) {
/* Case 4. Nothing to do. */
goto success;
return;
}
/* If ao were the only arena in the list, the last block would have
* gotten us out.
@@ -1898,10 +1851,65 @@ pymalloc_free(void *ctx, void *p)
assert(ao->nextarena == NULL || ao->nextarena->prevarena == ao);
assert((usable_arenas == ao && ao->prevarena == NULL)
|| ao->prevarena->nextarena == ao);
}
goto success;
/* Free a memory block allocated by pymalloc_alloc().
Return 1 if it was freed.
Return 0 if the block was not allocated by pymalloc_alloc(). */
static inline int
pymalloc_free(void *ctx, void *p)
{
assert(p != NULL);
success:
#ifdef WITH_VALGRIND
if (UNLIKELY(running_on_valgrind > 0)) {
return 0;
}
#endif
poolp pool = POOL_ADDR(p);
if (UNLIKELY(!address_in_range(p, pool))) {
return 0;
}
/* We allocated this address. */
/* Link p to the start of the pool's freeblock list. Since
* the pool had at least the p block outstanding, the pool
* wasn't empty (so it's already in a usedpools[] list, or
* was full and is in no list -- it's not in the freeblocks
* list in any case).
*/
assert(pool->ref.count > 0); /* else it was empty */
block *lastfree = pool->freeblock;
*(block **)p = lastfree;
pool->freeblock = (block *)p;
pool->ref.count--;
if (UNLIKELY(lastfree == NULL)) {
/* Pool was full, so doesn't currently live in any list:
* link it to the front of the appropriate usedpools[] list.
* This mimics LRU pool usage for new allocations and
* targets optimal filling when several pools contain
* blocks of the same size class.
*/
insert_to_usedpool(pool);
return 1;
}
/* freeblock wasn't NULL, so the pool wasn't full,
* and the pool is in a usedpools[] list.
*/
if (LIKELY(pool->ref.count != 0)) {
/* pool isn't empty: leave it in usedpools */
return 1;
}
/* Pool is now empty: unlink from usedpools, and
* link to the front of freepools. This ensures that
* previously freed pools will be allocated later
* (being not referenced, they are perhaps paged out).
*/
insert_to_freepool(pool);
return 1;
}
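The core of the free path is the two-line free-list push above: the freed
block's first word becomes the link to the previous head, so the free list
costs no extra memory. A self-contained sketch (push_free and pop_free are
illustrative names; obmalloc does this inline on pool->freeblock):

#include <stdio.h>

typedef unsigned char block;

static block *freeblock = NULL;      /* head of a hypothetical pool's list */

/* Free: the first word of the freed block stores the old head. */
static void
push_free(void *p)
{
    *(block **)p = freeblock;
    freeblock = (block *)p;
}

/* Allocate: pop the head and advance to the block it points to. */
static void *
pop_free(void)
{
    block *bp = freeblock;
    if (bp != NULL) {
        freeblock = *(block **)bp;
    }
    return bp;
}

int main(void)
{
    /* Two 16-byte slots standing in for blocks inside a pool; the
       union keeps them pointer-aligned. */
    static union { block raw[16]; void *align; } slot[2];
    push_free(slot[0].raw);
    push_free(slot[1].raw);
    printf("%p %p\n", pop_free(), pop_free());  /* LIFO: slot[1], slot[0] */
    return 0;
}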
@@ -1914,7 +1922,7 @@ _PyObject_Free(void *ctx, void *p)
return;
}
if (!pymalloc_free(ctx, p)) {
if (UNLIKELY(!pymalloc_free(ctx, p))) {
/* pymalloc didn't allocate this address */
PyMem_RawFree(p);
raw_allocated_blocks--;