From fb26504d14a08fcd61bb92bb989b6d2b12188535 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Wed, 17 Jul 2019 21:23:57 +0900 Subject: [PATCH] bpo-37543: optimize pymalloc (#14674) PyObject_Malloc() and PyObject_Free() inlines pymalloc_alloc and pymalloc_free partially. But when PGO is not used, compiler don't know where is the hot part in pymalloc_alloc and pymalloc_free. --- .../2019-07-10-20-33-53.bpo-37543.EvI19D.rst | 1 + Objects/obmalloc.c | 444 +++++++++--------- 2 files changed, 227 insertions(+), 218 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2019-07-10-20-33-53.bpo-37543.EvI19D.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2019-07-10-20-33-53.bpo-37543.EvI19D.rst b/Misc/NEWS.d/next/Core and Builtins/2019-07-10-20-33-53.bpo-37543.EvI19D.rst new file mode 100644 index 00000000000..3f74bbc5d07 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2019-07-10-20-33-53.bpo-37543.EvI19D.rst @@ -0,0 +1 @@ +Optimized pymalloc for non PGO build. diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c index 2c00efc255d..8d5c700d5c1 100644 --- a/Objects/obmalloc.c +++ b/Objects/obmalloc.c @@ -710,19 +710,21 @@ PyObject_Free(void *ptr) } -#ifdef WITH_PYMALLOC - -#ifdef WITH_VALGRIND -#include - /* If we're using GCC, use __builtin_expect() to reduce overhead of the valgrind checks */ #if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__) # define UNLIKELY(value) __builtin_expect((value), 0) +# define LIKELY(value) __builtin_expect((value), 1) #else # define UNLIKELY(value) (value) +# define LIKELY(value) (value) #endif +#ifdef WITH_PYMALLOC + +#ifdef WITH_VALGRIND +#include + /* -1 indicates that we haven't checked that we're running on valgrind yet. */ static int running_on_valgrind = -1; #endif @@ -1424,96 +1426,48 @@ address_in_range(void *p, poolp pool) /*==========================================================================*/ -/* pymalloc allocator - - The basic blocks are ordered by decreasing execution frequency, - which minimizes the number of jumps in the most common cases, - improves branching prediction and instruction scheduling (small - block allocations typically result in a couple of instructions). - Unless the optimizer reorders everything, being too smart... - - Return 1 if pymalloc allocated memory and wrote the pointer into *ptr_p. - - Return 0 if pymalloc failed to allocate the memory block: on bigger - requests, on error in the code below (as a last chance to serve the request) - or when the max memory limit has been reached. */ -static int -pymalloc_alloc(void *ctx, void **ptr_p, size_t nbytes) +// Called when freelist is exhausted. Extend the freelist if there is +// space for a block. Otherwise, remove this pool from usedpools. +static void +pymalloc_pool_extend(poolp pool, uint size) { - block *bp; - poolp pool; + if (UNLIKELY(pool->nextoffset <= pool->maxnextoffset)) { + /* There is room for another block. */ + pool->freeblock = (block*)pool + pool->nextoffset; + pool->nextoffset += INDEX2SIZE(size); + *(block **)(pool->freeblock) = NULL; + return; + } + + /* Pool is full, unlink from used pools. 
*/ poolp next; - uint size; - -#ifdef WITH_VALGRIND - if (UNLIKELY(running_on_valgrind == -1)) { - running_on_valgrind = RUNNING_ON_VALGRIND; - } - if (UNLIKELY(running_on_valgrind)) { - return 0; - } -#endif - - if (nbytes == 0) { - return 0; - } - if (nbytes > SMALL_REQUEST_THRESHOLD) { - return 0; - } - - /* - * Most frequent paths first - */ - size = (uint)(nbytes - 1) >> ALIGNMENT_SHIFT; - pool = usedpools[size + size]; - if (pool != pool->nextpool) { - /* - * There is a used pool for this size class. - * Pick up the head block of its free list. - */ - ++pool->ref.count; - bp = pool->freeblock; - assert(bp != NULL); - if ((pool->freeblock = *(block **)bp) != NULL) { - goto success; - } - - /* - * Reached the end of the free list, try to extend it. - */ - if (pool->nextoffset <= pool->maxnextoffset) { - /* There is room for another block. */ - pool->freeblock = (block*)pool + - pool->nextoffset; - pool->nextoffset += INDEX2SIZE(size); - *(block **)(pool->freeblock) = NULL; - goto success; - } - - /* Pool is full, unlink from used pools. */ - next = pool->nextpool; - pool = pool->prevpool; - next->prevpool = pool; - pool->nextpool = next; - goto success; - } + next = pool->nextpool; + pool = pool->prevpool; + next->prevpool = pool; + pool->nextpool = next; +} +/* called when pymalloc_alloc can not allocate a block from usedpool. + * This function takes new pool and allocate a block from it. + */ +static void* +allocate_from_new_pool(uint size) +{ /* There isn't a pool of the right size class immediately * available: use a free pool. */ - if (usable_arenas == NULL) { + if (UNLIKELY(usable_arenas == NULL)) { /* No arena has a free pool: allocate a new arena. */ #ifdef WITH_MEMORY_LIMITS if (narenas_currently_allocated >= MAX_ARENAS) { - goto failed; + return NULL; } #endif usable_arenas = new_arena(); if (usable_arenas == NULL) { - goto failed; + return NULL; } - usable_arenas->nextarena = - usable_arenas->prevarena = NULL; + usable_arenas->nextarena = usable_arenas->prevarena = NULL; assert(nfp2lasta[usable_arenas->nfreepools] == NULL); nfp2lasta[usable_arenas->nfreepools] = usable_arenas; } @@ -1536,12 +1490,12 @@ pymalloc_alloc(void *ctx, void **ptr_p, size_t nbytes) } /* Try to get a cached free pool. */ - pool = usable_arenas->freepools; - if (pool != NULL) { + poolp pool = usable_arenas->freepools; + if (LIKELY(pool != NULL)) { /* Unlink from cached pools. */ usable_arenas->freepools = pool->nextpool; - --usable_arenas->nfreepools; - if (usable_arenas->nfreepools == 0) { + usable_arenas->nfreepools--; + if (UNLIKELY(usable_arenas->nfreepools == 0)) { /* Wholly allocated: remove. */ assert(usable_arenas->freepools == NULL); assert(usable_arenas->nextarena == NULL || @@ -1564,73 +1518,123 @@ pymalloc_alloc(void *ctx, void **ptr_p, size_t nbytes) (block*)usable_arenas->address + ARENA_SIZE - POOL_SIZE); } + } + else { + /* Carve off a new pool. */ + assert(usable_arenas->nfreepools > 0); + assert(usable_arenas->freepools == NULL); + pool = (poolp)usable_arenas->pool_address; + assert((block*)pool <= (block*)usable_arenas->address + + ARENA_SIZE - POOL_SIZE); + pool->arenaindex = (uint)(usable_arenas - arenas); + assert(&arenas[pool->arenaindex] == usable_arenas); + pool->szidx = DUMMY_SIZE_IDX; + usable_arenas->pool_address += POOL_SIZE; + --usable_arenas->nfreepools; - init_pool: - /* Frontlink to used pools. 
*/ - next = usedpools[size + size]; /* == prev */ - pool->nextpool = next; - pool->prevpool = next; - next->nextpool = pool; - next->prevpool = pool; - pool->ref.count = 1; - if (pool->szidx == size) { - /* Luckily, this pool last contained blocks - * of the same size class, so its header - * and free list are already initialized. - */ - bp = pool->freeblock; - assert(bp != NULL); - pool->freeblock = *(block **)bp; - goto success; + if (usable_arenas->nfreepools == 0) { + assert(usable_arenas->nextarena == NULL || + usable_arenas->nextarena->prevarena == + usable_arenas); + /* Unlink the arena: it is completely allocated. */ + usable_arenas = usable_arenas->nextarena; + if (usable_arenas != NULL) { + usable_arenas->prevarena = NULL; + assert(usable_arenas->address != 0); + } } - /* - * Initialize the pool header, set up the free list to - * contain just the second block, and return the first - * block. + } + + /* Frontlink to used pools. */ + block *bp; + poolp next = usedpools[size + size]; /* == prev */ + pool->nextpool = next; + pool->prevpool = next; + next->nextpool = pool; + next->prevpool = pool; + pool->ref.count = 1; + if (pool->szidx == size) { + /* Luckily, this pool last contained blocks + * of the same size class, so its header + * and free list are already initialized. */ - pool->szidx = size; - size = INDEX2SIZE(size); - bp = (block *)pool + POOL_OVERHEAD; - pool->nextoffset = POOL_OVERHEAD + (size << 1); - pool->maxnextoffset = POOL_SIZE - size; - pool->freeblock = bp + size; - *(block **)(pool->freeblock) = NULL; - goto success; + bp = pool->freeblock; + assert(bp != NULL); + pool->freeblock = *(block **)bp; + return bp; + } + /* + * Initialize the pool header, set up the free list to + * contain just the second block, and return the first + * block. + */ + pool->szidx = size; + size = INDEX2SIZE(size); + bp = (block *)pool + POOL_OVERHEAD; + pool->nextoffset = POOL_OVERHEAD + (size << 1); + pool->maxnextoffset = POOL_SIZE - size; + pool->freeblock = bp + size; + *(block **)(pool->freeblock) = NULL; + return bp; +} + +/* pymalloc allocator + + Return 1 if pymalloc allocated memory and wrote the pointer into *ptr_p. + + Return 0 if pymalloc failed to allocate the memory block: on bigger + requests, on error in the code below (as a last chance to serve the request) + or when the max memory limit has been reached. +*/ +static inline int +pymalloc_alloc(void *ctx, void **ptr_p, size_t nbytes) +{ +#ifdef WITH_VALGRIND + if (UNLIKELY(running_on_valgrind == -1)) { + running_on_valgrind = RUNNING_ON_VALGRIND; + } + if (UNLIKELY(running_on_valgrind)) { + return 0; + } +#endif + + if (UNLIKELY(nbytes == 0)) { + return 0; + } + if (UNLIKELY(nbytes > SMALL_REQUEST_THRESHOLD)) { + return 0; } - /* Carve off a new pool. */ - assert(usable_arenas->nfreepools > 0); - assert(usable_arenas->freepools == NULL); - pool = (poolp)usable_arenas->pool_address; - assert((block*)pool <= (block*)usable_arenas->address + - ARENA_SIZE - POOL_SIZE); - pool->arenaindex = (uint)(usable_arenas - arenas); - assert(&arenas[pool->arenaindex] == usable_arenas); - pool->szidx = DUMMY_SIZE_IDX; - usable_arenas->pool_address += POOL_SIZE; - --usable_arenas->nfreepools; + uint size = (uint)(nbytes - 1) >> ALIGNMENT_SHIFT; + poolp pool = usedpools[size + size]; + block *bp; + + if (LIKELY(pool != pool->nextpool)) { + /* + * There is a used pool for this size class. + * Pick up the head block of its free list. 
+ */ + ++pool->ref.count; + bp = pool->freeblock; - if (usable_arenas->nfreepools == 0) { - assert(usable_arenas->nextarena == NULL || - usable_arenas->nextarena->prevarena == - usable_arenas); - /* Unlink the arena: it is completely allocated. */ - usable_arenas = usable_arenas->nextarena; - if (usable_arenas != NULL) { - usable_arenas->prevarena = NULL; - assert(usable_arenas->address != 0); + if (UNLIKELY((pool->freeblock = *(block **)bp) == NULL)) { + // Reached the end of the free list, try to extend it. + pymalloc_pool_extend(pool, size); + } + } + else { + /* There isn't a pool of the right size class immediately + * available: use a free pool. + */ + bp = allocate_from_new_pool(size); + if (UNLIKELY(bp == NULL)) { + return 0; } } - goto init_pool; - -success: assert(bp != NULL); *ptr_p = (void *)bp; return 1; - -failed: - return 0; } @@ -1638,7 +1642,7 @@ static void * _PyObject_Malloc(void *ctx, size_t nbytes) { void* ptr; - if (pymalloc_alloc(ctx, &ptr, nbytes)) { + if (LIKELY(pymalloc_alloc(ctx, &ptr, nbytes))) { return ptr; } @@ -1658,7 +1662,7 @@ _PyObject_Calloc(void *ctx, size_t nelem, size_t elsize) assert(elsize == 0 || nelem <= (size_t)PY_SSIZE_T_MAX / elsize); size_t nbytes = nelem * elsize; - if (pymalloc_alloc(ctx, &ptr, nbytes)) { + if (LIKELY(pymalloc_alloc(ctx, &ptr, nbytes))) { memset(ptr, 0, nbytes); return ptr; } @@ -1671,88 +1675,37 @@ _PyObject_Calloc(void *ctx, size_t nelem, size_t elsize) } -/* Free a memory block allocated by pymalloc_alloc(). - Return 1 if it was freed. - Return 0 if the block was not allocated by pymalloc_alloc(). */ -static int -pymalloc_free(void *ctx, void *p) +static void +insert_to_usedpool(poolp pool) { - poolp pool; - block *lastfree; - poolp next, prev; - uint size; + assert(pool->ref.count > 0); /* else the pool is empty */ - assert(p != NULL); + uint size = pool->szidx; + poolp next = usedpools[size + size]; + poolp prev = next->prevpool; -#ifdef WITH_VALGRIND - if (UNLIKELY(running_on_valgrind > 0)) { - return 0; - } -#endif + /* insert pool before next: prev <-> pool <-> next */ + pool->nextpool = next; + pool->prevpool = prev; + next->prevpool = pool; + prev->nextpool = pool; +} - pool = POOL_ADDR(p); - if (!address_in_range(p, pool)) { - return 0; - } - /* We allocated this address. */ - - /* Link p to the start of the pool's freeblock list. Since - * the pool had at least the p block outstanding, the pool - * wasn't empty (so it's already in a usedpools[] list, or - * was full and is in no list -- it's not in the freeblocks - * list in any case). - */ - assert(pool->ref.count > 0); /* else it was empty */ - *(block **)p = lastfree = pool->freeblock; - pool->freeblock = (block *)p; - if (!lastfree) { - /* Pool was full, so doesn't currently live in any list: - * link it to the front of the appropriate usedpools[] list. - * This mimics LRU pool usage for new allocations and - * targets optimal filling when several pools contain - * blocks of the same size class. - */ - --pool->ref.count; - assert(pool->ref.count > 0); /* else the pool is empty */ - size = pool->szidx; - next = usedpools[size + size]; - prev = next->prevpool; - - /* insert pool before next: prev <-> pool <-> next */ - pool->nextpool = next; - pool->prevpool = prev; - next->prevpool = pool; - prev->nextpool = pool; - goto success; - } - - struct arena_object* ao; - uint nf; /* ao->nfreepools */ - - /* freeblock wasn't NULL, so the pool wasn't full, - * and the pool is in a usedpools[] list. 
- */ - if (--pool->ref.count != 0) { - /* pool isn't empty: leave it in usedpools */ - goto success; - } - /* Pool is now empty: unlink from usedpools, and - * link to the front of freepools. This ensures that - * previously freed pools will be allocated later - * (being not referenced, they are perhaps paged out). - */ - next = pool->nextpool; - prev = pool->prevpool; +static void +insert_to_freepool(poolp pool) +{ + poolp next = pool->nextpool; + poolp prev = pool->prevpool; next->prevpool = prev; prev->nextpool = next; /* Link the pool to freepools. This is a singly-linked * list, and pool->prevpool isn't used there. */ - ao = &arenas[pool->arenaindex]; + struct arena_object *ao = &arenas[pool->arenaindex]; pool->nextpool = ao->freepools; ao->freepools = pool; - nf = ao->nfreepools; + uint nf = ao->nfreepools; /* If this is the rightmost arena with this number of free pools, * nfp2lasta[nf] needs to change. Caution: if nf is 0, there * are no arenas in usable_arenas with that value. @@ -1826,7 +1779,7 @@ pymalloc_free(void *ctx, void *p) ao->address = 0; /* mark unassociated */ --narenas_currently_allocated; - goto success; + return; } if (nf == 1) { @@ -1845,7 +1798,7 @@ pymalloc_free(void *ctx, void *p) nfp2lasta[1] = ao; } - goto success; + return; } /* If this arena is now out of order, we need to keep @@ -1862,7 +1815,7 @@ pymalloc_free(void *ctx, void *p) /* If this was the rightmost of the old size, it remains in place. */ if (ao == lastnf) { /* Case 4. Nothing to do. */ - goto success; + return; } /* If ao were the only arena in the list, the last block would have * gotten us out. @@ -1898,10 +1851,65 @@ pymalloc_free(void *ctx, void *p) assert(ao->nextarena == NULL || ao->nextarena->prevarena == ao); assert((usable_arenas == ao && ao->prevarena == NULL) || ao->prevarena->nextarena == ao); +} - goto success; +/* Free a memory block allocated by pymalloc_alloc(). + Return 1 if it was freed. + Return 0 if the block was not allocated by pymalloc_alloc(). */ +static inline int +pymalloc_free(void *ctx, void *p) +{ + assert(p != NULL); -success: +#ifdef WITH_VALGRIND + if (UNLIKELY(running_on_valgrind > 0)) { + return 0; + } +#endif + + poolp pool = POOL_ADDR(p); + if (UNLIKELY(!address_in_range(p, pool))) { + return 0; + } + /* We allocated this address. */ + + /* Link p to the start of the pool's freeblock list. Since + * the pool had at least the p block outstanding, the pool + * wasn't empty (so it's already in a usedpools[] list, or + * was full and is in no list -- it's not in the freeblocks + * list in any case). + */ + assert(pool->ref.count > 0); /* else it was empty */ + block *lastfree = pool->freeblock; + *(block **)p = lastfree; + pool->freeblock = (block *)p; + pool->ref.count--; + + if (UNLIKELY(lastfree == NULL)) { + /* Pool was full, so doesn't currently live in any list: + * link it to the front of the appropriate usedpools[] list. + * This mimics LRU pool usage for new allocations and + * targets optimal filling when several pools contain + * blocks of the same size class. + */ + insert_to_usedpool(pool); + return 1; + } + + /* freeblock wasn't NULL, so the pool wasn't full, + * and the pool is in a usedpools[] list. + */ + if (LIKELY(pool->ref.count != 0)) { + /* pool isn't empty: leave it in usedpools */ + return 1; + } + + /* Pool is now empty: unlink from usedpools, and + * link to the front of freepools. This ensures that + * previously freed pools will be allocated later + * (being not referenced, they are perhaps paged out). 
+ */ + insert_to_freepool(pool); return 1; } @@ -1914,7 +1922,7 @@ _PyObject_Free(void *ctx, void *p) return; } - if (!pymalloc_free(ctx, p)) { + if (UNLIKELY(!pymalloc_free(ctx, p))) { /* pymalloc didn't allocate this address */ PyMem_RawFree(p); raw_allocated_blocks--;
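
The core of this change is a hot/cold path split: the frequently taken branches
stay in a small "static inline" function whose branches are annotated with
LIKELY()/UNLIKELY(), while the rare work (extending a free list, taking a new
pool or arena) is moved into separate, non-inlined helpers. The sketch below
illustrates that pattern in isolation; it is not CPython code, and the names
fast_path_alloc() and refill_slow() are hypothetical.

    #include <stddef.h>
    #include <stdlib.h>

    /* Same branch-hint macros the patch defines in obmalloc.c. */
    #if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__)
    #  define UNLIKELY(value) __builtin_expect((value), 0)
    #  define LIKELY(value)   __builtin_expect((value), 1)
    #else
    #  define UNLIKELY(value) (value)
    #  define LIKELY(value)   (value)
    #endif

    /* Head of an intrusive free list: the first word of every free block
     * stores a pointer to the next free block. */
    static void *freelist = NULL;

    /* Cold path, deliberately outlined so it does not bloat the callers of
     * fast_path_alloc(); here it simply falls back to malloc(). */
    static void *refill_slow(size_t nbytes)
    {
        return malloc(nbytes);
    }

    /* Hot path: small enough for the compiler to inline everywhere. */
    static inline void *fast_path_alloc(size_t nbytes)
    {
        void *p = freelist;
        if (LIKELY(p != NULL)) {
            freelist = *(void **)p;   /* pop the head of the free list */
            return p;
        }
        return refill_slow(nbytes);
    }

Without PGO, the LIKELY()/UNLIKELY() hints and the outlined helpers give the
compiler roughly the hot-path information it would otherwise only obtain from
profile data, which is the point made in the commit message.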
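
On the free side, the refactored pymalloc_free() keeps only the block push and
two cheap tests inline, delegating everything else to insert_to_usedpool() and
insert_to_freepool(). The following sketch shows that bookkeeping with
hypothetical toy_* names and a reduced pool structure; it illustrates the
logic only, not the real pool layout.

    #include <stddef.h>

    typedef unsigned char block;

    typedef struct toy_pool {
        unsigned int ref_count;   /* number of blocks currently allocated */
        block *freeblock;         /* head of this pool's free-block list  */
    } toy_pool;

    /* Stand-ins for the patch's outlined helpers insert_to_usedpool() and
     * insert_to_freepool(); bodies omitted in this sketch. */
    static void toy_insert_to_usedpool(toy_pool *pool) { (void)pool; }
    static void toy_insert_to_freepool(toy_pool *pool) { (void)pool; }

    static void toy_free_block(toy_pool *pool, void *p)
    {
        /* Push p onto the pool's free list; the old head tells us whether
         * the pool was full before this free. */
        block *lastfree = pool->freeblock;
        *(block **)p = lastfree;
        pool->freeblock = (block *)p;
        pool->ref_count--;

        if (lastfree == NULL) {
            /* Pool was full, hence on no usedpools[] list: front-link it so
             * upcoming allocations of this size class reuse it first. */
            toy_insert_to_usedpool(pool);
            return;
        }
        if (pool->ref_count == 0) {
            /* Pool just became empty: hand it back to its arena's free
             * pools so it can be reused for any size class. */
            toy_insert_to_freepool(pool);
        }
    }

Keeping the two helpers out of line is what lets the common case (push the
block, adjust the count, return) compile down to a handful of instructions
inside _PyObject_Free().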