GH-90997: Improve inline cache performance for MSVC (GH-96781)

This commit is contained in:
Brandt Bucher 2022-09-14 17:05:04 -07:00 committed by GitHub
parent 9f1814723f
commit a83fdf2563
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 25 additions and 78 deletions

View File

@ -285,110 +285,55 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void);
#define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) ((void)0)
#endif // !Py_STATS
// Cache values are only valid in memory, so use native endianness.
#ifdef WORDS_BIGENDIAN
// Utility functions for reading/writing 32/64-bit values in the inline caches.
// Great care should be taken to ensure that these functions remain correct and
// performant! They should compile to just "move" instructions on all supported
// compilers and platforms.
// We use memcpy to let the C compiler handle unaligned accesses and endianness
// issues for us. It also seems to produce better code than manual copying for
// most compilers (see https://blog.regehr.org/archives/959 for more info).
static inline void
write_u32(uint16_t *p, uint32_t val)
{
p[0] = (uint16_t)(val >> 16);
p[1] = (uint16_t)(val >> 0);
memcpy(p, &val, sizeof(val));
}
static inline void
write_u64(uint16_t *p, uint64_t val)
{
p[0] = (uint16_t)(val >> 48);
p[1] = (uint16_t)(val >> 32);
p[2] = (uint16_t)(val >> 16);
p[3] = (uint16_t)(val >> 0);
memcpy(p, &val, sizeof(val));
}
static inline void
write_obj(uint16_t *p, PyObject *val)
{
memcpy(p, &val, sizeof(val));
}
static inline uint32_t
read_u32(uint16_t *p)
{
uint32_t val = 0;
val |= (uint32_t)p[0] << 16;
val |= (uint32_t)p[1] << 0;
uint32_t val;
memcpy(&val, p, sizeof(val));
return val;
}
static inline uint64_t
read_u64(uint16_t *p)
{
uint64_t val = 0;
val |= (uint64_t)p[0] << 48;
val |= (uint64_t)p[1] << 32;
val |= (uint64_t)p[2] << 16;
val |= (uint64_t)p[3] << 0;
uint64_t val;
memcpy(&val, p, sizeof(val));
return val;
}
#else
static inline void
write_u32(uint16_t *p, uint32_t val)
{
p[0] = (uint16_t)(val >> 0);
p[1] = (uint16_t)(val >> 16);
}
static inline void
write_u64(uint16_t *p, uint64_t val)
{
p[0] = (uint16_t)(val >> 0);
p[1] = (uint16_t)(val >> 16);
p[2] = (uint16_t)(val >> 32);
p[3] = (uint16_t)(val >> 48);
}
static inline uint32_t
read_u32(uint16_t *p)
{
uint32_t val = 0;
val |= (uint32_t)p[0] << 0;
val |= (uint32_t)p[1] << 16;
return val;
}
static inline uint64_t
read_u64(uint16_t *p)
{
uint64_t val = 0;
val |= (uint64_t)p[0] << 0;
val |= (uint64_t)p[1] << 16;
val |= (uint64_t)p[2] << 32;
val |= (uint64_t)p[3] << 48;
return val;
}
#endif
static inline void
write_obj(uint16_t *p, PyObject *obj)
{
uintptr_t val = (uintptr_t)obj;
#if SIZEOF_VOID_P == 8
write_u64(p, val);
#elif SIZEOF_VOID_P == 4
write_u32(p, val);
#else
#error "SIZEOF_VOID_P must be 4 or 8"
#endif
}
static inline PyObject *
read_obj(uint16_t *p)
{
uintptr_t val;
#if SIZEOF_VOID_P == 8
val = read_u64(p);
#elif SIZEOF_VOID_P == 4
val = read_u32(p);
#else
#error "SIZEOF_VOID_P must be 4 or 8"
#endif
return (PyObject *)val;
PyObject *val;
memcpy(&val, p, sizeof(val));
return val;
}
/* See Objects/exception_handling_notes.txt for details.

View File

@ -0,0 +1,2 @@
Improve the performance of reading and writing inline bytecode caches on
some platforms.