From ddea208be9e2a8fa281e25ebbc890378dd2aa286 Mon Sep 17 00:00:00 2001 From: Tim Peters Date: Sat, 23 Mar 2002 10:03:50 +0000 Subject: [PATCH] Give Python a debug-mode pymalloc, much as sketched on Python-Dev. When WITH_PYMALLOC is defined, define PYMALLOC_DEBUG to enable the debug allocator. This can be done independent of build type (release or debug). A debug build automatically defines PYMALLOC_DEBUG when pymalloc is enabled. It's a detected error to define PYMALLOC_DEBUG when pymalloc isn't enabled. Two debugging entry points defined only under PYMALLOC_DEBUG: + _PyMalloc_DebugCheckAddress(const void *p) can be used (e.g., from gdb) to sanity-check a memory block obtained from pymalloc. It sprays info to stderr (see next) and dies via Py_FatalError if the block is detectably damaged. + _PyMalloc_DebugDumpAddress(const void *p) can be used to spray info about a debug memory block to stderr. A tiny start at implementing "API family" checks isn't good for anything yet. _PyMalloc_DebugRealloc() has been optimized to do little when the new size is <= old size. However, if the new size is larger, it really can't call the underlying realloc() routine without either violating its contract, or knowing something non-trivial about how the underlying realloc() works. A memcpy is always done in this case. This was a disaster for (and only) one of the std tests: test_bufio creates single text file lines up to a million characters long. On Windows, fileobject.c's get_line() uses the horridly funky getline_via_fgets(), which keeps growing and growing a string object hoping to find a newline. It grew the string object 1000 bytes each time, so for a million-character string it took approximately forever (I gave up after a few minutes). So, also: fileobject.c, getline_via_fgets(): When a single line is outrageously long, grow the string object at a mildly exponential rate, instead of just 1000 bytes at a time. 
That's enough so that a debug-build test_bufio finishes in about 5 seconds on my Win98SE box. I'm curious to try this on Win2K, because it has very different memory behavior than Win9X, and test_bufio always took a factor of 10 longer to complete on Win2K. It *could* be that the endless reallocs were simply killing it on Win2K even in the release build. --- Include/Python.h | 9 ++ Include/pymem.h | 20 ++- Objects/fileobject.c | 13 +- Objects/obmalloc.c | 332 ++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 346 insertions(+), 28 deletions(-) diff --git a/Include/Python.h b/Include/Python.h index 9724fb7965a..d1ffc73c416 100644 --- a/Include/Python.h +++ b/Include/Python.h @@ -61,6 +61,15 @@ #include "pyport.h" +/* Debug-mode build with pymalloc implies PYMALLOC_DEBUG. + * PYMALLOC_DEBUG is in error if pymalloc is not in use. + */ +#if defined(Py_DEBUG) && defined(WITH_PYMALLOC) && !defined(PYMALLOC_DEBUG) +#define PYMALLOC_DEBUG +#endif +#if defined(PYMALLOC_DEBUG) && !defined(WITH_PYMALLOC) +#error "PYMALLOC_DEBUG requires WITH_PYMALLOC" +#endif #include "pymem.h" #include "object.h" diff --git a/Include/pymem.h b/Include/pymem.h index 10e915f3ba6..a321f5b23bb 100644 --- a/Include/pymem.h +++ b/Include/pymem.h @@ -89,20 +89,34 @@ extern DL_IMPORT(void) PyMem_Free(void *); it is recommended to write the test explicitly in the code. Note that according to ANSI C, free(NULL) has no effect. 
*/ - + /* pymalloc (private to the interpreter) */ #ifdef WITH_PYMALLOC DL_IMPORT(void *) _PyMalloc_Malloc(size_t nbytes); DL_IMPORT(void *) _PyMalloc_Realloc(void *p, size_t nbytes); DL_IMPORT(void) _PyMalloc_Free(void *p); + +#ifdef PYMALLOC_DEBUG +DL_IMPORT(void *) _PyMalloc_DebugMalloc(size_t nbytes, int family); +DL_IMPORT(void *) _PyMalloc_DebugRealloc(void *p, size_t nbytes, int family); +DL_IMPORT(void) _PyMalloc_DebugFree(void *p, int family); +DL_IMPORT(void) _PyMalloc_DebugDumpAddress(const void *p); +DL_IMPORT(void) _PyMalloc_DebugCheckAddress(const void *p); +#define _PyMalloc_MALLOC(N) _PyMalloc_DebugMalloc(N, 0) +#define _PyMalloc_REALLOC(P, N) _PyMalloc_DebugRealloc(P, N, 0) +#define _PyMalloc_FREE(P) _PyMalloc_DebugFree(P, 0) + +#else /* WITH_PYMALLOC && ! PYMALLOC_DEBUG */ #define _PyMalloc_MALLOC _PyMalloc_Malloc #define _PyMalloc_REALLOC _PyMalloc_Realloc #define _PyMalloc_FREE _PyMalloc_Free -#else +#endif + +#else /* ! WITH_PYMALLOC */ #define _PyMalloc_MALLOC PyMem_MALLOC #define _PyMalloc_REALLOC PyMem_REALLOC #define _PyMalloc_FREE PyMem_FREE -#endif +#endif /* WITH_PYMALLOC */ #ifdef __cplusplus diff --git a/Objects/fileobject.c b/Objects/fileobject.c index 47e6b17a88e..6a82cce7030 100644 --- a/Objects/fileobject.c +++ b/Objects/fileobject.c @@ -772,13 +772,9 @@ getline_via_fgets(FILE *fp) * cautions about boosting that. 300 was chosen because the worst real-life * text-crunching job reported on Python-Dev was a mail-log crawler where over * half the lines were 254 chars. - * INCBUFSIZE is the amount by which we grow the buffer, if MAXBUFSIZE isn't - * enough. It doesn't much matter what this is set to: we only get here for - * absurdly long lines anyway. 
*/ #define INITBUFSIZE 100 #define MAXBUFSIZE 300 -#define INCBUFSIZE 1000 char* p; /* temp */ char buf[MAXBUFSIZE]; PyObject* v; /* the string object result */ @@ -786,6 +782,7 @@ getline_via_fgets(FILE *fp) char* pvend; /* address one beyond last free slot */ size_t nfree; /* # of free buffer slots; pvend-pvfree */ size_t total_v_size; /* total # of slots in buffer */ + size_t increment; /* amount to increment the buffer */ /* Optimize for normal case: avoid _PyString_Resize if at all * possible via first reading into stack buffer "buf". @@ -853,7 +850,7 @@ getline_via_fgets(FILE *fp) /* The stack buffer isn't big enough; malloc a string object and read * into its buffer. */ - total_v_size = MAXBUFSIZE + INCBUFSIZE; + total_v_size = MAXBUFSIZE << 1; v = PyString_FromStringAndSize((char*)NULL, (int)total_v_size); if (v == NULL) return v; @@ -897,7 +894,8 @@ getline_via_fgets(FILE *fp) } /* expand buffer and try again */ assert(*(pvend-1) == '\0'); - total_v_size += INCBUFSIZE; + increment = total_v_size >> 2; /* mild exponential growth */ + total_v_size += increment; if (total_v_size > INT_MAX) { PyErr_SetString(PyExc_OverflowError, "line is longer than a Python string can hold"); @@ -907,14 +905,13 @@ getline_via_fgets(FILE *fp) if (_PyString_Resize(&v, (int)total_v_size) < 0) return NULL; /* overwrite the trailing null byte */ - pvfree = BUF(v) + (total_v_size - INCBUFSIZE - 1); + pvfree = BUF(v) + (total_v_size - increment - 1); } if (BUF(v) + total_v_size != p) _PyString_Resize(&v, p - BUF(v)); return v; #undef INITBUFSIZE #undef MAXBUFSIZE -#undef INCBUFSIZE } #endif /* ifdef USE_FGETS_IN_GETLINE */ diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c index 4e141c75c81..3e9465f72a1 100644 --- a/Objects/obmalloc.c +++ b/Objects/obmalloc.c @@ -623,24 +623,10 @@ _PyMalloc_Realloc(void *p, size_t nbytes) return (void *)bp; } -/* calloc */ - -/* -- unused -- -void * -_PyMalloc_Calloc(size_t nbel, size_t elsz) -{ - void *p; - size_t nbytes; - - nbytes = nbel * 
elsz; - p = _PyMalloc_Malloc(nbytes); - if (p != NULL) - memset(p, 0, nbytes); - return p; -} -*/ - #else /* ! WITH_PYMALLOC */ + +/*==========================================================================*/ +/* pymalloc not enabled: Redirect the entry points to the PyMem family. */ void * _PyMalloc_Malloc(size_t n) { @@ -686,3 +672,315 @@ _PyMalloc_Del(PyObject *op) { _PyMalloc_FREE(op); } + +#ifdef PYMALLOC_DEBUG +/*==========================================================================*/ +/* A cross-platform debugging allocator. */ + +#define PYMALLOC_CLEANBYTE 0xCB /* uninitialized memory */ +#define PYMALLOC_DEADBYTE 0xDB /* free()ed memory */ +#define PYMALLOC_FORBIDDENBYTE 0xFB /* unusable memory */ + +static ulong serialno = 0; /* incremented on each debug {m,re}alloc */ + +/* Read 4 bytes at p as a big-endian ulong. */ +static ulong +read4(const void *p) +{ + const unsigned char *q = (unsigned char *)p; + return ((ulong)q[0] << 24) | + ((ulong)q[1] << 16) | + ((ulong)q[2] << 8) | + (ulong)q[3]; +} + +/* Write the 4 least-significant bytes of n as a big-endian unsigned int, + MSB at address p, LSB at p+3. 
*/ +static void +write4(void *p, ulong n) +{ + unsigned char *q = (unsigned char *)p; + q[0] = (unsigned char)((n >> 24) & 0xff); + q[1] = (unsigned char)((n >> 16) & 0xff); + q[2] = (unsigned char)((n >> 8) & 0xff); + q[3] = (unsigned char)( n & 0xff); +} + +static void +check_family(const void *p, int family) +{ + const uchar *q = (const uchar *)p; + int original_family; + char buf[200]; + + assert(p != NULL); + original_family = (int)*(q-4); + if (family != original_family) { + /* XXX better msg */ + PyOS_snprintf(buf, sizeof(buf), + "free or realloc from family #%d called, " + "but block was allocated by family #%d", + family, original_family); + _PyMalloc_DebugDumpAddress(p); + Py_FatalError(buf); + } +} + +/* The debug malloc asks for 16 extra bytes and fills them with useful stuff, + here calling the underlying malloc's result p: + +p[0:4] + Number of bytes originally asked for. 4-byte unsigned integer, + big-endian (easier to read in a memory dump). +p[4] + The API "family" this malloc call belongs to. XXX todo XXX +p[5:8] + Copies of PYMALLOC_FORBIDDENBYTE. Used to catch under- writes + and reads. +p[8:8+n] + The requested memory, filled with copies of PYMALLOC_CLEANBYTE. + Used to catch reference to uninitialized memory. + &p[8] is returned. Note that this is 8-byte aligned if PyMalloc + handled the request itself. +p[8+n:8+n+4] + Copies of PYMALLOC_FORBIDDENBYTE. Used to catch over- writes + and reads. +p[8+n+4:8+n+8] + A serial number, incremented by 1 on each call to _PyMalloc_DebugMalloc + and _PyMalloc_DebugRealloc. + 4-byte unsigned integer, big-endian. + If "bad memory" is detected later, the serial number gives an + excellent way to set a breakpoint on the next run, to capture the + instant at which this block was passed out. 
+*/ + +void * +_PyMalloc_DebugMalloc(size_t nbytes, int family) +{ + uchar *p; /* base address of malloc'ed block */ + uchar *q; /* p + 8 + nbytes == start of the trailing pad bytes */ + size_t total; /* nbytes + 16 */ + + assert(family == 0); + + ++serialno; + total = nbytes + 16; + if (total < nbytes || (total >> 31) > 1) { + /* overflow, or we can't represent it in 4 bytes */ + /* Obscure: can't do (total >> 32) != 0 instead, because + C doesn't define what happens for a right-shift of 32 + when size_t is a 32-bit type. At least C guarantees + size_t is an unsigned type. */ + return NULL; + } + + p = _PyMalloc_Malloc(total); /* XXX derive from family */ + if (p == NULL) + return NULL; + + write4(p, nbytes); + p[4] = (uchar)family; + p[5] = p[6] = p[7] = PYMALLOC_FORBIDDENBYTE; + + if (nbytes > 0) + memset(p+8, PYMALLOC_CLEANBYTE, nbytes); + + q = p + 8 + nbytes; + q[0] = q[1] = q[2] = q[3] = PYMALLOC_FORBIDDENBYTE; + write4(q+4, serialno); + + return p+8; +} + +/* The debug free first uses the address to find the number of bytes + originally asked for, then checks the 8 bytes on each end for + sanity (in particular, that the PYMALLOC_FORBIDDENBYTEs are still + intact). + Then fills the original bytes with PYMALLOC_DEADBYTE. + Then calls the underlying free. 
+*/ +void +_PyMalloc_DebugFree(void *p, int family) +{ + uchar *q = (uchar*)p; + size_t nbytes; + + assert(family == 0); + + if (p == NULL) + return; + check_family(p, family); + _PyMalloc_DebugCheckAddress(p); + nbytes = read4(q-8); + if (nbytes > 0) + memset(q, PYMALLOC_DEADBYTE, nbytes); + _PyMalloc_Free(q-8); /* XXX derive from family */ +} + +void * +_PyMalloc_DebugRealloc(void *p, size_t nbytes, int family) +{ + uchar *q = (uchar *)p; + size_t original_nbytes; + uchar *fresh; /* new memory block, if needed */ + + assert(family == 0); + + if (p == NULL) + return _PyMalloc_DebugMalloc(nbytes, family); + + check_family(p, family); + _PyMalloc_DebugCheckAddress(p); + + original_nbytes = read4(q-8); + if (nbytes == original_nbytes) { + /* note that this case is likely to be common due to the + way Python appends to lists */ + ++serialno; + write4(q + nbytes + 4, serialno); + return p; + } + + if (nbytes < original_nbytes) { + /* shrinking -- leave the guts alone, except to + fill the excess with DEADBYTE */ + const size_t excess = original_nbytes - nbytes; + ++serialno; + write4(q-8, nbytes); + /* kill the excess bytes plus the trailing 8 pad bytes */ + memset(q + nbytes, PYMALLOC_DEADBYTE, excess + 8); + q += nbytes; + q[0] = q[1] = q[2] = q[3] = PYMALLOC_FORBIDDENBYTE; + write4(q+4, serialno); + return p; + } + + /* More memory is needed: get it, copy over the first original_nbytes + of the original data, and free the original memory. 
*/ + fresh = (uchar *)_PyMalloc_DebugMalloc(nbytes, family); + if (fresh != NULL && original_nbytes > 0) + memcpy(fresh, p, original_nbytes); + _PyMalloc_DebugFree(p, family); + return fresh; +} + +void +_PyMalloc_DebugCheckAddress(const void *p) +{ + const uchar *q = (const uchar *)p; + char *msg = NULL; + + if (p == NULL) + msg = "didn't expect a NULL pointer"; + + else if (*(q-3) != PYMALLOC_FORBIDDENBYTE || + *(q-2) != PYMALLOC_FORBIDDENBYTE || + *(q-1) != PYMALLOC_FORBIDDENBYTE) + msg = "bad leading pad byte"; + + else { + const ulong nbytes = read4(q-8); + const uchar *tail = q + nbytes; + int i; + for (i = 0; i < 4; ++i) { + if (tail[i] != PYMALLOC_FORBIDDENBYTE) { + msg = "bad trailing pad byte"; + break; + } + } + } + + if (msg != NULL) { + _PyMalloc_DebugDumpAddress(p); + Py_FatalError(msg); + } +} + +void +_PyMalloc_DebugDumpAddress(const void *p) +{ + const uchar *q = (const uchar *)p; + const uchar *tail; + ulong nbytes, serial; + + fprintf(stderr, "Debug memory block at address p=%p:\n", p); + if (p == NULL) + return; + + nbytes = read4(q-8); + fprintf(stderr, " %lu bytes originally allocated\n", nbytes); + fprintf(stderr, " from API family #%d\n", *(q-4)); + + /* In case this is nuts, check the pad bytes before trying to read up + the serial number (the address deref could blow up). 
*/ + + fprintf(stderr, " the 3 pad bytes at p-3 are "); + if (*(q-3) == PYMALLOC_FORBIDDENBYTE && + *(q-2) == PYMALLOC_FORBIDDENBYTE && + *(q-1) == PYMALLOC_FORBIDDENBYTE) { + fprintf(stderr, "PYMALLOC_FORBIDDENBYTE, as expected\n"); + } + else { + int i; + fprintf(stderr, "not all PYMALLOC_FORBIDDENBYTE (0x%02x):\n", + PYMALLOC_FORBIDDENBYTE); + for (i = 3; i >= 1; --i) { + const uchar byte = *(q-i); + fprintf(stderr, " at p-%d: 0x%02x", i, byte); + if (byte != PYMALLOC_FORBIDDENBYTE) + fputs(" *** OUCH", stderr); + fputc('\n', stderr); + } + } + + tail = q + nbytes; + fprintf(stderr, " the 4 pad bytes at tail=%p are ", tail); + if (tail[0] == PYMALLOC_FORBIDDENBYTE && + tail[1] == PYMALLOC_FORBIDDENBYTE && + tail[2] == PYMALLOC_FORBIDDENBYTE && + tail[3] == PYMALLOC_FORBIDDENBYTE) { + fprintf(stderr, "PYMALLOC_FORBIDDENBYTE, as expected\n"); + } + else { + int i; + fprintf(stderr, "not all PYMALLOC_FORBIDDENBYTE (0x%02x):\n", + PYMALLOC_FORBIDDENBYTE); + for (i = 0; i < 4; ++i) { + const uchar byte = tail[i]; + fprintf(stderr, " at tail+%d: 0x%02x", + i, byte); + if (byte != PYMALLOC_FORBIDDENBYTE) + fputs(" *** OUCH", stderr); + fputc('\n', stderr); + } + } + + serial = read4(tail+4); + fprintf(stderr, " the block was made by call #%lu to " + "debug malloc/realloc\n", serial); + + if (nbytes > 0) { + int i = 0; + fprintf(stderr, " data at p:"); + /* print up to 8 bytes at the start */ + while (q < tail && i < 8) { + fprintf(stderr, " %02x", *q); + ++i; + ++q; + } + /* and up to 8 at the end */ + if (q < tail) { + if (tail - q > 8) { + fprintf(stderr, " ..."); + q = tail - 8; + } + while (q < tail) { + fprintf(stderr, " %02x", *q); + ++q; + } + } + fprintf(stderr, "\n"); + } +} + +#endif /* PYMALLOC_DEBUG */