Small function call optimization and special build option for call stats.

-DCALL_PROFILE: Count the number of function calls executed.

When this symbol is defined, the ceval mainloop and helper functions
count the number of function calls made.  It keeps detailed statistics
about what kind of object was called and whether the call hit any of
the special fast paths in the code.

Optimization:

When we take the fast_function() path, which seems to be taken for
most function calls, and there is minimal frame setup to do, avoid
call PyEval_EvalCodeEx().  The eval code ex function does a lot of
work to handle keywords args and star args, free variables,
generators, etc.  The inlined version simply allocates the frame and
copies the arguments values into the frame.

The optimization gets a little help from compile.c which adds a
CO_NOFREE flag to code objects that don't have free variables or cell
variables.  This change allows fast_function() to get into the fast
path with fewer tests.

I measure a couple of percent speedup in pystone with this change, but
there's surely more that can be done.
This commit is contained in:
Jeremy Hylton 2003-02-05 23:13:00 +00:00
parent f3f4af5521
commit 985eba53f5
6 changed files with 189 additions and 14 deletions

View File

@ -48,6 +48,8 @@ PyAPI_FUNC(int) Py_GetRecursionLimit(void);
PyAPI_FUNC(char *) PyEval_GetFuncName(PyObject *);
PyAPI_FUNC(char *) PyEval_GetFuncDesc(PyObject *);
PyAPI_FUNC(PyObject *) PyEval_GetCallStats(PyObject *);
/* this used to be handled on a per-thread basis - now just two globals */
PyAPI_DATA(volatile int) _Py_Ticker;
PyAPI_DATA(int) _Py_CheckInterval;

View File

@ -34,6 +34,12 @@ typedef struct {
#define CO_VARKEYWORDS 0x0008
#define CO_NESTED 0x0010
#define CO_GENERATOR 0x0020
/* The CO_NOFREE flag is set if there are no free or cell variables.
This information is redundant, but it allows a single flag test
to determine whether there is any extra work to be done when the
call frame it setup.
*/
#define CO_NOFREE 0x0040
/* XXX Temporary hack. Until generators are a permanent part of the
language, we need a way for a code object to record that generators
were *possible* when it was compiled. This is so code dynamically

View File

@ -199,3 +199,13 @@ sprayed to stdout, such as every opcode and opcode argument and values
pushed onto and popped off the value stack.
Not useful very often, but very useful when needed.
---------------------------------------------------------------------------
CALL_PROFILE introduced for Python 2.3
Count the number of function calls executed.
When this symbol is defined, the ceval mainloop and helper functions
count the number of function calls made. It keeps detailed statistics
about what kind of object was called and whether the call hit any of
the special fast paths in the code.

View File

@ -87,6 +87,62 @@ static long dxp[256];
#endif
#endif
/* Function call profile */
#ifdef CALL_PROFILE
#define PCALL_NUM 11
static int pcall[PCALL_NUM];
#define PCALL_ALL 0
#define PCALL_FUNCTION 1
#define PCALL_FAST_FUNCTION 2
#define PCALL_FASTER_FUNCTION 3
#define PCALL_METHOD 4
#define PCALL_BOUND_METHOD 5
#define PCALL_CFUNCTION 6
#define PCALL_TYPE 7
#define PCALL_GENERATOR 8
#define PCALL_OTHER 9
#define PCALL_POP 10
/* Notes about the statistics
PCALL_FAST stats
FAST_FUNCTION means no argument tuple needs to be created.
FASTER_FUNCTION means that the fast-path frame setup code is used.
If there is a method call where the call can be optimized by changing
the argument tuple and calling the function directly, it gets recorded
twice.
As a result, the relationship among the statistics appears to be
PCALL_ALL == PCALL_FUNCTION + PCALL_METHOD - PCALL_BOUND_METHOD +
PCALL_CFUNCTION + PCALL_TYPE + PCALL_GENERATOR + PCALL_OTHER
PCALL_FUNCTION > PCALL_FAST_FUNCTION > PCALL_FASTER_FUNCTION
PCALL_METHOD > PCALL_BOUND_METHOD
*/
#define PCALL(POS) pcall[POS]++
PyObject *
PyEval_GetCallStats(PyObject *self)
{
return Py_BuildValue("iiiiiiiiii",
pcall[0], pcall[1], pcall[2], pcall[3],
pcall[4], pcall[5], pcall[6], pcall[7],
pcall[8], pcall[9]);
}
#else
#define PCALL(O)
PyObject *
PyEval_GetCallStats(PyObject *self)
{
Py_INCREF(Py_None);
return Py_None;
}
#endif
static PyTypeObject gentype;
typedef struct {
@ -475,6 +531,7 @@ volatile int _Py_Ticker = 100;
PyObject *
PyEval_EvalCode(PyCodeObject *co, PyObject *globals, PyObject *locals)
{
/* XXX raise SystemError if globals is NULL */
return PyEval_EvalCodeEx(co,
globals, locals,
(PyObject **)NULL, 0,
@ -1980,6 +2037,7 @@ eval_frame(PyFrameObject *f)
continue;
case CALL_FUNCTION:
PCALL(PCALL_ALL);
x = call_function(&stack_pointer, oparg);
PUSH(x);
if (x != NULL)
@ -1995,6 +2053,7 @@ eval_frame(PyFrameObject *f)
int flags = (opcode - CALL_FUNCTION) & 3;
int n = na + 2 * nk;
PyObject **pfunc, *func;
PCALL(PCALL_ALL);
if (flags & CALL_FLAG_VAR)
n++;
if (flags & CALL_FLAG_KW)
@ -2317,9 +2376,8 @@ PyEval_EvalCodeEx(PyCodeObject *co, PyObject *globals, PyObject *locals,
return NULL;
}
f = PyFrame_New(tstate, /*back*/
co, /*code*/
globals, locals);
assert(globals != NULL);
f = PyFrame_New(tstate, co, globals, locals);
if (f == NULL)
return NULL;
@ -2520,6 +2578,8 @@ PyEval_EvalCodeEx(PyCodeObject *co, PyObject *globals, PyObject *locals,
Py_XDECREF(f->f_back);
f->f_back = NULL;
PCALL(PCALL_GENERATOR);
/* Create a new generator that owns the ready to run frame
* and return that as the value. */
return gen_new(f);
@ -3198,12 +3258,12 @@ call_function(PyObject ***pp_stack, int oparg)
PyObject *func = *pfunc;
PyObject *x, *w;
/* Always dispatch PyCFunction first, because
these are presumed to be the most frequent
callable object.
/* Always dispatch PyCFunction first, because these are
presumed to be the most frequent callable object.
*/
if (PyCFunction_Check(func) && nk == 0) {
int flags = PyCFunction_GET_FLAGS(func);
PCALL(PCALL_CFUNCTION);
if (flags & (METH_NOARGS | METH_O)) {
PyCFunction meth = PyCFunction_GET_FUNCTION(func);
PyObject *self = PyCFunction_GET_SELF(func);
@ -3229,6 +3289,8 @@ call_function(PyObject ***pp_stack, int oparg)
if (PyMethod_Check(func) && PyMethod_GET_SELF(func) != NULL) {
/* optimize access to bound methods */
PyObject *self = PyMethod_GET_SELF(func);
PCALL(PCALL_METHOD);
PCALL(PCALL_BOUND_METHOD);
Py_INCREF(self);
func = PyMethod_GET_FUNCTION(func);
Py_INCREF(func);
@ -3245,35 +3307,75 @@ call_function(PyObject ***pp_stack, int oparg)
Py_DECREF(func);
}
/* What does this do? */
while ((*pp_stack) > pfunc) {
w = EXT_POP(*pp_stack);
Py_DECREF(w);
PCALL(PCALL_POP);
}
return x;
}
/* The fast_function() function optimize calls for which no argument
tuple is necessary; the objects are passed directly from the stack.
For the simplest case -- a function that takes only positional
arguments and is called with only positional arguments -- it
inlines the most primitive frame setup code from
PyEval_EvalCodeEx(), which vastly reduces the checks that must be
done before evaluating the frame.
*/
static PyObject *
fast_function(PyObject *func, PyObject ***pp_stack, int n, int na, int nk)
{
PyObject *co = PyFunction_GET_CODE(func);
PyCodeObject *co = (PyCodeObject *)PyFunction_GET_CODE(func);
PyObject *globals = PyFunction_GET_GLOBALS(func);
PyObject *argdefs = PyFunction_GET_DEFAULTS(func);
PyObject *closure = PyFunction_GET_CLOSURE(func);
PyObject **d = NULL;
int nd = 0;
PCALL(PCALL_FUNCTION);
PCALL(PCALL_FAST_FUNCTION);
if (argdefs == NULL && co->co_argcount == n &&
co->co_flags == (CO_OPTIMIZED | CO_NEWLOCALS | CO_NOFREE)) {
PyFrameObject *f;
PyObject *retval = NULL;
PyThreadState *tstate = PyThreadState_GET();
PyObject **fastlocals, **stack;
int i;
PCALL(PCALL_FASTER_FUNCTION);
assert(globals != NULL);
/* XXX Perhaps we should create a specialized
PyFrame_New() that doesn't take locals, but does
take builtins without sanity checking them.
*/
f = PyFrame_New(tstate, co, globals, NULL);
if (f == NULL)
return NULL;
fastlocals = f->f_localsplus;
stack = (*pp_stack) - n;
for (i = 0; i < n; i++) {
Py_INCREF(*stack);
fastlocals[i] = *stack++;
}
retval = eval_frame(f);
assert(tstate != NULL);
++tstate->recursion_depth;
Py_DECREF(f);
--tstate->recursion_depth;
return retval;
}
if (argdefs != NULL) {
d = &PyTuple_GET_ITEM(argdefs, 0);
nd = ((PyTupleObject *)argdefs)->ob_size;
}
return PyEval_EvalCodeEx((PyCodeObject *)co, globals,
(PyObject *)NULL, (*pp_stack)-n, na,
(*pp_stack)-2*nk, nk, d, nd,
closure);
return PyEval_EvalCodeEx(co, globals,
(PyObject *)NULL, (*pp_stack)-n, na,
(*pp_stack)-2*nk, nk, d, nd,
PyFunction_GET_CLOSURE(func));
}
static PyObject *
@ -3371,6 +3473,20 @@ do_call(PyObject *func, PyObject ***pp_stack, int na, int nk)
callargs = load_args(pp_stack, na);
if (callargs == NULL)
goto call_fail;
#ifdef CALL_PROFILE
/* At this point, we have to look at the type of func to
update the call stats properly. Do it here so as to avoid
exposing the call stats machinery outside ceval.c
*/
if (PyFunction_Check(func))
PCALL(PCALL_FUNCTION);
else if (PyMethod_Check(func))
PCALL(PCALL_METHOD);
else if (PyType_Check(func))
PCALL(PCALL_TYPE);
else
PCALL(PCALL_OTHER);
#endif
result = PyObject_Call(func, callargs, kwdict);
call_fail:
Py_XDECREF(callargs);
@ -3426,6 +3542,20 @@ ext_do_call(PyObject *func, PyObject ***pp_stack, int flags, int na, int nk)
callargs = update_star_args(na, nstar, stararg, pp_stack);
if (callargs == NULL)
goto ext_call_fail;
#ifdef CALL_PROFILE
/* At this point, we have to look at the type of func to
update the call stats properly. Do it here so as to avoid
exposing the call stats machinery outside ceval.c
*/
if (PyFunction_Check(func))
PCALL(PCALL_FUNCTION);
else if (PyMethod_Check(func))
PCALL(PCALL_METHOD);
else if (PyType_Check(func))
PCALL(PCALL_TYPE);
else
PCALL(PCALL_OTHER);
#endif
result = PyObject_Call(func, callargs, kwdict);
ext_call_fail:
Py_XDECREF(callargs);

View File

@ -385,6 +385,9 @@ PyCode_New(int argcount, int nlocals, int stacksize, int flags,
co->co_firstlineno = firstlineno;
Py_INCREF(lnotab);
co->co_lnotab = lnotab;
if (PyTuple_GET_SIZE(freevars) == 0 &&
PyTuple_GET_SIZE(cellvars) == 0)
co->co_flags |= CO_NOFREE;
}
return co;
}

View File

@ -562,6 +562,28 @@ sys_getframe(PyObject *self, PyObject *args)
return (PyObject*)f;
}
PyDoc_STRVAR(callstats_doc,
"callstats() -> tuple of integers\n\
\n\
Return a tuple of function call statistics, if CALL_PROFILE was defined\n\
when Python was built. Otherwise, return None.\n\
\n\
When enabled, this function returns detailed, implementation-specific\n\
details about the number of function calls executed. The return value is\n\
a 11-tuple where the entries in the tuple are counts of:\n\
0. all function calls\n\
1. calls to PyFunction_Type objects\n\
2. PyFunction calls that do not create an argument tuple\n\
3. PyFunction calls that do not create an argument tuple\n\
and bypass PyEval_EvalCodeEx()\n\
4. PyMethod calls\n\
5. PyMethod calls on bound methods\n\
6. PyType calls\n\
7. PyCFunction calls\n\
8. generator calls\n\
9. All other calls\n\
10. Number of stack pops performed by call_function()"
);
#ifdef Py_TRACE_REFS
/* Defined in objects.c because it uses static globals if that file */
@ -575,13 +597,15 @@ extern PyObject *_Py_GetDXProfile(PyObject *, PyObject *);
static PyMethodDef sys_methods[] = {
/* Might as well keep this in alphabetic order */
{"callstats", (PyCFunction)PyEval_GetCallStats, METH_NOARGS,
callstats_doc},
{"displayhook", sys_displayhook, METH_O, displayhook_doc},
{"exc_info", (PyCFunction)sys_exc_info, METH_NOARGS, exc_info_doc},
{"excepthook", sys_excepthook, METH_VARARGS, excepthook_doc},
{"exit", sys_exit, METH_VARARGS, exit_doc},
#ifdef Py_USING_UNICODE
{"getdefaultencoding", (PyCFunction)sys_getdefaultencoding, METH_NOARGS,
getdefaultencoding_doc},
{"getdefaultencoding", (PyCFunction)sys_getdefaultencoding,
METH_NOARGS, getdefaultencoding_doc},
#endif
#ifdef HAVE_DLOPEN
{"getdlopenflags", (PyCFunction)sys_getdlopenflags, METH_NOARGS,