mirror of https://github.com/python/cpython
Small function call optimization and special build option for call stats.
-DCALL_PROFILE: Count the number of function calls executed. When this symbol is defined, the ceval mainloop and helper functions count the number of function calls made. It keeps detailed statistics about what kind of object was called and whether the call hit any of the special fast paths in the code.

Optimization: When we take the fast_function() path, which seems to be taken for most function calls, and there is minimal frame setup to do, avoid calling PyEval_EvalCodeEx(). That function does a lot of work to handle keyword args and star args, free variables, generators, etc. The inlined version simply allocates the frame and copies the argument values into it.

The optimization gets a little help from compile.c, which adds a CO_NOFREE flag to code objects that have no free variables or cell variables. This allows fast_function() to get into the fast path with fewer tests.

I measure a couple of percent speedup in pystone with this change, but there's surely more that can be done.
commit 985eba53f5  (parent f3f4af5521)
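As a rough, hypothetical illustration of how the new counters surface at the Python level (not part of the patch; it assumes a build with -DCALL_PROFILE and an interpreter old enough to still expose sys.callstats(), which later Python 3 releases removed), the field names below simply mirror the PCALL_* constants added to Python/ceval.c:

    # Hypothetical usage sketch: sys.callstats() returns None unless the
    # interpreter was compiled with -DCALL_PROFILE.  Field names mirror the
    # PCALL_* constants introduced in Python/ceval.c by this change.
    import sys

    FIELDS = (
        "all", "function", "fast_function", "faster_function",
        "method", "bound_method", "cfunction", "type",
        "generator", "other", "pop",
    )

    def workload():
        # Any code whose call behaviour we want to profile.
        return sum([abs(i - 500) for i in range(1000)])

    workload()
    stats = sys.callstats()
    if stats is None:
        print("this interpreter was not built with CALL_PROFILE")
    else:
        for name, count in zip(FIELDS, stats):
            print("%-16s %d" % (name, count))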
Include/ceval.h
@@ -48,6 +48,8 @@ PyAPI_FUNC(int) Py_GetRecursionLimit(void);
 PyAPI_FUNC(char *) PyEval_GetFuncName(PyObject *);
 PyAPI_FUNC(char *) PyEval_GetFuncDesc(PyObject *);
 
+PyAPI_FUNC(PyObject *) PyEval_GetCallStats(PyObject *);
+
 /* this used to be handled on a per-thread basis - now just two globals */
 PyAPI_DATA(volatile int) _Py_Ticker;
 PyAPI_DATA(int) _Py_CheckInterval;
Include/compile.h
@@ -34,6 +34,12 @@ typedef struct {
 #define CO_VARKEYWORDS 0x0008
 #define CO_NESTED 0x0010
 #define CO_GENERATOR 0x0020
+/* The CO_NOFREE flag is set if there are no free or cell variables.
+   This information is redundant, but it allows a single flag test
+   to determine whether there is any extra work to be done when the
+   call frame is set up.
+*/
+#define CO_NOFREE 0x0040
 /* XXX Temporary hack.  Until generators are a permanent part of the
    language, we need a way for a code object to record that generators
    were *possible* when it was compiled.  This is so code dynamically
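To make the new flag concrete, here is a small, hypothetical Python-level check (not part of the patch); the 0x0040 value is copied from the hunk above, and __code__ is the Python 3 spelling of what Python 2 calls func_code:

    # Hypothetical illustration of CO_NOFREE; 0x0040 is taken from the hunk above.
    CO_NOFREE = 0x0040

    def plain(a, b):
        # No free variables and no cell variables, so the compiler can set CO_NOFREE.
        return a + b

    def make_adder(n):
        def adder(x):
            return x + n   # 'n' is a free variable of adder (a cell variable of make_adder)
        return adder

    print(bool(plain.__code__.co_flags & CO_NOFREE))          # expected: True
    print(bool(make_adder(1).__code__.co_flags & CO_NOFREE))  # expected: False
    print(bool(make_adder.__code__.co_flags & CO_NOFREE))     # expected: False (has a cell variable)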
Misc/SpecialBuilds.txt
@@ -199,3 +199,13 @@ sprayed to stdout, such as every opcode and opcode argument and values
 pushed onto and popped off the value stack.
 
 Not useful very often, but very useful when needed.
+
+---------------------------------------------------------------------------
+CALL_PROFILE introduced for Python 2.3
+
+Count the number of function calls executed.
+
+When this symbol is defined, the ceval mainloop and helper functions
+count the number of function calls made.  It keeps detailed statistics
+about what kind of object was called and whether the call hit any of
+the special fast paths in the code.
Python/ceval.c
@@ -87,6 +87,62 @@ static long dxp[256];
 #endif
 #endif
 
+/* Function call profile */
+#ifdef CALL_PROFILE
+#define PCALL_NUM 11
+static int pcall[PCALL_NUM];
+
+#define PCALL_ALL 0
+#define PCALL_FUNCTION 1
+#define PCALL_FAST_FUNCTION 2
+#define PCALL_FASTER_FUNCTION 3
+#define PCALL_METHOD 4
+#define PCALL_BOUND_METHOD 5
+#define PCALL_CFUNCTION 6
+#define PCALL_TYPE 7
+#define PCALL_GENERATOR 8
+#define PCALL_OTHER 9
+#define PCALL_POP 10
+
+/* Notes about the statistics
+
+   PCALL_FAST stats
+
+   FAST_FUNCTION means no argument tuple needs to be created.
+   FASTER_FUNCTION means that the fast-path frame setup code is used.
+
+   If there is a method call where the call can be optimized by changing
+   the argument tuple and calling the function directly, it gets recorded
+   twice.
+
+   As a result, the relationship among the statistics appears to be
+   PCALL_ALL == PCALL_FUNCTION + PCALL_METHOD - PCALL_BOUND_METHOD +
+                PCALL_CFUNCTION + PCALL_TYPE + PCALL_GENERATOR + PCALL_OTHER
+   PCALL_FUNCTION > PCALL_FAST_FUNCTION > PCALL_FASTER_FUNCTION
+   PCALL_METHOD > PCALL_BOUND_METHOD
+*/
+
+#define PCALL(POS) pcall[POS]++
+
+PyObject *
+PyEval_GetCallStats(PyObject *self)
+{
+    return Py_BuildValue("iiiiiiiiiii",
+                         pcall[0], pcall[1], pcall[2], pcall[3],
+                         pcall[4], pcall[5], pcall[6], pcall[7],
+                         pcall[8], pcall[9], pcall[10]);
+}
+#else
+#define PCALL(O)
+
+PyObject *
+PyEval_GetCallStats(PyObject *self)
+{
+    Py_INCREF(Py_None);
+    return Py_None;
+}
+#endif
+
 static PyTypeObject gentype;
 
 typedef struct {
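The relationships described in those notes can be checked from the Python side. A hypothetical sanity check (not part of the patch), again assuming a -DCALL_PROFILE build and the PCALL_* index order above; the comment uses strict '>', but '>=' is safer for very small workloads:

    # Hypothetical sanity check of the relationships stated in the notes above.
    # Requires an interpreter built with -DCALL_PROFILE; indices follow PCALL_*.
    import sys

    stats = sys.callstats()
    if stats is not None:
        (all_, function, fast_function, faster_function, method,
         bound_method, cfunction, type_, generator, other) = stats[:10]

        assert all_ == (function + method - bound_method +
                        cfunction + type_ + generator + other)
        assert function >= fast_function >= faster_function
        assert method >= bound_method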
@@ -475,6 +531,7 @@ volatile int _Py_Ticker = 100;
 PyObject *
 PyEval_EvalCode(PyCodeObject *co, PyObject *globals, PyObject *locals)
 {
+    /* XXX raise SystemError if globals is NULL */
     return PyEval_EvalCodeEx(co,
                              globals, locals,
                              (PyObject **)NULL, 0,
@@ -1980,6 +2037,7 @@ eval_frame(PyFrameObject *f)
             continue;
 
         case CALL_FUNCTION:
+            PCALL(PCALL_ALL);
             x = call_function(&stack_pointer, oparg);
             PUSH(x);
             if (x != NULL)
@@ -1995,6 +2053,7 @@ eval_frame(PyFrameObject *f)
             int flags = (opcode - CALL_FUNCTION) & 3;
             int n = na + 2 * nk;
             PyObject **pfunc, *func;
+            PCALL(PCALL_ALL);
             if (flags & CALL_FLAG_VAR)
                 n++;
             if (flags & CALL_FLAG_KW)
@@ -2317,9 +2376,8 @@ PyEval_EvalCodeEx(PyCodeObject *co, PyObject *globals, PyObject *locals,
             return NULL;
     }
 
-    f = PyFrame_New(tstate,        /*back*/
-                    co,            /*code*/
-                    globals, locals);
+    assert(globals != NULL);
+    f = PyFrame_New(tstate, co, globals, locals);
     if (f == NULL)
         return NULL;
 
@@ -2520,6 +2578,8 @@ PyEval_EvalCodeEx(PyCodeObject *co, PyObject *globals, PyObject *locals,
     Py_XDECREF(f->f_back);
     f->f_back = NULL;
 
+    PCALL(PCALL_GENERATOR);
+
     /* Create a new generator that owns the ready to run frame
      * and return that as the value. */
     return gen_new(f);
@@ -3198,12 +3258,12 @@ call_function(PyObject ***pp_stack, int oparg)
     PyObject *func = *pfunc;
     PyObject *x, *w;
 
-    /* Always dispatch PyCFunction first, because
-       these are presumed to be the most frequent
-       callable object.
+    /* Always dispatch PyCFunction first, because these are
+       presumed to be the most frequent callable object.
     */
     if (PyCFunction_Check(func) && nk == 0) {
         int flags = PyCFunction_GET_FLAGS(func);
+        PCALL(PCALL_CFUNCTION);
         if (flags & (METH_NOARGS | METH_O)) {
             PyCFunction meth = PyCFunction_GET_FUNCTION(func);
             PyObject *self = PyCFunction_GET_SELF(func);
@@ -3229,6 +3289,8 @@ call_function(PyObject ***pp_stack, int oparg)
     if (PyMethod_Check(func) && PyMethod_GET_SELF(func) != NULL) {
         /* optimize access to bound methods */
         PyObject *self = PyMethod_GET_SELF(func);
+        PCALL(PCALL_METHOD);
+        PCALL(PCALL_BOUND_METHOD);
         Py_INCREF(self);
         func = PyMethod_GET_FUNCTION(func);
         Py_INCREF(func);
@@ -3245,35 +3307,75 @@ call_function(PyObject ***pp_stack, int oparg)
         Py_DECREF(func);
     }
 
     /* What does this do? */
     while ((*pp_stack) > pfunc) {
         w = EXT_POP(*pp_stack);
         Py_DECREF(w);
+        PCALL(PCALL_POP);
     }
     return x;
 }
 
+/* The fast_function() function optimizes calls for which no argument
+   tuple is necessary; the objects are passed directly from the stack.
+   For the simplest case -- a function that takes only positional
+   arguments and is called with only positional arguments -- it
+   inlines the most primitive frame setup code from
+   PyEval_EvalCodeEx(), which vastly reduces the checks that must be
+   done before evaluating the frame.
+*/
+
 static PyObject *
 fast_function(PyObject *func, PyObject ***pp_stack, int n, int na, int nk)
 {
-    PyObject *co = PyFunction_GET_CODE(func);
+    PyCodeObject *co = (PyCodeObject *)PyFunction_GET_CODE(func);
     PyObject *globals = PyFunction_GET_GLOBALS(func);
     PyObject *argdefs = PyFunction_GET_DEFAULTS(func);
-    PyObject *closure = PyFunction_GET_CLOSURE(func);
     PyObject **d = NULL;
     int nd = 0;
 
+    PCALL(PCALL_FUNCTION);
+    PCALL(PCALL_FAST_FUNCTION);
+    if (argdefs == NULL && co->co_argcount == n &&
+        co->co_flags == (CO_OPTIMIZED | CO_NEWLOCALS | CO_NOFREE)) {
+        PyFrameObject *f;
+        PyObject *retval = NULL;
+        PyThreadState *tstate = PyThreadState_GET();
+        PyObject **fastlocals, **stack;
+        int i;
+
+        PCALL(PCALL_FASTER_FUNCTION);
+        assert(globals != NULL);
+        /* XXX Perhaps we should create a specialized
+           PyFrame_New() that doesn't take locals, but does
+           take builtins without sanity checking them.
+        */
+        f = PyFrame_New(tstate, co, globals, NULL);
+        if (f == NULL)
+            return NULL;
+
+        fastlocals = f->f_localsplus;
+        stack = (*pp_stack) - n;
+
+        for (i = 0; i < n; i++) {
+            Py_INCREF(*stack);
+            fastlocals[i] = *stack++;
+        }
+        retval = eval_frame(f);
+        assert(tstate != NULL);
+        ++tstate->recursion_depth;
+        Py_DECREF(f);
+        --tstate->recursion_depth;
+        return retval;
+    }
     if (argdefs != NULL) {
         d = &PyTuple_GET_ITEM(argdefs, 0);
         nd = ((PyTupleObject *)argdefs)->ob_size;
     }
-    return PyEval_EvalCodeEx((PyCodeObject *)co, globals,
+    return PyEval_EvalCodeEx(co, globals,
                              (PyObject *)NULL, (*pp_stack)-n, na,
                              (*pp_stack)-2*nk, nk, d, nd,
-                             closure);
+                             PyFunction_GET_CLOSURE(func));
 }
 
 static PyObject *
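To relate the new fast path back to Python-level code, here are a few hypothetical calls (not part of the patch) and where they should land; the eligibility test mirrors the condition above: no default arguments, an exact positional-only call, and co_flags equal to CO_OPTIMIZED | CO_NEWLOCALS | CO_NOFREE:

    # Hypothetical examples of which calls can reach the inlined "faster" path
    # added above; the comments restate the C-level condition, they do not
    # measure anything themselves.

    def add(a, b):
        return a + b               # no defaults, no closure, not a generator

    def add_default(a, b=0):
        return a + b               # has defaults -> falls back to PyEval_EvalCodeEx()

    def make_counter():
        total = [0]
        def bump(x):
            total[0] += x          # 'total' is a free variable -> CO_NOFREE not set
            return total[0]
        return bump

    add(1, 2)           # exact positional call -> eligible for the faster path
    add(a=1, b=2)       # keyword arguments -> falls back
    add_default(1)      # defaults present (argdefs != NULL) -> falls back
    make_counter()(5)   # closure involved -> falls back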
@@ -3371,6 +3473,20 @@ do_call(PyObject *func, PyObject ***pp_stack, int na, int nk)
     callargs = load_args(pp_stack, na);
     if (callargs == NULL)
         goto call_fail;
+#ifdef CALL_PROFILE
+    /* At this point, we have to look at the type of func to
+       update the call stats properly.  Do it here so as to avoid
+       exposing the call stats machinery outside ceval.c
+    */
+    if (PyFunction_Check(func))
+        PCALL(PCALL_FUNCTION);
+    else if (PyMethod_Check(func))
+        PCALL(PCALL_METHOD);
+    else if (PyType_Check(func))
+        PCALL(PCALL_TYPE);
+    else
+        PCALL(PCALL_OTHER);
+#endif
     result = PyObject_Call(func, callargs, kwdict);
 call_fail:
     Py_XDECREF(callargs);
@@ -3426,6 +3542,20 @@ ext_do_call(PyObject *func, PyObject ***pp_stack, int flags, int na, int nk)
     callargs = update_star_args(na, nstar, stararg, pp_stack);
     if (callargs == NULL)
         goto ext_call_fail;
+#ifdef CALL_PROFILE
+    /* At this point, we have to look at the type of func to
+       update the call stats properly.  Do it here so as to avoid
+       exposing the call stats machinery outside ceval.c
+    */
+    if (PyFunction_Check(func))
+        PCALL(PCALL_FUNCTION);
+    else if (PyMethod_Check(func))
+        PCALL(PCALL_METHOD);
+    else if (PyType_Check(func))
+        PCALL(PCALL_TYPE);
+    else
+        PCALL(PCALL_OTHER);
+#endif
     result = PyObject_Call(func, callargs, kwdict);
 ext_call_fail:
     Py_XDECREF(callargs);
Python/compile.c
@@ -385,6 +385,9 @@ PyCode_New(int argcount, int nlocals, int stacksize, int flags,
         co->co_firstlineno = firstlineno;
         Py_INCREF(lnotab);
         co->co_lnotab = lnotab;
+        if (PyTuple_GET_SIZE(freevars) == 0 &&
+            PyTuple_GET_SIZE(cellvars) == 0)
+            co->co_flags |= CO_NOFREE;
     }
     return co;
 }
Python/sysmodule.c
@@ -562,6 +562,28 @@ sys_getframe(PyObject *self, PyObject *args)
     return (PyObject*)f;
 }
 
+PyDoc_STRVAR(callstats_doc,
+"callstats() -> tuple of integers\n\
+\n\
+Return a tuple of function call statistics, if CALL_PROFILE was defined\n\
+when Python was built.  Otherwise, return None.\n\
+\n\
+When enabled, this function returns detailed, implementation-specific\n\
+details about the number of function calls executed. The return value is\n\
+an 11-tuple where the entries in the tuple are counts of:\n\
+0. all function calls\n\
+1. calls to PyFunction_Type objects\n\
+2. PyFunction calls that do not create an argument tuple\n\
+3. PyFunction calls that do not create an argument tuple\n\
+   and bypass PyEval_EvalCodeEx()\n\
+4. PyMethod calls\n\
+5. PyMethod calls on bound methods\n\
+6. PyCFunction calls\n\
+7. PyType calls\n\
+8. generator calls\n\
+9. All other calls\n\
+10. Number of stack pops performed by call_function()"
+);
 
 #ifdef Py_TRACE_REFS
 /* Defined in objects.c because it uses static globals if that file */
@@ -575,13 +597,15 @@ extern PyObject *_Py_GetDXProfile(PyObject *, PyObject *);
 
 static PyMethodDef sys_methods[] = {
     /* Might as well keep this in alphabetic order */
+    {"callstats", (PyCFunction)PyEval_GetCallStats, METH_NOARGS,
+     callstats_doc},
     {"displayhook", sys_displayhook, METH_O, displayhook_doc},
     {"exc_info", (PyCFunction)sys_exc_info, METH_NOARGS, exc_info_doc},
     {"excepthook", sys_excepthook, METH_VARARGS, excepthook_doc},
     {"exit", sys_exit, METH_VARARGS, exit_doc},
 #ifdef Py_USING_UNICODE
-    {"getdefaultencoding", (PyCFunction)sys_getdefaultencoding, METH_NOARGS,
-     getdefaultencoding_doc},
+    {"getdefaultencoding", (PyCFunction)sys_getdefaultencoding,
+     METH_NOARGS, getdefaultencoding_doc},
 #endif
 #ifdef HAVE_DLOPEN
     {"getdlopenflags", (PyCFunction)sys_getdlopenflags, METH_NOARGS,