From ed184c06e2e610e12050c5d5c9b0c1c2ecabb930 Mon Sep 17 00:00:00 2001
From: Jeroen Demeyer
Date: Sat, 13 Jul 2019 16:39:18 +0200
Subject: [PATCH] bpo-37358: Use vectorcall for functools.partial (GH-14284)

https://bugs.python.org/issue37358
---
 .../2019-06-21-14-54-02.bpo-37358.RsASpn.rst |   1 +
 Modules/_functoolsmodule.c                   | 166 +++++++++++-------
 2 files changed, 100 insertions(+), 67 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Library/2019-06-21-14-54-02.bpo-37358.RsASpn.rst

diff --git a/Misc/NEWS.d/next/Library/2019-06-21-14-54-02.bpo-37358.RsASpn.rst b/Misc/NEWS.d/next/Library/2019-06-21-14-54-02.bpo-37358.RsASpn.rst
new file mode 100644
index 00000000000..bc37631020f
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-06-21-14-54-02.bpo-37358.RsASpn.rst
@@ -0,0 +1 @@
+Optimized ``functools.partial`` by using vectorcall.
diff --git a/Modules/_functoolsmodule.c b/Modules/_functoolsmodule.c
index a101363bf02..17d49ac5b34 100644
--- a/Modules/_functoolsmodule.c
+++ b/Modules/_functoolsmodule.c
@@ -18,13 +18,15 @@ typedef struct {
     PyObject *fn;
     PyObject *args;
    PyObject *kw;
-    PyObject *dict;
+    PyObject *dict;        /* __dict__ */
     PyObject *weakreflist; /* List of weak references */
-    int use_fastcall;
+    vectorcallfunc vectorcall;
 } partialobject;
 
 static PyTypeObject partial_type;
 
+static void partial_setvectorcall(partialobject *pto);
+
 static PyObject *
 partial_new(PyTypeObject *type, PyObject *args, PyObject *kw)
 {
@@ -107,8 +109,7 @@ partial_new(PyTypeObject *type, PyObject *args, PyObject *kw)
         return NULL;
     }
 
-    pto->use_fastcall = (_PyVectorcall_Function(func) != NULL);
-
+    partial_setvectorcall(pto);
     return (PyObject *)pto;
 }
 
@@ -126,77 +127,107 @@ partial_dealloc(partialobject *pto)
     Py_TYPE(pto)->tp_free(pto);
 }
 
-static PyObject *
-partial_fastcall(partialobject *pto, PyObject **args, Py_ssize_t nargs,
-                 PyObject *kwargs)
+
+/* Merging keyword arguments using the vectorcall convention is messy, so
+ * if we would need to do that, we stop using vectorcall and fall back
+ * to using partial_call() instead. */
+_Py_NO_INLINE static PyObject *
+partial_vectorcall_fallback(partialobject *pto, PyObject *const *args,
+                            size_t nargsf, PyObject *kwnames)
 {
+    pto->vectorcall = NULL;
+    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
+    return _PyObject_MakeTpCall((PyObject *)pto, args, nargs, kwnames);
+}
+
+static PyObject *
+partial_vectorcall(partialobject *pto, PyObject *const *args,
+                   size_t nargsf, PyObject *kwnames)
+{
+    /* pto->kw is mutable, so need to check every time */
+    if (PyDict_GET_SIZE(pto->kw)) {
+        return partial_vectorcall_fallback(pto, args, nargsf, kwnames);
+    }
+
+    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
+    Py_ssize_t nargs_total = nargs;
+    if (kwnames != NULL) {
+        nargs_total += PyTuple_GET_SIZE(kwnames);
+    }
+
+    PyObject **pto_args = _PyTuple_ITEMS(pto->args);
+    Py_ssize_t pto_nargs = PyTuple_GET_SIZE(pto->args);
+
+    /* Fast path if we're called without arguments */
+    if (nargs_total == 0) {
+        return _PyObject_Vectorcall(pto->fn, pto_args, pto_nargs, NULL);
+    }
+
+    /* Fast path using PY_VECTORCALL_ARGUMENTS_OFFSET to prepend a single
+     * positional argument */
+    if (pto_nargs == 1 && (nargsf & PY_VECTORCALL_ARGUMENTS_OFFSET)) {
+        PyObject **newargs = (PyObject **)args - 1;
+        PyObject *tmp = newargs[0];
+        newargs[0] = pto_args[0];
+        PyObject *ret = _PyObject_Vectorcall(pto->fn, newargs, nargs + 1, kwnames);
+        newargs[0] = tmp;
+        return ret;
+    }
+
+    Py_ssize_t newnargs_total = pto_nargs + nargs_total;
+
     PyObject *small_stack[_PY_FASTCALL_SMALL_STACK];
     PyObject *ret;
-    PyObject **stack, **stack_buf = NULL;
-    Py_ssize_t nargs2, pto_nargs;
+    PyObject **stack;
 
-    pto_nargs = PyTuple_GET_SIZE(pto->args);
-    nargs2 = pto_nargs + nargs;
-
-    if (pto_nargs == 0) {
-        stack = args;
-    }
-    else if (nargs == 0) {
-        stack = _PyTuple_ITEMS(pto->args);
+    if (newnargs_total <= (Py_ssize_t)Py_ARRAY_LENGTH(small_stack)) {
+        stack = small_stack;
     }
     else {
-        if (nargs2 <= (Py_ssize_t)Py_ARRAY_LENGTH(small_stack)) {
-            stack = small_stack;
+        stack = PyMem_Malloc(newnargs_total * sizeof(PyObject *));
+        if (stack == NULL) {
+            PyErr_NoMemory();
+            return NULL;
         }
-        else {
-            stack_buf = PyMem_Malloc(nargs2 * sizeof(PyObject *));
-            if (stack_buf == NULL) {
-                PyErr_NoMemory();
-                return NULL;
-            }
-            stack = stack_buf;
-        }
-
-        /* use borrowed references */
-        memcpy(stack,
-               _PyTuple_ITEMS(pto->args),
-               pto_nargs * sizeof(PyObject*));
-        memcpy(&stack[pto_nargs],
-               args,
-               nargs * sizeof(PyObject*));
     }
 
-    ret = _PyObject_FastCallDict(pto->fn, stack, nargs2, kwargs);
-    PyMem_Free(stack_buf);
+    /* Copy to new stack, using borrowed references */
+    memcpy(stack, pto_args, pto_nargs * sizeof(PyObject*));
+    memcpy(stack + pto_nargs, args, nargs_total * sizeof(PyObject*));
+
+    ret = _PyObject_Vectorcall(pto->fn, stack, pto_nargs + nargs, kwnames);
+    if (stack != small_stack) {
+        PyMem_Free(stack);
+    }
     return ret;
 }
 
-static PyObject *
-partial_call_impl(partialobject *pto, PyObject *args, PyObject *kwargs)
+/* Set pto->vectorcall depending on the parameters of the partial object */
+static void
+partial_setvectorcall(partialobject *pto)
 {
-    PyObject *ret, *args2;
-
-    /* Note: tupleconcat() is optimized for empty tuples */
-    args2 = PySequence_Concat(pto->args, args);
-    if (args2 == NULL) {
-        return NULL;
+    if (_PyVectorcall_Function(pto->fn) == NULL) {
+        /* Don't use vectorcall if the underlying function doesn't support it */
+        pto->vectorcall = NULL;
+    }
+    /* We could have a special case if there are no arguments,
+     * but that is unlikely (why use partial without arguments?),
+     * so we don't optimize that */
+    else {
+        pto->vectorcall = (vectorcallfunc)partial_vectorcall;
     }
-    assert(PyTuple_Check(args2));
-
-    ret = PyObject_Call(pto->fn, args2, kwargs);
-    Py_DECREF(args2);
-    return ret;
 }
 
+
 static PyObject *
 partial_call(partialobject *pto, PyObject *args, PyObject *kwargs)
 {
-    PyObject *kwargs2, *res;
-
-    assert (PyCallable_Check(pto->fn));
-    assert (PyTuple_Check(pto->args));
-    assert (PyDict_Check(pto->kw));
+    assert(PyCallable_Check(pto->fn));
+    assert(PyTuple_Check(pto->args));
+    assert(PyDict_Check(pto->kw));
 
+    /* Merge keywords */
+    PyObject *kwargs2;
     if (PyDict_GET_SIZE(pto->kw) == 0) {
         /* kwargs can be NULL */
         kwargs2 = kwargs;
@@ -219,16 +250,16 @@ partial_call(partialobject *pto, PyObject *args, PyObject *kwargs)
         }
     }
 
+    /* Merge positional arguments */
+    /* Note: tupleconcat() is optimized for empty tuples */
+    PyObject *args2 = PySequence_Concat(pto->args, args);
+    if (args2 == NULL) {
+        Py_XDECREF(kwargs2);
+        return NULL;
+    }
 
-    if (pto->use_fastcall) {
-        res = partial_fastcall(pto,
-                               _PyTuple_ITEMS(args),
-                               PyTuple_GET_SIZE(args),
-                               kwargs2);
-    }
-    else {
-        res = partial_call_impl(pto, args, kwargs2);
-    }
+    PyObject *res = PyObject_Call(pto->fn, args2, kwargs2);
+    Py_DECREF(args2);
     Py_XDECREF(kwargs2);
     return res;
 }
@@ -365,11 +396,11 @@ partial_setstate(partialobject *pto, PyObject *state)
     Py_INCREF(dict);
 
     Py_INCREF(fn);
-    pto->use_fastcall = (_PyVectorcall_Function(fn) != NULL);
     Py_SETREF(pto->fn, fn);
     Py_SETREF(pto->args, fnargs);
     Py_SETREF(pto->kw, kw);
     Py_XSETREF(pto->dict, dict);
+    partial_setvectorcall(pto);
     Py_RETURN_NONE;
 }
 
@@ -386,7 +417,7 @@ static PyTypeObject partial_type = {
     0,                                  /* tp_itemsize */
     /* methods */
     (destructor)partial_dealloc,        /* tp_dealloc */
-    0,                                  /* tp_vectorcall_offset */
+    offsetof(partialobject, vectorcall),/* tp_vectorcall_offset */
     0,                                  /* tp_getattr */
     0,                                  /* tp_setattr */
     0,                                  /* tp_as_async */
@@ -401,7 +432,8 @@ static PyTypeObject partial_type = {
    PyObject_GenericSetAttr,            /* tp_setattro */
     0,                                  /* tp_as_buffer */
     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC |
-        Py_TPFLAGS_BASETYPE,            /* tp_flags */
+        Py_TPFLAGS_BASETYPE |
+        _Py_TPFLAGS_HAVE_VECTORCALL,    /* tp_flags */
     partial_doc,                        /* tp_doc */
     (traverseproc)partial_traverse,     /* tp_traverse */
     0,                                  /* tp_clear */
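
Note (not part of the patch): the Python sketch below is one way for a reviewer to exercise the two call paths introduced above. A partial that binds only positional arguments stays on partial_vectorcall(), while a partial that stores keyword arguments hits the pto->kw check and goes through partial_vectorcall_fallback() into partial_call(). The names add, p_pos and p_kw and the timeit loop are purely illustrative; timings depend on the build and machine.

    # Reviewer-side sketch, not part of the patch.  Illustrative only.
    import functools
    import timeit

    def add(a, b, c):
        return a + b + c

    # One bound positional argument: dispatched via partial_vectorcall(),
    # which can use the PY_VECTORCALL_ARGUMENTS_OFFSET trick to prepend
    # the bound argument without copying the caller's argument stack.
    p_pos = functools.partial(add, 1)

    # A stored keyword argument: partial_vectorcall() sees the non-empty
    # pto->kw and delegates to partial_vectorcall_fallback(), which routes
    # the call through partial_call() instead.
    p_kw = functools.partial(add, c=3)

    print("positional partial:", timeit.timeit(lambda: p_pos(2, 3), number=1_000_000))
    print("keyword partial:   ", timeit.timeit(lambda: p_kw(1, 2), number=1_000_000))

Since the fallback also clears pto->vectorcall, only the first call of the keyword-carrying partial pays for the attempted vectorcall dispatch; later calls go straight through tp_call.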