From 6ddb09f35b922a3bbb59e408a3ca7636a6938468 Mon Sep 17 00:00:00 2001 From: Dennis Sweeney <36520290+sweeneyde@users.noreply.github.com> Date: Tue, 1 Mar 2022 23:46:30 -0500 Subject: [PATCH] bpo-46848: Use stringlib/fastsearch in mmap (GH-31625) Speed up mmap.find(). Add _PyBytes_Find() and _PyBytes_ReverseFind(). --- Include/cpython/bytesobject.h | 19 +++++++++++ .../2022-03-01-01-16-13.bpo-46848.BB01Fr.rst | 3 ++ Modules/mmapmodule.c | 32 ++++++++----------- Objects/bytesobject.c | 18 +++++++++++ 4 files changed, 53 insertions(+), 19 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-03-01-01-16-13.bpo-46848.BB01Fr.rst diff --git a/Include/cpython/bytesobject.h b/Include/cpython/bytesobject.h index 6b3f55224fc..38a0fe0af66 100644 --- a/Include/cpython/bytesobject.h +++ b/Include/cpython/bytesobject.h @@ -116,3 +116,22 @@ PyAPI_FUNC(void*) _PyBytesWriter_WriteBytes(_PyBytesWriter *writer, void *str, const void *bytes, Py_ssize_t size); + +/* Substring Search. + + Returns the index of the first occurrence of + a substring ("needle") in a larger text ("haystack"). + If the needle is not found, return -1. + If the needle is found, add offset to the index. 
+*/ + +PyAPI_FUNC(Py_ssize_t) +_PyBytes_Find(const char *haystack, Py_ssize_t len_haystack, + const char *needle, Py_ssize_t len_needle, + Py_ssize_t offset); + +/* Same as above, but search right-to-left */ +PyAPI_FUNC(Py_ssize_t) +_PyBytes_ReverseFind(const char *haystack, Py_ssize_t len_haystack, + const char *needle, Py_ssize_t len_needle, + Py_ssize_t offset); diff --git a/Misc/NEWS.d/next/Library/2022-03-01-01-16-13.bpo-46848.BB01Fr.rst b/Misc/NEWS.d/next/Library/2022-03-01-01-16-13.bpo-46848.BB01Fr.rst new file mode 100644 index 00000000000..bd20a843ab6 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-03-01-01-16-13.bpo-46848.BB01Fr.rst @@ -0,0 +1,3 @@ +For performance, use the optimized string-searching implementations +from :meth:`~bytes.find` and :meth:`~bytes.rfind` +for :meth:`~mmap.find` and :meth:`~mmap.rfind`. diff --git a/Modules/mmapmodule.c b/Modules/mmapmodule.c index 26cedf1b900..6a038e72f93 100644 --- a/Modules/mmapmodule.c +++ b/Modules/mmapmodule.c @@ -315,12 +315,8 @@ mmap_gfind(mmap_object *self, if (!PyArg_ParseTuple(args, reverse ? "y*|nn:rfind" : "y*|nn:find", &view, &start, &end)) { return NULL; - } else { - const char *p, *start_p, *end_p; - int sign = reverse ? -1 : 1; - const char *needle = view.buf; - Py_ssize_t len = view.len; - + } + else { if (start < 0) start += self->size; if (start < 0) @@ -335,21 +331,19 @@ mmap_gfind(mmap_object *self, else if (end > self->size) end = self->size; - start_p = self->data + start; - end_p = self->data + end; - - for (p = (reverse ? 
end_p - len : start_p); - (p >= start_p) && (p + len <= end_p); p += sign) { - Py_ssize_t i; - for (i = 0; i < len && needle[i] == p[i]; ++i) - /* nothing */; - if (i == len) { - PyBuffer_Release(&view); - return PyLong_FromSsize_t(p - self->data); - } + Py_ssize_t res; + if (reverse) { + res = _PyBytes_ReverseFind( + self->data + start, end - start, + view.buf, view.len, start); + } + else { + res = _PyBytes_Find( + self->data + start, end - start, + view.buf, view.len, start); } PyBuffer_Release(&view); - return PyLong_FromLong(-1); + return PyLong_FromSsize_t(res); } } diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index 3d8a21696d1..4c67b8f7af2 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -1247,6 +1247,24 @@ PyBytes_AsStringAndSize(PyObject *obj, #undef STRINGLIB_GET_EMPTY +Py_ssize_t +_PyBytes_Find(const char *haystack, Py_ssize_t len_haystack, + const char *needle, Py_ssize_t len_needle, + Py_ssize_t offset) +{ + return stringlib_find(haystack, len_haystack, + needle, len_needle, offset); +} + +Py_ssize_t +_PyBytes_ReverseFind(const char *haystack, Py_ssize_t len_haystack, + const char *needle, Py_ssize_t len_needle, + Py_ssize_t offset) +{ + return stringlib_rfind(haystack, len_haystack, + needle, len_needle, offset); +} + PyObject * PyBytes_Repr(PyObject *obj, int smartquotes) {