gh-102511: Speed up os.path.splitroot() with native helpers (GH-118089)

This commit is contained in:
Nice Zombies 2024-04-25 11:07:38 +02:00 committed by GitHub
parent e38b43c213
commit 10bb90ed49
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 337 additions and 108 deletions

View File

@ -290,6 +290,8 @@ extern wchar_t *_Py_normpath_and_size(wchar_t *path, Py_ssize_t size, Py_ssize_t
extern HRESULT PathCchSkipRoot(const wchar_t *pszPath, const wchar_t **ppszRootEnd);
#endif /* defined(MS_WINDOWS_GAMES) && !defined(MS_WINDOWS_DESKTOP) */
extern void _Py_skiproot(const wchar_t *path, Py_ssize_t size, Py_ssize_t *drvsize, Py_ssize_t *rootsize);
// Macros to protect CRT calls against instant termination when passed an
// invalid parameter (bpo-23524). IPH stands for Invalid Parameter Handler.
// Usage:

View File

@ -167,56 +167,76 @@ def splitdrive(p):
return drive, root + tail
def splitroot(p):
"""Split a pathname into drive, root and tail. The drive is defined
exactly as in splitdrive(). On Windows, the root may be a single path
separator or an empty string. The tail contains anything after the root.
For example:
try:
from nt import _path_splitroot_ex
except ImportError:
def splitroot(p):
"""Split a pathname into drive, root and tail. The drive is defined
exactly as in splitdrive(). On Windows, the root may be a single path
separator or an empty string. The tail contains anything after the root.
For example:
splitroot('//server/share/') == ('//server/share', '/', '')
splitroot('C:/Users/Barney') == ('C:', '/', 'Users/Barney')
splitroot('C:///spam///ham') == ('C:', '/', '//spam///ham')
splitroot('Windows/notepad') == ('', '', 'Windows/notepad')
"""
p = os.fspath(p)
if isinstance(p, bytes):
sep = b'\\'
altsep = b'/'
colon = b':'
unc_prefix = b'\\\\?\\UNC\\'
empty = b''
else:
sep = '\\'
altsep = '/'
colon = ':'
unc_prefix = '\\\\?\\UNC\\'
empty = ''
normp = p.replace(altsep, sep)
if normp[:1] == sep:
if normp[1:2] == sep:
# UNC drives, e.g. \\server\share or \\?\UNC\server\share
# Device drives, e.g. \\.\device or \\?\device
start = 8 if normp[:8].upper() == unc_prefix else 2
index = normp.find(sep, start)
if index == -1:
return p, empty, empty
index2 = normp.find(sep, index + 1)
if index2 == -1:
return p, empty, empty
return p[:index2], p[index2:index2 + 1], p[index2 + 1:]
splitroot('//server/share/') == ('//server/share', '/', '')
splitroot('C:/Users/Barney') == ('C:', '/', 'Users/Barney')
splitroot('C:///spam///ham') == ('C:', '/', '//spam///ham')
splitroot('Windows/notepad') == ('', '', 'Windows/notepad')
"""
p = os.fspath(p)
if isinstance(p, bytes):
sep = b'\\'
altsep = b'/'
colon = b':'
unc_prefix = b'\\\\?\\UNC\\'
empty = b''
else:
# Relative path with root, e.g. \Windows
return empty, p[:1], p[1:]
elif normp[1:2] == colon:
if normp[2:3] == sep:
# Absolute drive-letter path, e.g. X:\Windows
return p[:2], p[2:3], p[3:]
sep = '\\'
altsep = '/'
colon = ':'
unc_prefix = '\\\\?\\UNC\\'
empty = ''
normp = p.replace(altsep, sep)
if normp[:1] == sep:
if normp[1:2] == sep:
# UNC drives, e.g. \\server\share or \\?\UNC\server\share
# Device drives, e.g. \\.\device or \\?\device
start = 8 if normp[:8].upper() == unc_prefix else 2
index = normp.find(sep, start)
if index == -1:
return p, empty, empty
index2 = normp.find(sep, index + 1)
if index2 == -1:
return p, empty, empty
return p[:index2], p[index2:index2 + 1], p[index2 + 1:]
else:
# Relative path with root, e.g. \Windows
return empty, p[:1], p[1:]
elif normp[1:2] == colon:
if normp[2:3] == sep:
# Absolute drive-letter path, e.g. X:\Windows
return p[:2], p[2:3], p[3:]
else:
# Relative path with drive, e.g. X:Windows
return p[:2], empty, p[2:]
else:
# Relative path with drive, e.g. X:Windows
return p[:2], empty, p[2:]
else:
# Relative path, e.g. Windows
return empty, empty, p
# Relative path, e.g. Windows
return empty, empty, p
else:
def splitroot(p):
"""Split a pathname into drive, root and tail. The drive is defined
exactly as in splitdrive(). On Windows, the root may be a single path
separator or an empty string. The tail contains anything after the root.
For example:
splitroot('//server/share/') == ('//server/share', '/', '')
splitroot('C:/Users/Barney') == ('C:', '/', 'Users/Barney')
splitroot('C:///spam///ham') == ('C:', '/', '//spam///ham')
splitroot('Windows/notepad') == ('', '', 'Windows/notepad')
"""
p = os.fspath(p)
if isinstance(p, bytes):
drive, root, tail = _path_splitroot_ex(os.fsdecode(p))
return os.fsencode(drive), os.fsencode(root), os.fsencode(tail)
return _path_splitroot_ex(p)
# Split a path in head (everything up to the last '/') and tail (the

View File

@ -134,33 +134,53 @@ def splitdrive(p):
return p[:0], p
def splitroot(p):
"""Split a pathname into drive, root and tail. On Posix, drive is always
empty; the root may be empty, a single slash, or two slashes. The tail
contains anything after the root. For example:
try:
from posix import _path_splitroot_ex
except ImportError:
def splitroot(p):
"""Split a pathname into drive, root and tail. On Posix, drive is always
empty; the root may be empty, a single slash, or two slashes. The tail
contains anything after the root. For example:
splitroot('foo/bar') == ('', '', 'foo/bar')
splitroot('/foo/bar') == ('', '/', 'foo/bar')
splitroot('//foo/bar') == ('', '//', 'foo/bar')
splitroot('///foo/bar') == ('', '/', '//foo/bar')
"""
p = os.fspath(p)
if isinstance(p, bytes):
sep = b'/'
empty = b''
else:
sep = '/'
empty = ''
if p[:1] != sep:
# Relative path, e.g.: 'foo'
return empty, empty, p
elif p[1:2] != sep or p[2:3] == sep:
# Absolute path, e.g.: '/foo', '///foo', '////foo', etc.
return empty, sep, p[1:]
else:
# Precisely two leading slashes, e.g.: '//foo'. Implementation defined per POSIX, see
# https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap04.html#tag_04_13
return empty, p[:2], p[2:]
splitroot('foo/bar') == ('', '', 'foo/bar')
splitroot('/foo/bar') == ('', '/', 'foo/bar')
splitroot('//foo/bar') == ('', '//', 'foo/bar')
splitroot('///foo/bar') == ('', '/', '//foo/bar')
"""
p = os.fspath(p)
if isinstance(p, bytes):
sep = b'/'
empty = b''
else:
sep = '/'
empty = ''
if p[:1] != sep:
# Relative path, e.g.: 'foo'
return empty, empty, p
elif p[1:2] != sep or p[2:3] == sep:
# Absolute path, e.g.: '/foo', '///foo', '////foo', etc.
return empty, sep, p[1:]
else:
# Precisely two leading slashes, e.g.: '//foo'. Implementation defined per POSIX, see
# https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap04.html#tag_04_13
return empty, p[:2], p[2:]
else:
def splitroot(p):
"""Split a pathname into drive, root and tail. On Posix, drive is always
empty; the root may be empty, a single slash, or two slashes. The tail
contains anything after the root. For example:
splitroot('foo/bar') == ('', '', 'foo/bar')
splitroot('/foo/bar') == ('', '/', 'foo/bar')
splitroot('//foo/bar') == ('', '//', 'foo/bar')
splitroot('///foo/bar') == ('', '/', '//foo/bar')
"""
p = os.fspath(p)
if isinstance(p, bytes):
# Optimisation: the drive is always empty
_, root, tail = _path_splitroot_ex(os.fsdecode(p))
return b'', os.fsencode(root), os.fsencode(tail)
return _path_splitroot_ex(p)
# Return the tail (basename) part of a path, same as split(path)[1].

View File

@ -374,6 +374,7 @@ class TestNtpath(NtpathTestCase):
tester("ntpath.normpath('\\\\foo\\')", '\\\\foo\\')
tester("ntpath.normpath('\\\\foo')", '\\\\foo')
tester("ntpath.normpath('\\\\')", '\\\\')
tester("ntpath.normpath('//?/UNC/server/share/..')", '\\\\?\\UNC\\server\\share\\')
def test_realpath_curdir(self):
expected = ntpath.normpath(os.getcwd())

View File

@ -0,0 +1 @@
Speed up :func:`os.path.splitroot` with a native implementation.

View File

@ -2248,6 +2248,64 @@ exit:
#endif /* defined(MS_WINDOWS) */
PyDoc_STRVAR(os__path_splitroot_ex__doc__,
"_path_splitroot_ex($module, /, path)\n"
"--\n"
"\n");
#define OS__PATH_SPLITROOT_EX_METHODDEF \
{"_path_splitroot_ex", _PyCFunction_CAST(os__path_splitroot_ex), METH_FASTCALL|METH_KEYWORDS, os__path_splitroot_ex__doc__},
static PyObject *
os__path_splitroot_ex_impl(PyObject *module, PyObject *path);
static PyObject *
os__path_splitroot_ex(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
{
PyObject *return_value = NULL;
#if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
#define NUM_KEYWORDS 1
static struct {
PyGC_Head _this_is_not_used;
PyObject_VAR_HEAD
PyObject *ob_item[NUM_KEYWORDS];
} _kwtuple = {
.ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
.ob_item = { &_Py_ID(path), },
};
#undef NUM_KEYWORDS
#define KWTUPLE (&_kwtuple.ob_base.ob_base)
#else // !Py_BUILD_CORE
# define KWTUPLE NULL
#endif // !Py_BUILD_CORE
static const char * const _keywords[] = {"path", NULL};
static _PyArg_Parser _parser = {
.keywords = _keywords,
.fname = "_path_splitroot_ex",
.kwtuple = KWTUPLE,
};
#undef KWTUPLE
PyObject *argsbuf[1];
PyObject *path;
args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 1, 1, 0, argsbuf);
if (!args) {
goto exit;
}
if (!PyUnicode_Check(args[0])) {
_PyArg_BadArgument("_path_splitroot_ex", "argument 'path'", "str", args[0]);
goto exit;
}
path = args[0];
return_value = os__path_splitroot_ex_impl(module, path);
exit:
return return_value;
}
PyDoc_STRVAR(os__path_normpath__doc__,
"_path_normpath($module, /, path)\n"
"--\n"
@ -12602,4 +12660,4 @@ os__supports_virtual_terminal(PyObject *module, PyObject *Py_UNUSED(ignored))
#ifndef OS__SUPPORTS_VIRTUAL_TERMINAL_METHODDEF
#define OS__SUPPORTS_VIRTUAL_TERMINAL_METHODDEF
#endif /* !defined(OS__SUPPORTS_VIRTUAL_TERMINAL_METHODDEF) */
/*[clinic end generated code: output=511f0788a6b90db0 input=a9049054013a1b77]*/
/*[clinic end generated code: output=c4698b47007cd6eb input=a9049054013a1b77]*/

View File

@ -5467,6 +5467,49 @@ os__path_islink_impl(PyObject *module, PyObject *path)
#endif /* MS_WINDOWS */
/*[clinic input]
os._path_splitroot_ex
path: unicode
[clinic start generated code]*/
static PyObject *
os__path_splitroot_ex_impl(PyObject *module, PyObject *path)
/*[clinic end generated code: output=de97403d3dfebc40 input=f1470e12d899f9ac]*/
{
Py_ssize_t len, drvsize, rootsize;
PyObject *drv = NULL, *root = NULL, *tail = NULL, *result = NULL;
wchar_t *buffer = PyUnicode_AsWideCharString(path, &len);
if (!buffer) {
goto exit;
}
_Py_skiproot(buffer, len, &drvsize, &rootsize);
drv = PyUnicode_FromWideChar(buffer, drvsize);
if (drv == NULL) {
goto exit;
}
root = PyUnicode_FromWideChar(&buffer[drvsize], rootsize);
if (root == NULL) {
goto exit;
}
tail = PyUnicode_FromWideChar(&buffer[drvsize + rootsize],
len - drvsize - rootsize);
if (tail == NULL) {
goto exit;
}
result = Py_BuildValue("(OOO)", drv, root, tail);
exit:
PyMem_Free(buffer);
Py_XDECREF(drv);
Py_XDECREF(root);
Py_XDECREF(tail);
return result;
}
/*[clinic input]
os._path_normpath
@ -16799,6 +16842,7 @@ static PyMethodDef posix_methods[] = {
OS__FINDFIRSTFILE_METHODDEF
OS__GETVOLUMEPATHNAME_METHODDEF
OS__PATH_SPLITROOT_METHODDEF
OS__PATH_SPLITROOT_EX_METHODDEF
OS__PATH_NORMPATH_METHODDEF
OS_GETLOADAVG_METHODDEF
OS_URANDOM_METHODDEF

View File

@ -2295,6 +2295,99 @@ PathCchCombineEx(wchar_t *buffer, size_t bufsize, const wchar_t *dirname,
#endif /* defined(MS_WINDOWS_GAMES) && !defined(MS_WINDOWS_DESKTOP) */
void
_Py_skiproot(const wchar_t *path, Py_ssize_t size, Py_ssize_t *drvsize,
Py_ssize_t *rootsize)
{
assert(drvsize);
assert(rootsize);
#ifndef MS_WINDOWS
#define IS_SEP(x) (*(x) == SEP)
*drvsize = 0;
if (!IS_SEP(&path[0])) {
// Relative path, e.g.: 'foo'
*rootsize = 0;
}
else if (!IS_SEP(&path[1]) || IS_SEP(&path[2])) {
// Absolute path, e.g.: '/foo', '///foo', '////foo', etc.
*rootsize = 1;
}
else {
// Precisely two leading slashes, e.g.: '//foo'. Implementation defined per POSIX, see
// https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap04.html#tag_04_13
*rootsize = 2;
}
#undef IS_SEP
#else
const wchar_t *pEnd = size >= 0 ? &path[size] : NULL;
#define IS_END(x) (pEnd ? (x) == pEnd : !*(x))
#define IS_SEP(x) (*(x) == SEP || *(x) == ALTSEP)
#define SEP_OR_END(x) (IS_SEP(x) || IS_END(x))
if (IS_SEP(&path[0])) {
if (IS_SEP(&path[1])) {
// Device drives, e.g. \\.\device or \\?\device
// UNC drives, e.g. \\server\share or \\?\UNC\server\share
Py_ssize_t idx;
if (path[2] == L'?' && IS_SEP(&path[3]) &&
(path[4] == L'U' || path[4] == L'u') &&
(path[5] == L'N' || path[5] == L'n') &&
(path[6] == L'C' || path[6] == L'c') &&
IS_SEP(&path[7]))
{
idx = 8;
}
else {
idx = 2;
}
while (!SEP_OR_END(&path[idx])) {
idx++;
}
if (IS_END(&path[idx])) {
*drvsize = idx;
*rootsize = 0;
}
else {
idx++;
while (!SEP_OR_END(&path[idx])) {
idx++;
}
*drvsize = idx;
if (IS_END(&path[idx])) {
*rootsize = 0;
}
else {
*rootsize = 1;
}
}
}
else {
// Relative path with root, e.g. \Windows
*drvsize = 0;
*rootsize = 1;
}
}
else if (!IS_END(&path[0]) && path[1] == L':') {
*drvsize = 2;
if (IS_SEP(&path[2])) {
// Absolute drive-letter path, e.g. X:\Windows
*rootsize = 1;
}
else {
// Relative path with drive, e.g. X:Windows
*rootsize = 0;
}
}
else {
// Relative path, e.g. Windows
*drvsize = 0;
*rootsize = 0;
}
#undef SEP_OR_END
#undef IS_SEP
#undef IS_END
#endif
}
// The caller must ensure "buffer" is big enough.
static int
join_relfile(wchar_t *buffer, size_t bufsize,
@ -2411,49 +2504,39 @@ _Py_normpath_and_size(wchar_t *path, Py_ssize_t size, Py_ssize_t *normsize)
#endif
#define SEP_OR_END(x) (IS_SEP(x) || IS_END(x))
// Skip leading '.\'
if (p1[0] == L'.' && IS_SEP(&p1[1])) {
// Skip leading '.\'
path = &path[2];
while (IS_SEP(path) && !IS_END(path)) {
while (IS_SEP(path)) {
path++;
}
p1 = p2 = minP2 = path;
lastC = SEP;
}
#ifdef MS_WINDOWS
// Skip past drive segment and update minP2
else if (p1[0] && p1[1] == L':') {
*p2++ = *p1++;
*p2++ = *p1++;
minP2 = p2;
lastC = L':';
}
// Skip past all \\-prefixed paths, including \\?\, \\.\,
// and network paths, including the first segment.
else if (IS_SEP(&p1[0]) && IS_SEP(&p1[1])) {
int sepCount = 2;
*p2++ = SEP;
*p2++ = SEP;
p1 += 2;
for (; !IS_END(p1) && sepCount; ++p1) {
if (IS_SEP(p1)) {
--sepCount;
*p2++ = lastC = SEP;
} else {
*p2++ = lastC = *p1;
}
}
minP2 = p2 - 1;
}
else {
Py_ssize_t drvsize, rootsize;
_Py_skiproot(path, size, &drvsize, &rootsize);
if (drvsize || rootsize) {
// Skip past root and update minP2
p1 = &path[drvsize + rootsize];
#ifndef ALTSEP
p2 = p1;
#else
// Skip past two leading SEPs
else if (IS_SEP(&p1[0]) && IS_SEP(&p1[1]) && !IS_SEP(&p1[2])) {
*p2++ = *p1++;
*p2++ = *p1++;
minP2 = p2 - 1; // Absolute path has SEP at minP2
lastC = SEP;
for (; p2 < p1; ++p2) {
if (*p2 == ALTSEP) {
*p2 = SEP;
}
}
#endif
minP2 = p2 - 1;
lastC = *minP2;
#ifdef MS_WINDOWS
if (lastC != SEP) {
minP2++;
}
#endif
}
}
#endif /* MS_WINDOWS */
/* if pEnd is specified, check that. Else, check for null terminator */
for (; !IS_END(p1); ++p1) {