diff --git a/Include/internal/pycore_fileutils.h b/Include/internal/pycore_fileutils.h index 5c55282fa39..bc8100b58e8 100644 --- a/Include/internal/pycore_fileutils.h +++ b/Include/internal/pycore_fileutils.h @@ -290,6 +290,8 @@ extern wchar_t *_Py_normpath_and_size(wchar_t *path, Py_ssize_t size, Py_ssize_t extern HRESULT PathCchSkipRoot(const wchar_t *pszPath, const wchar_t **ppszRootEnd); #endif /* defined(MS_WINDOWS_GAMES) && !defined(MS_WINDOWS_DESKTOP) */ +extern void _Py_skiproot(const wchar_t *path, Py_ssize_t size, Py_ssize_t *drvsize, Py_ssize_t *rootsize); + // Macros to protect CRT calls against instant termination when passed an // invalid parameter (bpo-23524). IPH stands for Invalid Parameter Handler. // Usage: diff --git a/Lib/ntpath.py b/Lib/ntpath.py index aba18bfe407..e810b655e5a 100644 --- a/Lib/ntpath.py +++ b/Lib/ntpath.py @@ -167,56 +167,76 @@ def splitdrive(p): return drive, root + tail -def splitroot(p): - """Split a pathname into drive, root and tail. The drive is defined - exactly as in splitdrive(). On Windows, the root may be a single path - separator or an empty string. The tail contains anything after the root. - For example: +try: + from nt import _path_splitroot_ex +except ImportError: + def splitroot(p): + """Split a pathname into drive, root and tail. The drive is defined + exactly as in splitdrive(). On Windows, the root may be a single path + separator or an empty string. The tail contains anything after the root. + For example: - splitroot('//server/share/') == ('//server/share', '/', '') - splitroot('C:/Users/Barney') == ('C:', '/', 'Users/Barney') - splitroot('C:///spam///ham') == ('C:', '/', '//spam///ham') - splitroot('Windows/notepad') == ('', '', 'Windows/notepad') - """ - p = os.fspath(p) - if isinstance(p, bytes): - sep = b'\\' - altsep = b'/' - colon = b':' - unc_prefix = b'\\\\?\\UNC\\' - empty = b'' - else: - sep = '\\' - altsep = '/' - colon = ':' - unc_prefix = '\\\\?\\UNC\\' - empty = '' - normp = p.replace(altsep, sep) - if normp[:1] == sep: - if normp[1:2] == sep: - # UNC drives, e.g. \\server\share or \\?\UNC\server\share - # Device drives, e.g. \\.\device or \\?\device - start = 8 if normp[:8].upper() == unc_prefix else 2 - index = normp.find(sep, start) - if index == -1: - return p, empty, empty - index2 = normp.find(sep, index + 1) - if index2 == -1: - return p, empty, empty - return p[:index2], p[index2:index2 + 1], p[index2 + 1:] + splitroot('//server/share/') == ('//server/share', '/', '') + splitroot('C:/Users/Barney') == ('C:', '/', 'Users/Barney') + splitroot('C:///spam///ham') == ('C:', '/', '//spam///ham') + splitroot('Windows/notepad') == ('', '', 'Windows/notepad') + """ + p = os.fspath(p) + if isinstance(p, bytes): + sep = b'\\' + altsep = b'/' + colon = b':' + unc_prefix = b'\\\\?\\UNC\\' + empty = b'' else: - # Relative path with root, e.g. \Windows - return empty, p[:1], p[1:] - elif normp[1:2] == colon: - if normp[2:3] == sep: - # Absolute drive-letter path, e.g. X:\Windows - return p[:2], p[2:3], p[3:] + sep = '\\' + altsep = '/' + colon = ':' + unc_prefix = '\\\\?\\UNC\\' + empty = '' + normp = p.replace(altsep, sep) + if normp[:1] == sep: + if normp[1:2] == sep: + # UNC drives, e.g. \\server\share or \\?\UNC\server\share + # Device drives, e.g. \\.\device or \\?\device + start = 8 if normp[:8].upper() == unc_prefix else 2 + index = normp.find(sep, start) + if index == -1: + return p, empty, empty + index2 = normp.find(sep, index + 1) + if index2 == -1: + return p, empty, empty + return p[:index2], p[index2:index2 + 1], p[index2 + 1:] + else: + # Relative path with root, e.g. \Windows + return empty, p[:1], p[1:] + elif normp[1:2] == colon: + if normp[2:3] == sep: + # Absolute drive-letter path, e.g. X:\Windows + return p[:2], p[2:3], p[3:] + else: + # Relative path with drive, e.g. X:Windows + return p[:2], empty, p[2:] else: - # Relative path with drive, e.g. X:Windows - return p[:2], empty, p[2:] - else: - # Relative path, e.g. Windows - return empty, empty, p + # Relative path, e.g. Windows + return empty, empty, p +else: + def splitroot(p): + """Split a pathname into drive, root and tail. The drive is defined + exactly as in splitdrive(). On Windows, the root may be a single path + separator or an empty string. The tail contains anything after the root. + For example: + + splitroot('//server/share/') == ('//server/share', '/', '') + splitroot('C:/Users/Barney') == ('C:', '/', 'Users/Barney') + splitroot('C:///spam///ham') == ('C:', '/', '//spam///ham') + splitroot('Windows/notepad') == ('', '', 'Windows/notepad') + """ + p = os.fspath(p) + if isinstance(p, bytes): + drive, root, tail = _path_splitroot_ex(os.fsdecode(p)) + return os.fsencode(drive), os.fsencode(root), os.fsencode(tail) + return _path_splitroot_ex(p) # Split a path in head (everything up to the last '/') and tail (the diff --git a/Lib/posixpath.py b/Lib/posixpath.py index f1960ddb88e..56b7915826d 100644 --- a/Lib/posixpath.py +++ b/Lib/posixpath.py @@ -134,33 +134,53 @@ def splitdrive(p): return p[:0], p -def splitroot(p): - """Split a pathname into drive, root and tail. On Posix, drive is always - empty; the root may be empty, a single slash, or two slashes. The tail - contains anything after the root. For example: +try: + from posix import _path_splitroot_ex +except ImportError: + def splitroot(p): + """Split a pathname into drive, root and tail. On Posix, drive is always + empty; the root may be empty, a single slash, or two slashes. The tail + contains anything after the root. For example: - splitroot('foo/bar') == ('', '', 'foo/bar') - splitroot('/foo/bar') == ('', '/', 'foo/bar') - splitroot('//foo/bar') == ('', '//', 'foo/bar') - splitroot('///foo/bar') == ('', '/', '//foo/bar') - """ - p = os.fspath(p) - if isinstance(p, bytes): - sep = b'/' - empty = b'' - else: - sep = '/' - empty = '' - if p[:1] != sep: - # Relative path, e.g.: 'foo' - return empty, empty, p - elif p[1:2] != sep or p[2:3] == sep: - # Absolute path, e.g.: '/foo', '///foo', '////foo', etc. - return empty, sep, p[1:] - else: - # Precisely two leading slashes, e.g.: '//foo'. Implementation defined per POSIX, see - # https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap04.html#tag_04_13 - return empty, p[:2], p[2:] + splitroot('foo/bar') == ('', '', 'foo/bar') + splitroot('/foo/bar') == ('', '/', 'foo/bar') + splitroot('//foo/bar') == ('', '//', 'foo/bar') + splitroot('///foo/bar') == ('', '/', '//foo/bar') + """ + p = os.fspath(p) + if isinstance(p, bytes): + sep = b'/' + empty = b'' + else: + sep = '/' + empty = '' + if p[:1] != sep: + # Relative path, e.g.: 'foo' + return empty, empty, p + elif p[1:2] != sep or p[2:3] == sep: + # Absolute path, e.g.: '/foo', '///foo', '////foo', etc. + return empty, sep, p[1:] + else: + # Precisely two leading slashes, e.g.: '//foo'. Implementation defined per POSIX, see + # https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap04.html#tag_04_13 + return empty, p[:2], p[2:] +else: + def splitroot(p): + """Split a pathname into drive, root and tail. On Posix, drive is always + empty; the root may be empty, a single slash, or two slashes. The tail + contains anything after the root. For example: + + splitroot('foo/bar') == ('', '', 'foo/bar') + splitroot('/foo/bar') == ('', '/', 'foo/bar') + splitroot('//foo/bar') == ('', '//', 'foo/bar') + splitroot('///foo/bar') == ('', '/', '//foo/bar') + """ + p = os.fspath(p) + if isinstance(p, bytes): + # Optimisation: the drive is always empty + _, root, tail = _path_splitroot_ex(os.fsdecode(p)) + return b'', os.fsencode(root), os.fsencode(tail) + return _path_splitroot_ex(p) # Return the tail (basename) part of a path, same as split(path)[1]. diff --git a/Lib/test/test_ntpath.py b/Lib/test/test_ntpath.py index 31156130fcc..7f91bf1c2b8 100644 --- a/Lib/test/test_ntpath.py +++ b/Lib/test/test_ntpath.py @@ -374,6 +374,7 @@ class TestNtpath(NtpathTestCase): tester("ntpath.normpath('\\\\foo\\')", '\\\\foo\\') tester("ntpath.normpath('\\\\foo')", '\\\\foo') tester("ntpath.normpath('\\\\')", '\\\\') + tester("ntpath.normpath('//?/UNC/server/share/..')", '\\\\?\\UNC\\server\\share\\') def test_realpath_curdir(self): expected = ntpath.normpath(os.getcwd()) diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-04-19-08-50-48.gh-issue-102511.qDEB66.rst b/Misc/NEWS.d/next/Core and Builtins/2024-04-19-08-50-48.gh-issue-102511.qDEB66.rst new file mode 100644 index 00000000000..dfdf2507107 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2024-04-19-08-50-48.gh-issue-102511.qDEB66.rst @@ -0,0 +1 @@ +Speed up :func:`os.path.splitroot` with a native implementation. diff --git a/Modules/clinic/posixmodule.c.h b/Modules/clinic/posixmodule.c.h index 0398629e3c1..a0d1f3238a6 100644 --- a/Modules/clinic/posixmodule.c.h +++ b/Modules/clinic/posixmodule.c.h @@ -2248,6 +2248,64 @@ exit: #endif /* defined(MS_WINDOWS) */ +PyDoc_STRVAR(os__path_splitroot_ex__doc__, +"_path_splitroot_ex($module, /, path)\n" +"--\n" +"\n"); + +#define OS__PATH_SPLITROOT_EX_METHODDEF \ + {"_path_splitroot_ex", _PyCFunction_CAST(os__path_splitroot_ex), METH_FASTCALL|METH_KEYWORDS, os__path_splitroot_ex__doc__}, + +static PyObject * +os__path_splitroot_ex_impl(PyObject *module, PyObject *path); + +static PyObject * +os__path_splitroot_ex(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 1 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_item = { &_Py_ID(path), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"path", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "_path_splitroot_ex", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[1]; + PyObject *path; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 1, 1, 0, argsbuf); + if (!args) { + goto exit; + } + if (!PyUnicode_Check(args[0])) { + _PyArg_BadArgument("_path_splitroot_ex", "argument 'path'", "str", args[0]); + goto exit; + } + path = args[0]; + return_value = os__path_splitroot_ex_impl(module, path); + +exit: + return return_value; +} + PyDoc_STRVAR(os__path_normpath__doc__, "_path_normpath($module, /, path)\n" "--\n" @@ -12602,4 +12660,4 @@ os__supports_virtual_terminal(PyObject *module, PyObject *Py_UNUSED(ignored)) #ifndef OS__SUPPORTS_VIRTUAL_TERMINAL_METHODDEF #define OS__SUPPORTS_VIRTUAL_TERMINAL_METHODDEF #endif /* !defined(OS__SUPPORTS_VIRTUAL_TERMINAL_METHODDEF) */ -/*[clinic end generated code: output=511f0788a6b90db0 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=c4698b47007cd6eb input=a9049054013a1b77]*/ diff --git a/Modules/posixmodule.c b/Modules/posixmodule.c index 5e54cf64cd5..c9d67ccbb8c 100644 --- a/Modules/posixmodule.c +++ b/Modules/posixmodule.c @@ -5467,6 +5467,49 @@ os__path_islink_impl(PyObject *module, PyObject *path) #endif /* MS_WINDOWS */ +/*[clinic input] +os._path_splitroot_ex + + path: unicode + +[clinic start generated code]*/ + +static PyObject * +os__path_splitroot_ex_impl(PyObject *module, PyObject *path) +/*[clinic end generated code: output=de97403d3dfebc40 input=f1470e12d899f9ac]*/ +{ + Py_ssize_t len, drvsize, rootsize; + PyObject *drv = NULL, *root = NULL, *tail = NULL, *result = NULL; + + wchar_t *buffer = PyUnicode_AsWideCharString(path, &len); + if (!buffer) { + goto exit; + } + + _Py_skiproot(buffer, len, &drvsize, &rootsize); + drv = PyUnicode_FromWideChar(buffer, drvsize); + if (drv == NULL) { + goto exit; + } + root = PyUnicode_FromWideChar(&buffer[drvsize], rootsize); + if (root == NULL) { + goto exit; + } + tail = PyUnicode_FromWideChar(&buffer[drvsize + rootsize], + len - drvsize - rootsize); + if (tail == NULL) { + goto exit; + } + result = Py_BuildValue("(OOO)", drv, root, tail); +exit: + PyMem_Free(buffer); + Py_XDECREF(drv); + Py_XDECREF(root); + Py_XDECREF(tail); + return result; +} + + /*[clinic input] os._path_normpath @@ -16799,6 +16842,7 @@ static PyMethodDef posix_methods[] = { OS__FINDFIRSTFILE_METHODDEF OS__GETVOLUMEPATHNAME_METHODDEF OS__PATH_SPLITROOT_METHODDEF + OS__PATH_SPLITROOT_EX_METHODDEF OS__PATH_NORMPATH_METHODDEF OS_GETLOADAVG_METHODDEF OS_URANDOM_METHODDEF diff --git a/Python/fileutils.c b/Python/fileutils.c index 882d3299575..54853ba2f75 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -2295,6 +2295,99 @@ PathCchCombineEx(wchar_t *buffer, size_t bufsize, const wchar_t *dirname, #endif /* defined(MS_WINDOWS_GAMES) && !defined(MS_WINDOWS_DESKTOP) */ +void +_Py_skiproot(const wchar_t *path, Py_ssize_t size, Py_ssize_t *drvsize, + Py_ssize_t *rootsize) +{ + assert(drvsize); + assert(rootsize); +#ifndef MS_WINDOWS +#define IS_SEP(x) (*(x) == SEP) + *drvsize = 0; + if (!IS_SEP(&path[0])) { + // Relative path, e.g.: 'foo' + *rootsize = 0; + } + else if (!IS_SEP(&path[1]) || IS_SEP(&path[2])) { + // Absolute path, e.g.: '/foo', '///foo', '////foo', etc. + *rootsize = 1; + } + else { + // Precisely two leading slashes, e.g.: '//foo'. Implementation defined per POSIX, see + // https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap04.html#tag_04_13 + *rootsize = 2; + } +#undef IS_SEP +#else + const wchar_t *pEnd = size >= 0 ? &path[size] : NULL; +#define IS_END(x) (pEnd ? (x) == pEnd : !*(x)) +#define IS_SEP(x) (*(x) == SEP || *(x) == ALTSEP) +#define SEP_OR_END(x) (IS_SEP(x) || IS_END(x)) + if (IS_SEP(&path[0])) { + if (IS_SEP(&path[1])) { + // Device drives, e.g. \\.\device or \\?\device + // UNC drives, e.g. \\server\share or \\?\UNC\server\share + Py_ssize_t idx; + if (path[2] == L'?' && IS_SEP(&path[3]) && + (path[4] == L'U' || path[4] == L'u') && + (path[5] == L'N' || path[5] == L'n') && + (path[6] == L'C' || path[6] == L'c') && + IS_SEP(&path[7])) + { + idx = 8; + } + else { + idx = 2; + } + while (!SEP_OR_END(&path[idx])) { + idx++; + } + if (IS_END(&path[idx])) { + *drvsize = idx; + *rootsize = 0; + } + else { + idx++; + while (!SEP_OR_END(&path[idx])) { + idx++; + } + *drvsize = idx; + if (IS_END(&path[idx])) { + *rootsize = 0; + } + else { + *rootsize = 1; + } + } + } + else { + // Relative path with root, e.g. \Windows + *drvsize = 0; + *rootsize = 1; + } + } + else if (!IS_END(&path[0]) && path[1] == L':') { + *drvsize = 2; + if (IS_SEP(&path[2])) { + // Absolute drive-letter path, e.g. X:\Windows + *rootsize = 1; + } + else { + // Relative path with drive, e.g. X:Windows + *rootsize = 0; + } + } + else { + // Relative path, e.g. Windows + *drvsize = 0; + *rootsize = 0; + } +#undef SEP_OR_END +#undef IS_SEP +#undef IS_END +#endif +} + // The caller must ensure "buffer" is big enough. static int join_relfile(wchar_t *buffer, size_t bufsize, @@ -2411,49 +2504,39 @@ _Py_normpath_and_size(wchar_t *path, Py_ssize_t size, Py_ssize_t *normsize) #endif #define SEP_OR_END(x) (IS_SEP(x) || IS_END(x)) - // Skip leading '.\' if (p1[0] == L'.' && IS_SEP(&p1[1])) { + // Skip leading '.\' path = &path[2]; - while (IS_SEP(path) && !IS_END(path)) { + while (IS_SEP(path)) { path++; } p1 = p2 = minP2 = path; lastC = SEP; } -#ifdef MS_WINDOWS - // Skip past drive segment and update minP2 - else if (p1[0] && p1[1] == L':') { - *p2++ = *p1++; - *p2++ = *p1++; - minP2 = p2; - lastC = L':'; - } - // Skip past all \\-prefixed paths, including \\?\, \\.\, - // and network paths, including the first segment. - else if (IS_SEP(&p1[0]) && IS_SEP(&p1[1])) { - int sepCount = 2; - *p2++ = SEP; - *p2++ = SEP; - p1 += 2; - for (; !IS_END(p1) && sepCount; ++p1) { - if (IS_SEP(p1)) { - --sepCount; - *p2++ = lastC = SEP; - } else { - *p2++ = lastC = *p1; - } - } - minP2 = p2 - 1; - } + else { + Py_ssize_t drvsize, rootsize; + _Py_skiproot(path, size, &drvsize, &rootsize); + if (drvsize || rootsize) { + // Skip past root and update minP2 + p1 = &path[drvsize + rootsize]; +#ifndef ALTSEP + p2 = p1; #else - // Skip past two leading SEPs - else if (IS_SEP(&p1[0]) && IS_SEP(&p1[1]) && !IS_SEP(&p1[2])) { - *p2++ = *p1++; - *p2++ = *p1++; - minP2 = p2 - 1; // Absolute path has SEP at minP2 - lastC = SEP; + for (; p2 < p1; ++p2) { + if (*p2 == ALTSEP) { + *p2 = SEP; + } + } +#endif + minP2 = p2 - 1; + lastC = *minP2; +#ifdef MS_WINDOWS + if (lastC != SEP) { + minP2++; + } +#endif + } } -#endif /* MS_WINDOWS */ /* if pEnd is specified, check that. Else, check for null terminator */ for (; !IS_END(p1); ++p1) {