gh-120754: Refactor I/O modules to stash whole stat result rather than individual members (#123412)

Multiple places in the I/O stack optimize common cases by using the
information from stat. Currently individual members are extracted from
the stat and stored into the fileio struct. Refactor the code to store
the whole stat struct instead.

The pure-Python implementation parallels the changes to _io. The Python
`stat` result object doesn't allow changing members, so rather than
modifying estimated_size on truncate, the stored stat result is just cleared.
This commit is contained in:
Cody Maloney 2024-09-18 08:47:57 -07:00 committed by GitHub
parent 96f619faa7
commit 8b6c7c7877
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 81 additions and 46 deletions

View File

@ -242,14 +242,7 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None,
buffering = -1 buffering = -1
line_buffering = True line_buffering = True
if buffering < 0: if buffering < 0:
buffering = DEFAULT_BUFFER_SIZE buffering = raw._blksize
try:
bs = os.fstat(raw.fileno()).st_blksize
except (OSError, AttributeError):
pass
else:
if bs > 1:
buffering = bs
if buffering < 0: if buffering < 0:
raise ValueError("invalid buffering size") raise ValueError("invalid buffering size")
if buffering == 0: if buffering == 0:
@ -1565,19 +1558,15 @@ class FileIO(RawIOBase):
os.set_inheritable(fd, False) os.set_inheritable(fd, False)
self._closefd = closefd self._closefd = closefd
fdfstat = os.fstat(fd) self._stat_atopen = os.fstat(fd)
try: try:
if stat.S_ISDIR(fdfstat.st_mode): if stat.S_ISDIR(self._stat_atopen.st_mode):
raise IsADirectoryError(errno.EISDIR, raise IsADirectoryError(errno.EISDIR,
os.strerror(errno.EISDIR), file) os.strerror(errno.EISDIR), file)
except AttributeError: except AttributeError:
# Ignore the AttributeError if stat.S_ISDIR or errno.EISDIR # Ignore the AttributeError if stat.S_ISDIR or errno.EISDIR
# don't exist. # don't exist.
pass pass
self._blksize = getattr(fdfstat, 'st_blksize', 0)
if self._blksize <= 1:
self._blksize = DEFAULT_BUFFER_SIZE
self._estimated_size = fdfstat.st_size
if _setmode: if _setmode:
# don't translate newlines (\r\n <=> \n) # don't translate newlines (\r\n <=> \n)
@ -1623,6 +1612,17 @@ class FileIO(RawIOBase):
return ('<%s name=%r mode=%r closefd=%r>' % return ('<%s name=%r mode=%r closefd=%r>' %
(class_name, name, self.mode, self._closefd)) (class_name, name, self.mode, self._closefd))
# Block size to use for this file, taken from the stat result stored in
# _stat_atopen; DEFAULT_BUFFER_SIZE is returned when no usable value exists.
@property
def _blksize(self):
# No stat result is held (truncate() resets _stat_atopen to None).
if self._stat_atopen is None:
return DEFAULT_BUFFER_SIZE
# st_blksize may be missing from the stat result, hence getattr with 0.
blksize = getattr(self._stat_atopen, "st_blksize", 0)
# WASI sets st_blksize to 0
if not blksize:
return DEFAULT_BUFFER_SIZE
return blksize
def _checkReadable(self): def _checkReadable(self):
if not self._readable: if not self._readable:
raise UnsupportedOperation('File not open for reading') raise UnsupportedOperation('File not open for reading')
@ -1655,16 +1655,20 @@ class FileIO(RawIOBase):
""" """
self._checkClosed() self._checkClosed()
self._checkReadable() self._checkReadable()
if self._estimated_size <= 0: if self._stat_atopen is None or self._stat_atopen.st_size <= 0:
bufsize = DEFAULT_BUFFER_SIZE bufsize = DEFAULT_BUFFER_SIZE
else: else:
bufsize = self._estimated_size + 1 # In order to detect end of file, need a read() of at least 1
# byte which returns size 0. Oversize the buffer by 1 byte so the
# I/O can be completed with two read() calls (one for all data, one
# for EOF) without needing to resize the buffer.
bufsize = self._stat_atopen.st_size + 1
if self._estimated_size > 65536: if self._stat_atopen.st_size > 65536:
try: try:
pos = os.lseek(self._fd, 0, SEEK_CUR) pos = os.lseek(self._fd, 0, SEEK_CUR)
if self._estimated_size >= pos: if self._stat_atopen.st_size >= pos:
bufsize = self._estimated_size - pos + 1 bufsize = self._stat_atopen.st_size - pos + 1
except OSError: except OSError:
pass pass
@ -1742,7 +1746,7 @@ class FileIO(RawIOBase):
if size is None: if size is None:
size = self.tell() size = self.tell()
os.ftruncate(self._fd, size) os.ftruncate(self._fd, size)
self._estimated_size = size self._stat_atopen = None
return size return size
def close(self): def close(self):

View File

@ -74,8 +74,13 @@ typedef struct {
signed int seekable : 2; /* -1 means unknown */ signed int seekable : 2; /* -1 means unknown */
unsigned int closefd : 1; unsigned int closefd : 1;
char finalizing; char finalizing;
unsigned int blksize; /* Stat result which was grabbed at file open, useful for optimizing common
Py_off_t estimated_size; File I/O patterns to be more efficient. This is only guidance / an
estimate, as it is subject to Time-Of-Check to Time-Of-Use (TOCTOU)
issues / bugs. Both the underlying file descriptor and file may be
modified outside of the fileio object / Python (ex. gh-90102, GH-121941,
gh-109523). */
struct _Py_stat_struct *stat_atopen;
PyObject *weakreflist; PyObject *weakreflist;
PyObject *dict; PyObject *dict;
} fileio; } fileio;
@ -199,8 +204,7 @@ fileio_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
self->writable = 0; self->writable = 0;
self->appending = 0; self->appending = 0;
self->seekable = -1; self->seekable = -1;
self->blksize = 0; self->stat_atopen = NULL;
self->estimated_size = -1;
self->closefd = 1; self->closefd = 1;
self->weakreflist = NULL; self->weakreflist = NULL;
} }
@ -256,7 +260,6 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode,
#elif !defined(MS_WINDOWS) #elif !defined(MS_WINDOWS)
int *atomic_flag_works = NULL; int *atomic_flag_works = NULL;
#endif #endif
struct _Py_stat_struct fdfstat;
int fstat_result; int fstat_result;
int async_err = 0; int async_err = 0;
@ -454,9 +457,13 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode,
#endif #endif
} }
self->blksize = DEFAULT_BUFFER_SIZE; self->stat_atopen = PyMem_New(struct _Py_stat_struct, 1);
if (self->stat_atopen == NULL) {
PyErr_NoMemory();
goto error;
}
Py_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS
fstat_result = _Py_fstat_noraise(self->fd, &fdfstat); fstat_result = _Py_fstat_noraise(self->fd, self->stat_atopen);
Py_END_ALLOW_THREADS Py_END_ALLOW_THREADS
if (fstat_result < 0) { if (fstat_result < 0) {
/* Tolerate fstat() errors other than EBADF. See Issue #25717, where /* Tolerate fstat() errors other than EBADF. See Issue #25717, where
@ -471,25 +478,21 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode,
#endif #endif
goto error; goto error;
} }
PyMem_Free(self->stat_atopen);
self->stat_atopen = NULL;
} }
else { else {
#if defined(S_ISDIR) && defined(EISDIR) #if defined(S_ISDIR) && defined(EISDIR)
/* On Unix, open will succeed for directories. /* On Unix, open will succeed for directories.
In Python, there should be no file objects referring to In Python, there should be no file objects referring to
directories, so we need a check. */ directories, so we need a check. */
if (S_ISDIR(fdfstat.st_mode)) { if (S_ISDIR(self->stat_atopen->st_mode)) {
errno = EISDIR; errno = EISDIR;
PyErr_SetFromErrnoWithFilenameObject(PyExc_OSError, nameobj); PyErr_SetFromErrnoWithFilenameObject(PyExc_OSError, nameobj);
goto error; goto error;
} }
#endif /* defined(S_ISDIR) */ #endif /* defined(S_ISDIR) */
#ifdef HAVE_STRUCT_STAT_ST_BLKSIZE
if (fdfstat.st_blksize > 1)
self->blksize = fdfstat.st_blksize;
#endif /* HAVE_STRUCT_STAT_ST_BLKSIZE */
if (fdfstat.st_size < PY_SSIZE_T_MAX) {
self->estimated_size = (Py_off_t)fdfstat.st_size;
}
} }
#if defined(MS_WINDOWS) || defined(__CYGWIN__) #if defined(MS_WINDOWS) || defined(__CYGWIN__)
@ -521,6 +524,10 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode,
internal_close(self); internal_close(self);
_PyErr_ChainExceptions1(exc); _PyErr_ChainExceptions1(exc);
} }
if (self->stat_atopen != NULL) {
PyMem_Free(self->stat_atopen);
self->stat_atopen = NULL;
}
done: done:
#ifdef MS_WINDOWS #ifdef MS_WINDOWS
@ -553,6 +560,10 @@ fileio_dealloc(fileio *self)
if (_PyIOBase_finalize((PyObject *) self) < 0) if (_PyIOBase_finalize((PyObject *) self) < 0)
return; return;
_PyObject_GC_UNTRACK(self); _PyObject_GC_UNTRACK(self);
if (self->stat_atopen != NULL) {
PyMem_Free(self->stat_atopen);
self->stat_atopen = NULL;
}
if (self->weakreflist != NULL) if (self->weakreflist != NULL)
PyObject_ClearWeakRefs((PyObject *) self); PyObject_ClearWeakRefs((PyObject *) self);
(void)fileio_clear(self); (void)fileio_clear(self);
@ -725,20 +736,27 @@ _io_FileIO_readall_impl(fileio *self)
return err_closed(); return err_closed();
} }
end = self->estimated_size; if (self->stat_atopen != NULL && self->stat_atopen->st_size < _PY_READ_MAX) {
end = (Py_off_t)self->stat_atopen->st_size;
}
else {
end = -1;
}
if (end <= 0) { if (end <= 0) {
/* Use a default size and resize as needed. */ /* Use a default size and resize as needed. */
bufsize = SMALLCHUNK; bufsize = SMALLCHUNK;
} }
else { else {
/* This is probably a real file, so we try to allocate a /* This is probably a real file. */
buffer one byte larger than the rest of the file. If the
calculation is right then we should get EOF without having
to enlarge the buffer. */
if (end > _PY_READ_MAX - 1) { if (end > _PY_READ_MAX - 1) {
bufsize = _PY_READ_MAX; bufsize = _PY_READ_MAX;
} }
else { else {
/* In order to detect end of file, need a read() of at
least 1 byte which returns size 0. Oversize the buffer
by 1 byte so the I/O can be completed with two read()
calls (one for all data, one for EOF) without needing
to resize the buffer. */
bufsize = (size_t)end + 1; bufsize = (size_t)end + 1;
} }
@ -1094,11 +1112,13 @@ _io_FileIO_truncate_impl(fileio *self, PyTypeObject *cls, PyObject *posobj)
return NULL; return NULL;
} }
/* Sometimes a large file is truncated. While estimated_size is used as a /* Since the file was truncated, its size at open is no longer accurate
estimate, that it is much larger than the actual size can result in a as an estimate. Clear out the stat result, and rely on dynamic resize
significant over allocation and sometimes a MemoryError / running out of code if a readall is requested. */
memory. */ if (self->stat_atopen != NULL) {
self->estimated_size = pos; PyMem_Free(self->stat_atopen);
self->stat_atopen = NULL;
}
return posobj; return posobj;
} }
@ -1229,16 +1249,27 @@ get_mode(fileio *self, void *closure)
return PyUnicode_FromString(mode_string(self)); return PyUnicode_FromString(mode_string(self));
} }
/* Getter backing the `_blksize` attribute: returns st_blksize from the stat
   result stored in stat_atopen when it is present and greater than 1, and
   DEFAULT_BUFFER_SIZE otherwise. */
static PyObject *
get_blksize(fileio *self, void *closure)
{
#ifdef HAVE_STRUCT_STAT_ST_BLKSIZE
/* stat_atopen may be NULL: it is freed when fstat() fails at open and when
   the file is truncated. */
if (self->stat_atopen != NULL && self->stat_atopen->st_blksize > 1) {
return PyLong_FromLong(self->stat_atopen->st_blksize);
}
#endif /* HAVE_STRUCT_STAT_ST_BLKSIZE */
/* No usable block size: fall back to the default buffer size. */
return PyLong_FromLong(DEFAULT_BUFFER_SIZE);
}
static PyGetSetDef fileio_getsetlist[] = { static PyGetSetDef fileio_getsetlist[] = {
{"closed", (getter)get_closed, NULL, "True if the file is closed"}, {"closed", (getter)get_closed, NULL, "True if the file is closed"},
{"closefd", (getter)get_closefd, NULL, {"closefd", (getter)get_closefd, NULL,
"True if the file descriptor will be closed by close()."}, "True if the file descriptor will be closed by close()."},
{"mode", (getter)get_mode, NULL, "String giving the file mode"}, {"mode", (getter)get_mode, NULL, "String giving the file mode"},
{"_blksize", (getter)get_blksize, NULL, "Stat st_blksize if available"},
{NULL}, {NULL},
}; };
static PyMemberDef fileio_members[] = { static PyMemberDef fileio_members[] = {
{"_blksize", Py_T_UINT, offsetof(fileio, blksize), 0},
{"_finalizing", Py_T_BOOL, offsetof(fileio, finalizing), 0}, {"_finalizing", Py_T_BOOL, offsetof(fileio, finalizing), 0},
{"__weaklistoffset__", Py_T_PYSSIZET, offsetof(fileio, weakreflist), Py_READONLY}, {"__weaklistoffset__", Py_T_PYSSIZET, offsetof(fileio, weakreflist), Py_READONLY},
{"__dictoffset__", Py_T_PYSSIZET, offsetof(fileio, dict), Py_READONLY}, {"__dictoffset__", Py_T_PYSSIZET, offsetof(fileio, dict), Py_READONLY},