Issue 2918: Merge StringIO and cStringIO.

This commit is contained in:
Alexandre Vassalotti 2008-06-11 22:58:36 +00:00
parent 502d89ed15
commit 794652dd06
8 changed files with 702 additions and 22 deletions

286
Lib/io.py
View File

@ -1769,19 +1769,19 @@ class TextIOWrapper(TextIOBase):
def newlines(self):
return self._decoder.newlines if self._decoder else None
class StringIO(TextIOWrapper):
"""An in-memory stream for text. The initial_value argument sets the
value of object. The other arguments are like those of TextIOWrapper's
constructor.
class _StringIO(TextIOWrapper):
"""Text I/O implementation using an in-memory buffer.
The initial_value argument sets the value of object. The newline
argument is like the one of TextIOWrapper's constructor.
"""
# XXX This is really slow, but fully functional
def __init__(self, initial_value="", encoding="utf-8",
errors="strict", newline="\n"):
super(StringIO, self).__init__(BytesIO(),
encoding=encoding,
errors=errors,
def __init__(self, initial_value="", newline="\n"):
super(_StringIO, self).__init__(BytesIO(),
encoding="utf-8",
errors="strict",
newline=newline)
if initial_value:
if not isinstance(initial_value, str):
@ -1792,3 +1792,271 @@ class StringIO(TextIOWrapper):
def getvalue(self):
self.flush()
return self.buffer.getvalue().decode(self._encoding, self._errors)
try:
    import _stringio

    # This subclass is a reimplementation of the TextIOWrapper
    # interface without any of its text decoding facilities. All the
    # stored data is manipulated with the efficient
    # _stringio._StringIO extension type. Also, the newline decoding
    # mechanism of IncrementalNewlineDecoder is reimplemented here for
    # efficiency. Doing otherwise would require us to implement a
    # fake decoder which would add an additional and unnecessary layer
    # on top of the _StringIO methods.

    class StringIO(_stringio._StringIO, TextIOBase):
        """Text I/O implementation using an in-memory buffer.

        The initial_value argument sets the value of object. The newline
        argument is like the one of TextIOWrapper's constructor.
        """

        # Number of characters requested per read() while readline()
        # is looking for a line ending.
        _CHUNK_SIZE = 4096

        # Bit flags accumulated in self._seennl; the resulting value is
        # used as an index into the tuple built by the newlines property.
        _LF = 1
        _CR = 2
        _CRLF = 4

        def __init__(self, initial_value="", newline="\n"):
            """Set up newline handling and load initial_value.

            Raises ValueError when newline is not one of None, "",
            "\\n", "\\r" or "\\r\\n".
            """
            if newline not in (None, "", "\n", "\r", "\r\n"):
                raise ValueError("illegal newline value: %r" % (newline,))

            self._readuniversal = not newline
            self._readtranslate = newline is None
            self._readnl = newline
            self._writetranslate = newline != ""
            self._writenl = newline or os.linesep

            # Text already fetched from the C buffer but not yet handed
            # to the caller.
            self._pending = ""
            self._seennl = 0

            # Reset the buffer first, in case __init__ is called
            # multiple times.
            self.truncate(0)

            if initial_value is None:
                initial_value = ""
            self.write(initial_value)
            self.seek(0)

        @property
        def buffer(self):
            """Always raises: there is no underlying binary buffer."""
            raise UnsupportedOperation("%s.buffer attribute is unsupported" %
                                       self.__class__.__name__)

        def _decode_newlines(self, input, final=False):
            """Apply newline bookkeeping/translation to freshly read text.

            Any text held in self._pending is prepended to input first.
            Unless final is true, a trailing "\\r" is held back in
            self._pending so that a "\\r\\n" pair is never split.
            """
            # decode input (with the eventual \r from a previous pass)
            if self._pending:
                input = self._pending + input

            # retain last \r even when not translating data:
            # then readline() is sure to get \r\n in one pass
            if input.endswith("\r") and not final:
                input = input[:-1]
                self._pending = "\r"
            else:
                self._pending = ""

            # Record which newlines are read
            crlf = input.count('\r\n')
            cr = input.count('\r') - crlf
            lf = input.count('\n') - crlf
            self._seennl |= (lf and self._LF) | (cr and self._CR) \
                            | (crlf and self._CRLF)

            # Start from the untouched text so the result is always bound,
            # and chain the replacements so that "\r\n" is collapsed before
            # any remaining lone "\r" is translated.
            output = input
            if self._readtranslate:
                if crlf:
                    output = output.replace("\r\n", "\n")
                if cr:
                    output = output.replace("\r", "\n")
            return output

        def writable(self):
            return True

        def readable(self):
            return True

        def seekable(self):
            return True

        # Direct handles on the C implementations, so the overriding
        # methods below can delegate to them.
        _read = _stringio._StringIO.read
        _write = _stringio._StringIO.write
        _tell = _stringio._StringIO.tell
        _seek = _stringio._StringIO.seek
        _truncate = _stringio._StringIO.truncate
        _getvalue = _stringio._StringIO.getvalue

        def getvalue(self) -> str:
            """Retrieve the entire contents of the object."""
            if self.closed:
                raise ValueError("read on closed file")
            return self._getvalue()

        def write(self, s: str) -> int:
            """Write string s to file.

            Returns the number of characters written.
            """
            if self.closed:
                raise ValueError("write to closed file")
            if not isinstance(s, str):
                raise TypeError("can't write %s to text stream" %
                                s.__class__.__name__)
            # The reported length is that of the caller's string, before
            # any newline translation.
            length = len(s)
            if self._writetranslate and self._writenl != "\n":
                s = s.replace("\n", self._writenl)
            self._pending = ""
            self._write(s)
            return length

        def read(self, n: int = None) -> str:
            """Read at most n characters, returned as a string.

            If the argument is negative or omitted, read until EOF
            is reached. Return an empty string at EOF.
            """
            if self.closed:
                raise ValueError("read to closed file")
            if n is None:
                n = -1
            if n < 0:
                # _decode_newlines() already prepends self._pending, so
                # it must not be added a second time here.
                res = self._decode_newlines(self._read(), True)
                self._pending = ""
                return res
            res = self._decode_newlines(self._read(n), True)
            # Whatever exceeds the requested size stays pending.
            self._pending = res[n:]
            return res[:n]

        def tell(self) -> int:
            """Tell the current file position."""
            if self.closed:
                raise ValueError("tell from closed file")
            if self._pending:
                # Pending text was consumed from the C buffer but not
                # yet returned to the caller.
                return self._tell() - len(self._pending)
            else:
                return self._tell()

        def seek(self, pos: int = None, whence: int = 0) -> int:
            """Change stream position.

            Seek to character offset pos relative to position indicated
            by whence:
                0  Start of stream (the default).  pos should be >= 0;
                1  Current position - pos must be 0;
                2  End of stream - pos must be 0.
            Returns the new absolute position.
            """
            if self.closed:
                raise ValueError("seek from closed file")
            self._pending = ""
            return self._seek(pos, whence)

        def truncate(self, pos: int = None) -> int:
            """Truncate size to pos.

            The pos argument defaults to the current file position, as
            returned by tell().  Imply an absolute seek to pos.
            Returns the new absolute position.
            """
            if self.closed:
                raise ValueError("truncate from closed file")
            self._pending = ""
            return self._truncate(pos)

        def readline(self, limit: int = None) -> str:
            """Read and return one line, at most limit characters long.

            A limit of None or a negative value means no limit.
            """
            if self.closed:
                raise ValueError("read from closed file")
            if limit is None:
                limit = -1
            if limit >= 0:
                # XXX: Hack to support limit argument, for backwards
                # XXX  compatibility
                line = self.readline()
                if len(line) <= limit:
                    return line
                line, self._pending = line[:limit], line[limit:] + self._pending
                return line

            line = self._pending
            self._pending = ""

            start = 0
            pos = endpos = None
            while True:
                if self._readtranslate:
                    # Newlines are already translated, only search for \n
                    pos = line.find('\n', start)
                    if pos >= 0:
                        endpos = pos + 1
                        break
                    else:
                        start = len(line)

                elif self._readuniversal:
                    # Universal newline search. Find any of \r, \r\n, \n
                    # The decoder ensures that \r\n are not split in two pieces

                    # In C we'd look for these in parallel of course.
                    nlpos = line.find("\n", start)
                    crpos = line.find("\r", start)
                    if crpos == -1:
                        if nlpos == -1:
                            # Nothing found
                            start = len(line)
                        else:
                            # Found \n
                            endpos = nlpos + 1
                            break
                    elif nlpos == -1:
                        # Found lone \r
                        endpos = crpos + 1
                        break
                    elif nlpos < crpos:
                        # Found \n
                        endpos = nlpos + 1
                        break
                    elif nlpos == crpos + 1:
                        # Found \r\n
                        endpos = crpos + 2
                        break
                    else:
                        # Found \r
                        endpos = crpos + 1
                        break
                else:
                    # non-universal
                    pos = line.find(self._readnl)
                    if pos >= 0:
                        endpos = pos + len(self._readnl)
                        break

                # No line ending seen yet - get more data
                more_line = self.read(self._CHUNK_SIZE)
                if more_line:
                    line += more_line
                else:
                    # end of file
                    return line

            self._pending = line[endpos:]
            return line[:endpos]

        @property
        def newlines(self):
            """Line endings seen so far, selected by the _seennl flags."""
            return (None,
                    "\n",
                    "\r",
                    ("\r", "\n"),
                    "\r\n",
                    ("\n", "\r\n"),
                    ("\r", "\r\n"),
                    ("\r", "\n", "\r\n")
                    )[self._seennl]

except ImportError:
    StringIO = _StringIO

View File

@ -10,7 +10,7 @@ import io
import sys
try:
import _bytesio
import _bytesio, _stringio
has_c_implementation = True
except ImportError:
has_c_implementation = False
@ -373,7 +373,7 @@ class PyBytesIOTest(MemoryTestMixin, unittest.TestCase):
class PyStringIOTest(MemoryTestMixin, unittest.TestCase):
buftype = str
ioclass = io.StringIO
ioclass = io._StringIO
EOF = ""
def test_relative_seek(self):
@ -404,10 +404,14 @@ if has_c_implementation:
class CBytesIOTest(PyBytesIOTest):
ioclass = io.BytesIO
class CStringIOTest(PyStringIOTest):
ioclass = io.StringIO
def test_main():
tests = [PyBytesIOTest, PyStringIOTest]
if has_c_implementation:
tests.extend([CBytesIOTest])
tests.extend([CBytesIOTest, CStringIOTest])
support.run_unittest(*tests)
if __name__ == '__main__':

View File

@ -3,7 +3,6 @@
import os
import sys
import pickle
from io import StringIO
from test.support import verbose, run_unittest, TestSkipped
import unittest
@ -80,7 +79,7 @@ class MinidomTest(unittest.TestCase):
self.confirm(t == s, "looking for %s, found %s" % (repr(s), repr(t)))
def testParseFromFile(self):
dom = parse(StringIO(open(tstfile).read()))
dom = parse(open(tstfile))
dom.unlink()
self.confirm(isinstance(dom, Document))

View File

@ -17,6 +17,32 @@ encodedtext = b"""\
M5&AE('-M;V]T:\"US8V%L960@<'ET:&]N(&-R97!T(&]V97(@=&AE('-L965P
(:6YG(&1O9PH """
# Stolen from io.py
class FakeIO(io.TextIOWrapper):
    """In-memory text stream layered over an io.BytesIO buffer.

    Can be used as a drop-in replacement for sys.stdin and sys.stdout.
    """

    # XXX This is really slow, but fully functional

    def __init__(self, initial_value="", encoding="utf-8",
                 errors="strict", newline="\n"):
        """Create the stream and, if given, preload it with initial_value."""
        backing = io.BytesIO()
        super().__init__(backing,
                         encoding=encoding,
                         errors=errors,
                         newline=newline)
        if not initial_value:
            return
        # Non-string initial values are stored as their str() form.
        text = initial_value
        if not isinstance(text, str):
            text = str(text)
        self.write(text)
        # Rewind so the preloaded text can be read from the start.
        self.seek(0)

    def getvalue(self):
        """Return everything written so far, decoded back to text."""
        self.flush()
        raw = self.buffer.getvalue()
        return raw.decode(self._encoding, self._errors)
def encodedtextwrapped(mode, filename):
    """Frame the module-level ``encodedtext`` bytes with begin/end lines.

    The header is "begin <mode as 3-digit octal> <filename>" encoded as
    ASCII; the trailer is the fixed bytes that close the payload.
    """
    header = bytes("begin %03o %s\n" % (mode, filename), "ascii")
    trailer = b"\n \nend\n"
    return header + encodedtext + trailer
@ -76,15 +102,15 @@ class UUStdIOTest(unittest.TestCase):
sys.stdout = self.stdout
def test_encode(self):
sys.stdin = io.StringIO(plaintext.decode("ascii"))
sys.stdout = io.StringIO()
sys.stdin = FakeIO(plaintext.decode("ascii"))
sys.stdout = FakeIO()
uu.encode("-", "-", "t1", 0o666)
self.assertEqual(sys.stdout.getvalue(),
encodedtextwrapped(0o666, "t1").decode("ascii"))
def test_decode(self):
sys.stdin = io.StringIO(encodedtextwrapped(0o666, "t1").decode("ascii"))
sys.stdout = io.StringIO()
sys.stdin = FakeIO(encodedtextwrapped(0o666, "t1").decode("ascii"))
sys.stdout = FakeIO()
uu.decode("-", "-")
stdout = sys.stdout
sys.stdout = self.stdout

View File

@ -14,6 +14,7 @@ Todo:
* SAX 2 namespaces
"""
import codecs
import io
import xml.dom
@ -49,16 +50,16 @@ class Node(xml.dom.Node):
# indent = the indentation string to prepend, per level
# newl = the newline string to append
use_encoding = "utf-8" if encoding is None else encoding
writer = io.StringIO(encoding=use_encoding)
writer = codecs.getwriter(use_encoding)(io.BytesIO())
if self.nodeType == Node.DOCUMENT_NODE:
# Can pass encoding only to document, to put it into XML header
self.writexml(writer, "", indent, newl, encoding)
else:
self.writexml(writer, "", indent, newl)
if encoding is None:
return writer.getvalue()
return writer.stream.getvalue().decode(use_encoding)
else:
return writer.buffer.getvalue()
return writer.stream.getvalue()
def hasChildNodes(self):
if self.childNodes:

View File

@ -78,6 +78,8 @@ Extension Modules
Library
-------
- Added C optimized implementation of io.StringIO.
- The ``pickle`` module now automatically uses an optimized C
implementation of Pickler and Unpickler when available. The
``cPickle`` module is no longer needed.

379
Modules/_stringio.c Normal file
View File

@ -0,0 +1,379 @@
#include "Python.h"
/* This module is a stripped down version of _bytesio.c with a Py_UNICODE
buffer. Most of the functionality is provided by subclassing _StringIO. */
/* Instance layout of the _stringio._StringIO extension type. */
typedef struct {
PyObject_HEAD
/* Character storage; (re)allocated by resize_buffer(). */
Py_UNICODE *buf;
/* Current stream position, in characters (what tell() reports). */
Py_ssize_t pos;
/* Number of characters currently stored (the length getvalue() returns). */
Py_ssize_t string_size;
/* Allocated capacity of buf, in characters. */
size_t buf_size;
} StringIOObject;
/* Adjust the capacity, counted in characters, of a StringIO object's
   buffer so that it can hold 'size' characters.  Callers must pass a
   non-negative 'size'.  Returns 0 on success; on failure sets an
   exception and returns -1. */
static int
resize_buffer(StringIOObject *self, size_t size)
{
    /* Unsigned arithmetic is used throughout so that signed integer
       overflow (undefined behavior in C) never comes into play. */
    size_t capacity = self->buf_size;
    Py_UNICODE *resized;

    assert(self->buf != NULL);

    /* Stay inside the signed range for simplicity; Python strings cannot
       be longer than this anyway. */
    if (size > PY_SSIZE_T_MAX) {
        PyErr_SetString(PyExc_OverflowError,
                        "new buffer size too large");
        return -1;
    }

    /* Already fits, and is not small enough to be worth shrinking. */
    if (size >= capacity / 2 && size < capacity)
        return 0;

    if (size >= capacity && size <= capacity * 1.125) {
        /* Moderate growth: over-allocate, similar to list_resize(). */
        capacity = size + (size >> 3) + (size < 9 ? 3 : 6);
    }
    else {
        /* Major shrink or major growth: resize to the exact size. */
        capacity = size + 1;
    }

    /* Guard the byte-count multiplication below against wrap-around. */
    if (capacity > ((size_t)-1) / sizeof(Py_UNICODE)) {
        PyErr_SetString(PyExc_OverflowError,
                        "new buffer size too large");
        return -1;
    }

    resized = (Py_UNICODE *)PyMem_Realloc(self->buf,
                                          capacity * sizeof(Py_UNICODE));
    if (resized == NULL) {
        PyErr_NoMemory();
        return -1;
    }

    self->buf = resized;
    self->buf_size = capacity;
    return 0;
}
/* Write 'len' characters from 'str' into the buffer of a StringIO object
   at its current position, growing the buffer when needed.  Returns the
   number of characters written, or -1 with an exception set on error. */
static Py_ssize_t
write_str(StringIOObject *self, const Py_UNICODE *str, Py_ssize_t len)
{
    Py_ssize_t end;

    assert(self->buf != NULL);
    assert(self->pos >= 0);
    assert(len >= 0);

    /* Not strictly required, but checking here keeps the arithmetic below
       free of mixed signed/unsigned comparisons. */
    if (self->pos > PY_SSIZE_T_MAX - len) {
        PyErr_SetString(PyExc_OverflowError,
                        "new position too large");
        return -1;
    }
    end = self->pos + len;

    if (end > self->string_size && resize_buffer(self, end) < 0)
        return -1;

    if (self->pos > self->string_size) {
        /* The position was moved past the end of the data: zero-fill the
           gap between the old end of the string and the write position.

           0   lo      string_size                           hi
           |   |<---used--->|<----------available----------->|
           |   |            <--to pad-->|<---to write--->    |
           0   buf                   position
        */
        memset(self->buf + self->string_size, '\0',
               (self->pos - self->string_size) * sizeof(Py_UNICODE));
    }

    /* Copy the new data in; this overwrites existing characters when the
       position lies inside the current string. */
    memcpy(self->buf + self->pos, str, len * sizeof(Py_UNICODE));
    self->pos = end;

    /* Extend the logical length if the write went past the old end. */
    if (self->string_size < end)
        self->string_size = end;

    return len;
}
/* getvalue(): return a new str holding the first string_size characters
   of the buffer. */
static PyObject *
stringio_getvalue(StringIOObject *self)
{
    const Py_UNICODE *data = self->buf;
    Py_ssize_t length = self->string_size;

    return PyUnicode_FromUnicode(data, length);
}
/* tell(): return the current stream position as a Python int. */
static PyObject *
stringio_tell(StringIOObject *self)
{
    Py_ssize_t position = self->pos;

    return PyLong_FromSsize_t(position);
}
/* read([size]): return up to 'size' characters starting at the current
   position and advance the position past them.  A missing, None or
   negative size reads everything up to the end of the string.
   Raises TypeError for a non-integer argument and propagates the
   OverflowError raised when the integer does not fit in a Py_ssize_t. */
static PyObject *
stringio_read(StringIOObject *self, PyObject *args)
{
    Py_ssize_t size, n;
    Py_UNICODE *output;
    PyObject *arg = Py_None;

    if (!PyArg_ParseTuple(args, "|O:read", &arg))
        return NULL;

    if (PyLong_Check(arg)) {
        size = PyLong_AsSsize_t(arg);
        /* -1 is also the error return value; without this check a failed
           conversion would be treated as "read until EOF" while leaving
           an exception set. */
        if (size == -1 && PyErr_Occurred())
            return NULL;
    }
    else if (arg == Py_None) {
        /* Read until EOF is reached, by default. */
        size = -1;
    }
    else {
        PyErr_Format(PyExc_TypeError, "integer argument expected, got '%s'",
                     Py_TYPE(arg)->tp_name);
        return NULL;
    }

    /* adjust invalid sizes */
    n = self->string_size - self->pos;
    if (size < 0 || size > n) {
        size = n;
        /* n is negative when the position lies past the end of the data. */
        if (size < 0)
            size = 0;
    }

    assert(self->buf != NULL);
    output = self->buf + self->pos;
    self->pos += size;

    return PyUnicode_FromUnicode(output, size);
}
/* truncate([size]): cut the string down to 'size' characters (default:
   the current position), move the position to 'size' and return it.
   Raises TypeError for a non-integer argument, ValueError for a negative
   size, and propagates the OverflowError raised when the integer does not
   fit in a Py_ssize_t. */
static PyObject *
stringio_truncate(StringIOObject *self, PyObject *args)
{
    Py_ssize_t size;
    PyObject *arg = Py_None;

    if (!PyArg_ParseTuple(args, "|O:truncate", &arg))
        return NULL;

    if (PyLong_Check(arg)) {
        size = PyLong_AsSsize_t(arg);
        /* -1 doubles as the error return value; report the conversion
           failure itself instead of a misleading "Negative size value". */
        if (size == -1 && PyErr_Occurred())
            return NULL;
    }
    else if (arg == Py_None) {
        /* Truncate to current position if no argument is passed. */
        size = self->pos;
    }
    else {
        PyErr_Format(PyExc_TypeError, "integer argument expected, got '%s'",
                     Py_TYPE(arg)->tp_name);
        return NULL;
    }

    if (size < 0) {
        PyErr_Format(PyExc_ValueError,
                     "Negative size value %zd", size);
        return NULL;
    }

    /* Only shrinking changes the stored data; a size at or beyond the
       current length just moves the position. */
    if (size < self->string_size) {
        self->string_size = size;
        if (resize_buffer(self, size) < 0)
            return NULL;
    }
    self->pos = size;

    return PyLong_FromSsize_t(size);
}
/* seek(pos[, whence]): set the stream position and return it.
     whence 0: pos is an absolute offset and must be >= 0;
     whence 1: keep the current position (pos must be 0);
     whence 2: move to the end of the string (pos must be 0).
   Raises ValueError for an unknown whence or a negative absolute offset,
   and IOError for a nonzero pos with whence 1 or 2. */
static PyObject *
stringio_seek(StringIOObject *self, PyObject *args)
{
    Py_ssize_t pos;
    int mode = 0;

    if (!PyArg_ParseTuple(args, "n|i:seek", &pos, &mode))
        return NULL;

    switch (mode) {
    case 0:
        /* Offset relative to the beginning of the string. */
        if (pos < 0) {
            PyErr_Format(PyExc_ValueError,
                         "Negative seek position %zd", pos);
            return NULL;
        }
        break;

    case 1:
    case 2:
        if (pos != 0) {
            PyErr_SetString(PyExc_IOError,
                            "Can't do nonzero cur-relative seeks");
            return NULL;
        }
        /* 1 leaves the position unchanged; 2 jumps to the end. */
        pos = (mode == 1) ? self->pos : self->string_size;
        break;

    default:
        PyErr_Format(PyExc_ValueError,
                     "Invalid whence (%i, should be 0, 1 or 2)", mode);
        return NULL;
    }

    self->pos = pos;
    return PyLong_FromSsize_t(self->pos);
}
/* write(s): store the str 's' at the current position and return the
   number of characters written.  Raises TypeError when 's' is not a
   str. */
static PyObject *
stringio_write(StringIOObject *self, PyObject *obj)
{
    const Py_UNICODE *text;
    Py_ssize_t length;
    Py_ssize_t written;

    if (!PyUnicode_Check(obj)) {
        PyErr_Format(PyExc_TypeError, "string argument expected, got '%s'",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    text = PyUnicode_AsUnicode(obj);
    length = PyUnicode_GetSize(obj);

    /* An empty string leaves the buffer and the position untouched. */
    if (length == 0)
        return PyLong_FromSsize_t(0);

    written = write_str(self, text, length);
    if (written < 0)
        return NULL;

    return PyLong_FromSsize_t(written);
}
/* tp_dealloc: release the character buffer, then the object itself. */
static void
stringio_dealloc(StringIOObject *self)
{
/* The buffer must be freed first: self is no longer usable once
   tp_free() has run. */
PyMem_Free(self->buf);
Py_TYPE(self)->tp_free(self);
}
/* tp_new: allocate a StringIO object with an empty string, position 0
   and a zero-capacity (but allocated) buffer.  The args and kwds
   parameters are not used.  Returns NULL with an exception set when an
   allocation fails. */
static PyObject *
stringio_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    StringIOObject *obj;

    assert(type != NULL && type->tp_alloc != NULL);

    obj = (StringIOObject *)type->tp_alloc(type, 0);
    if (obj == NULL)
        return NULL;

    obj->pos = 0;
    obj->string_size = 0;
    obj->buf_size = 0;

    /* Allocate even though the capacity is zero: resize_buffer() and
       write_str() assert that buf is never NULL. */
    obj->buf = (Py_UNICODE *)PyMem_Malloc(0);
    if (obj->buf != NULL)
        return (PyObject *)obj;

    Py_DECREF(obj);
    return PyErr_NoMemory();
}
/* Method table of _stringio._StringIO.  None of the entries carries a
   docstring. */
static struct PyMethodDef stringio_methods[] = {
    /* stringio_getvalue() takes no argument tuple, so it is registered as
       METH_NOARGS (like tell); with METH_VARARGS any stray positional
       arguments were silently accepted and ignored. */
    {"getvalue", (PyCFunction)stringio_getvalue, METH_NOARGS,  NULL},
    {"read",     (PyCFunction)stringio_read,     METH_VARARGS, NULL},
    {"tell",     (PyCFunction)stringio_tell,     METH_NOARGS,  NULL},
    {"truncate", (PyCFunction)stringio_truncate, METH_VARARGS, NULL},
    {"seek",     (PyCFunction)stringio_seek,     METH_VARARGS, NULL},
    {"write",    (PyCFunction)stringio_write,    METH_O,       NULL},
    {NULL, NULL}        /* sentinel */
};
/* Type object for _stringio._StringIO.  Only tp_dealloc, tp_flags,
   tp_methods and tp_new are filled in; every other slot is left at 0.
   Py_TPFLAGS_BASETYPE is set so the type can be subclassed. */
static PyTypeObject StringIO_Type = {
PyVarObject_HEAD_INIT(NULL, 0)
"_stringio._StringIO", /*tp_name*/
sizeof(StringIOObject), /*tp_basicsize*/
0, /*tp_itemsize*/
(destructor)stringio_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
0, /*tp_as_number*/
0, /*tp_as_sequence*/
0, /*tp_as_mapping*/
0, /*tp_hash*/
0, /*tp_call*/
0, /*tp_str*/
0, /*tp_getattro*/
0, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
0, /*tp_doc*/
0, /*tp_traverse*/
0, /*tp_clear*/
0, /*tp_richcompare*/
0, /*tp_weaklistoffset*/
0, /*tp_iter*/
0, /*tp_iternext*/
stringio_methods, /*tp_methods*/
0, /*tp_members*/
0, /*tp_getset*/
0, /*tp_base*/
0, /*tp_dict*/
0, /*tp_descr_get*/
0, /*tp_descr_set*/
0, /*tp_dictoffset*/
0, /*tp_init*/
0, /*tp_alloc*/
stringio_new, /*tp_new*/
};
/* Definition of the "_stringio" module, passed to PyModule_Create() by
   PyInit__stringio().  The initializer is positional.
   NOTE(review): the meaning of the -1 and of each NULL slot follows the
   PyModuleDef field order - confirm against the C API documentation. */
static struct PyModuleDef _stringiomodule = {
PyModuleDef_HEAD_INIT,
"_stringio",
NULL,
-1,
NULL,
NULL,
NULL,
NULL,
NULL
};
/* Module initialization: ready the StringIO type, create the module and
   expose the type as "_StringIO".  Returns the new module, or NULL with
   an exception set on failure. */
PyMODINIT_FUNC
PyInit__stringio(void)
{
    PyObject *m;

    if (PyType_Ready(&StringIO_Type) < 0)
        return NULL;

    m = PyModule_Create(&_stringiomodule);
    if (m == NULL)
        return NULL;

    /* PyModule_AddObject() takes over this reference only on success. */
    Py_INCREF(&StringIO_Type);
    if (PyModule_AddObject(m, "_StringIO", (PyObject *)&StringIO_Type) < 0) {
        /* On failure the reference is still ours, and so is the module:
           release both instead of leaking them. */
        Py_DECREF(&StringIO_Type);
        Py_DECREF(m);
        return NULL;
    }

    return m;
}

View File

@ -422,6 +422,7 @@ class PyBuildExt(build_ext):
exts.append( Extension("_functools", ["_functoolsmodule.c"]) )
# Memory-based IO accelerator modules
exts.append( Extension("_bytesio", ["_bytesio.c"]) )
exts.append( Extension("_stringio", ["_stringio.c"]) )
# C-optimized pickle replacement
exts.append( Extension("_pickle", ["_pickle.c"]) )
# atexit