gh-115712: Support CSV dialects with delimiter=' ' and skipinitialspace=True (GH-115721)

Restore support of such combination, disabled in gh-113796.

csv.writer() now quotes empty fields if delimiter is a space and
skipinitialspace is true and raises exception if quoting is not possible.
This commit is contained in:
Serhiy Storchaka 2024-02-20 18:09:50 +02:00 committed by GitHub
parent cc82e33af9
commit 937d282150
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 90 additions and 17 deletions

View File

@ -64,8 +64,7 @@ class Test_Csv(unittest.TestCase):
ctor(arg, delimiter='\t', skipinitialspace=True)
ctor(arg, escapechar='\t', skipinitialspace=True)
ctor(arg, quotechar='\t', skipinitialspace=True)
self.assertRaises(ValueError, ctor, arg,
delimiter=' ', skipinitialspace=True)
ctor(arg, delimiter=' ', skipinitialspace=True)
self.assertRaises(ValueError, ctor, arg,
escapechar=' ', skipinitialspace=True)
self.assertRaises(ValueError, ctor, arg,
@ -192,9 +191,6 @@ class Test_Csv(unittest.TestCase):
def test_write_arg_valid(self):
self._write_error_test(csv.Error, None)
self._write_test((), '')
self._write_test([None], '""')
self._write_error_test(csv.Error, [None], quoting = csv.QUOTE_NONE)
# Check that exceptions are passed up the chain
self._write_error_test(OSError, BadIterable())
class BadList:
@ -208,7 +204,6 @@ class Test_Csv(unittest.TestCase):
def __str__(self):
raise OSError
self._write_error_test(OSError, [BadItem()])
def test_write_bigfield(self):
# This exercises the buffer realloc functionality
bigstring = 'X' * 50000
@ -315,6 +310,49 @@ class Test_Csv(unittest.TestCase):
fileobj.seek(0)
self.assertEqual(fileobj.read(), 'a\r\n""\r\n')
def test_write_empty_fields(self):
self._write_test((), '')
self._write_test([''], '""')
self._write_error_test(csv.Error, [''], quoting=csv.QUOTE_NONE)
self._write_test([''], '""', quoting=csv.QUOTE_STRINGS)
self._write_test([''], '""', quoting=csv.QUOTE_NOTNULL)
self._write_test([None], '""')
self._write_error_test(csv.Error, [None], quoting=csv.QUOTE_NONE)
self._write_error_test(csv.Error, [None], quoting=csv.QUOTE_STRINGS)
self._write_error_test(csv.Error, [None], quoting=csv.QUOTE_NOTNULL)
self._write_test(['', ''], ',')
self._write_test([None, None], ',')
def test_write_empty_fields_space_delimiter(self):
self._write_test([''], '""', delimiter=' ', skipinitialspace=False)
self._write_test([''], '""', delimiter=' ', skipinitialspace=True)
self._write_test([None], '""', delimiter=' ', skipinitialspace=False)
self._write_test([None], '""', delimiter=' ', skipinitialspace=True)
self._write_test(['', ''], ' ', delimiter=' ', skipinitialspace=False)
self._write_test(['', ''], '"" ""', delimiter=' ', skipinitialspace=True)
self._write_test([None, None], ' ', delimiter=' ', skipinitialspace=False)
self._write_test([None, None], '"" ""', delimiter=' ', skipinitialspace=True)
self._write_test(['', ''], ' ', delimiter=' ', skipinitialspace=False,
quoting=csv.QUOTE_NONE)
self._write_error_test(csv.Error, ['', ''],
delimiter=' ', skipinitialspace=True,
quoting=csv.QUOTE_NONE)
for quoting in csv.QUOTE_STRINGS, csv.QUOTE_NOTNULL:
self._write_test(['', ''], '"" ""', delimiter=' ', skipinitialspace=False,
quoting=quoting)
self._write_test(['', ''], '"" ""', delimiter=' ', skipinitialspace=True,
quoting=quoting)
for quoting in csv.QUOTE_NONE, csv.QUOTE_STRINGS, csv.QUOTE_NOTNULL:
self._write_test([None, None], ' ', delimiter=' ', skipinitialspace=False,
quoting=quoting)
self._write_error_test(csv.Error, [None, None],
delimiter=' ', skipinitialspace=True,
quoting=quoting)
def test_writerows_errors(self):
with TemporaryFile("w+", encoding="utf-8", newline='') as fileobj:
writer = csv.writer(fileobj)
@ -429,6 +467,14 @@ class Test_Csv(unittest.TestCase):
[[None, None, None]],
skipinitialspace=True, quoting=csv.QUOTE_STRINGS)
def test_read_space_delimiter(self):
self._read_test(['a b', ' a ', ' ', ''],
[['a', '', '', 'b'], ['', '', 'a', '', ''], ['', '', ''], []],
delimiter=' ', skipinitialspace=False)
self._read_test(['a b', ' a ', ' ', ''],
[['a', 'b'], ['a', ''], [''], []],
delimiter=' ', skipinitialspace=True)
def test_read_bigfield(self):
# This exercises the buffer realloc functionality and field size
# limits.
@ -555,10 +601,10 @@ class TestDialectRegistry(unittest.TestCase):
escapechar = "\\"
with TemporaryFile("w+", encoding="utf-8") as fileobj:
fileobj.write("abc def\nc1ccccc1 benzene\n")
fileobj.write("abc def\nc1ccccc1 benzene\n")
fileobj.seek(0)
reader = csv.reader(fileobj, dialect=space())
self.assertEqual(next(reader), ["abc", "def"])
self.assertEqual(next(reader), ["abc", "", "", "def"])
self.assertEqual(next(reader), ["c1ccccc1", "benzene"])
def compare_dialect_123(self, expected, *writeargs, **kwwriteargs):
@ -1164,8 +1210,9 @@ class TestDialectValidity(unittest.TestCase):
self.assertRaises(csv.Error, create_invalid, field_name, 5)
self.assertRaises(ValueError, create_invalid, field_name, "\n")
self.assertRaises(ValueError, create_invalid, field_name, "\r")
self.assertRaises(ValueError, create_invalid, field_name, " ",
skipinitialspace=True)
if field_name != "delimiter":
self.assertRaises(ValueError, create_invalid, field_name, " ",
skipinitialspace=True)
class TestSniffer(unittest.TestCase):

View File

@ -0,0 +1,4 @@
Restore support of space delimiter with ``skipinitialspace=True`` in
:mod:`csv`. :func:`csv.writer()` now quotes empty fields if delimiter is a
space and skipinitialspace is true and raises exception if quoting is not
possible.

View File

@ -332,9 +332,9 @@ dialect_check_quoting(int quoting)
}
static int
dialect_check_char(const char *name, Py_UCS4 c, DialectObj *dialect)
dialect_check_char(const char *name, Py_UCS4 c, DialectObj *dialect, bool allowspace)
{
if (c == '\r' || c == '\n' || (dialect->skipinitialspace && c == ' ')) {
if (c == '\r' || c == '\n' || (c == ' ' && !allowspace)) {
PyErr_Format(PyExc_ValueError, "bad %s value", name);
return -1;
}
@ -535,9 +535,11 @@ dialect_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
PyErr_SetString(PyExc_TypeError, "lineterminator must be set");
goto err;
}
if (dialect_check_char("delimiter", self->delimiter, self) ||
dialect_check_char("escapechar", self->escapechar, self) ||
dialect_check_char("quotechar", self->quotechar, self) ||
if (dialect_check_char("delimiter", self->delimiter, self, true) ||
dialect_check_char("escapechar", self->escapechar, self,
!self->skipinitialspace) ||
dialect_check_char("quotechar", self->quotechar, self,
!self->skipinitialspace) ||
dialect_check_chars("delimiter", "escapechar",
self->delimiter, self->escapechar) ||
dialect_check_chars("delimiter", "quotechar",
@ -1221,6 +1223,7 @@ join_check_rec_size(WriterObj *self, Py_ssize_t rec_len)
static int
join_append(WriterObj *self, PyObject *field, int quoted)
{
DialectObj *dialect = self->dialect;
int field_kind = -1;
const void *field_data = NULL;
Py_ssize_t field_len = 0;
@ -1231,6 +1234,19 @@ join_append(WriterObj *self, PyObject *field, int quoted)
field_data = PyUnicode_DATA(field);
field_len = PyUnicode_GET_LENGTH(field);
}
if (!field_len && dialect->delimiter == ' ' && dialect->skipinitialspace) {
if (dialect->quoting == QUOTE_NONE ||
(field == NULL &&
(dialect->quoting == QUOTE_STRINGS ||
dialect->quoting == QUOTE_NOTNULL)))
{
PyErr_Format(self->error_obj,
"empty field must be quoted if delimiter is a space "
"and skipinitialspace is true");
return 0;
}
quoted = 1;
}
rec_len = join_append_data(self, field_kind, field_data, field_len,
&quoted, 0);
if (rec_len < 0)
@ -1282,6 +1298,7 @@ csv_writerow(WriterObj *self, PyObject *seq)
{
DialectObj *dialect = self->dialect;
PyObject *iter, *field, *line, *result;
bool null_field = false;
iter = PyObject_GetIter(seq);
if (iter == NULL) {
@ -1318,11 +1335,12 @@ csv_writerow(WriterObj *self, PyObject *seq)
break;
}
null_field = (field == Py_None);
if (PyUnicode_Check(field)) {
append_ok = join_append(self, field, quoted);
Py_DECREF(field);
}
else if (field == Py_None) {
else if (null_field) {
append_ok = join_append(self, NULL, quoted);
Py_DECREF(field);
}
@ -1348,7 +1366,11 @@ csv_writerow(WriterObj *self, PyObject *seq)
return NULL;
if (self->num_fields > 0 && self->rec_len == 0) {
if (dialect->quoting == QUOTE_NONE) {
if (dialect->quoting == QUOTE_NONE ||
(null_field &&
(dialect->quoting == QUOTE_STRINGS ||
dialect->quoting == QUOTE_NOTNULL)))
{
PyErr_Format(self->error_obj,
"single empty field record must be quoted");
return NULL;