mirror of https://github.com/python/cpython
bpo-27580: Add support of null characters in the csv module. (GH-28808)
This commit is contained in:
parent
b4903afd4d
commit
b454e8e4df
|
@ -217,6 +217,17 @@ class Test_Csv(unittest.TestCase):
|
|||
self._write_test(['C\\', '6', '7', 'X"'], 'C\\\\,6,7,"X"""',
|
||||
escapechar='\\', quoting=csv.QUOTE_MINIMAL)
|
||||
|
||||
def test_write_lineterminator(self):
    """Check that csv.writer ends every row with the given lineterminator.

    Exercises the conventional line endings, a multi-character
    terminator and a lone NUL character.
    """
    terminators = ('\r\n', '\n', '\r', '!@#', '\0')
    for terminator in terminators:
        # One sub-test per terminator; the buffer is closed before the
        # sub-test context exits, exactly as with nested ``with`` blocks.
        with self.subTest(lineterminator=terminator), StringIO() as sio:
            out = csv.writer(sio, lineterminator=terminator)
            out.writerow(['a', 'b'])
            out.writerow([1, 2])
            expected = f'a,b{terminator}1,2{terminator}'
            self.assertEqual(sio.getvalue(), expected)
|
||||
|
||||
def test_write_iterable(self):
|
||||
self._write_test(iter(['a', 1, 'p,q']), 'a,1,"p,q"')
|
||||
self._write_test(iter(['a', 1, None]), 'a,1,')
|
||||
|
@ -286,14 +297,10 @@ class Test_Csv(unittest.TestCase):
|
|||
self._read_test([''], [[]])
|
||||
self.assertRaises(csv.Error, self._read_test,
|
||||
['"ab"c'], None, strict = 1)
|
||||
# cannot handle null bytes for the moment
|
||||
self.assertRaises(csv.Error, self._read_test,
|
||||
['ab\0c'], None, strict = 1)
|
||||
self._read_test(['"ab"c'], [['abc']], doublequote = 0)
|
||||
|
||||
self.assertRaises(csv.Error, self._read_test,
|
||||
[b'ab\0c'], None)
|
||||
|
||||
[b'abc'], None)
|
||||
|
||||
def test_read_eol(self):
|
||||
self._read_test(['a,b'], [['a','b']])
|
||||
|
@ -313,6 +320,18 @@ class Test_Csv(unittest.TestCase):
|
|||
self.assertRaises(csv.Error, self._read_test,
|
||||
['^'], [], escapechar='^', strict=True)
|
||||
|
||||
def test_read_nul(self):
    """Check that NUL characters in the input are kept as field data.

    The cases place the NUL alone on a line, at the start and end of an
    unquoted field, after an escape character, and inside a quoted field.
    """
    cases = (
        # (input lines, expected rows, extra reader options)
        (['\0'], [['\0']], {}),
        (['a,\0b,c'], [['a', '\0b', 'c']], {}),
        (['a,b\0,c'], [['a', 'b\0', 'c']], {}),
        (['a,b\\\0,c'], [['a', 'b\0', 'c']], {'escapechar': '\\'}),
        (['a,"\0b",c'], [['a', '\0b', 'c']], {}),
    )
    for lines, rows, options in cases:
        self._read_test(lines, rows, **options)
|
||||
|
||||
def test_read_delimiter(self):
    """Check field splitting with the default, ';' and NUL delimiters."""
    cases = (
        # (input lines, expected rows, extra reader options)
        # The first case deliberately passes no delimiter at all.
        (['a,b,c'], [['a', 'b', 'c']], {}),
        (['a;b;c'], [['a', 'b', 'c']], {'delimiter': ';'}),
        (['a\0b\0c'], [['a', 'b', 'c']], {'delimiter': '\0'}),
    )
    for lines, rows, options in cases:
        self._read_test(lines, rows, **options)
|
||||
|
||||
def test_read_escape(self):
|
||||
self._read_test(['a,\\b,c'], [['a', 'b', 'c']], escapechar='\\')
|
||||
self._read_test(['a,b\\,c'], [['a', 'b,c']], escapechar='\\')
|
||||
|
@ -320,6 +339,11 @@ class Test_Csv(unittest.TestCase):
|
|||
self._read_test(['a,"b,\\c"'], [['a', 'b,c']], escapechar='\\')
|
||||
self._read_test(['a,"b,c\\""'], [['a', 'b,c"']], escapechar='\\')
|
||||
self._read_test(['a,"b,c"\\'], [['a', 'b,c\\']], escapechar='\\')
|
||||
self._read_test(['a,^b,c'], [['a', 'b', 'c']], escapechar='^')
|
||||
self._read_test(['a,\0b,c'], [['a', 'b', 'c']], escapechar='\0')
|
||||
self._read_test(['a,\\b,c'], [['a', '\\b', 'c']], escapechar=None)
|
||||
self._read_test(['a,\\b,c'], [['a', '\\b', 'c']], escapechar='')
|
||||
self._read_test(['a,\\b,c'], [['a', '\\b', 'c']])
|
||||
|
||||
def test_read_quoting(self):
|
||||
self._read_test(['1,",3,",5'], [['1', ',3,', '5']])
|
||||
|
@ -334,6 +358,8 @@ class Test_Csv(unittest.TestCase):
|
|||
self.assertRaises(ValueError, self._read_test,
|
||||
['abc,3'], [[]],
|
||||
quoting=csv.QUOTE_NONNUMERIC)
|
||||
self._read_test(['1,@,3,@,5'], [['1', ',3,', '5']], quotechar='@')
|
||||
self._read_test(['1,\0,3,\0,5'], [['1', ',3,', '5']], quotechar='\0')
|
||||
|
||||
def test_read_bigfield(self):
|
||||
# This exercises the buffer realloc functionality and field size
|
||||
|
@ -1074,6 +1100,12 @@ Stonecutters Seafood and Chop House+ Lemont+ IL+ 12/19/02+ Week Back
|
|||
a,b
|
||||
""")
|
||||
|
||||
sample14 = """\
|
||||
abc\0def
|
||||
ghijkl\0mno
|
||||
ghi\0jkl
|
||||
"""
|
||||
|
||||
def test_issue43625(self):
|
||||
sniffer = csv.Sniffer()
|
||||
self.assertTrue(sniffer.has_header(self.sample12))
|
||||
|
@ -1142,6 +1174,8 @@ Stonecutters Seafood and Chop House+ Lemont+ IL+ 12/19/02+ Week Back
|
|||
dialect = sniffer.sniff(self.sample9)
|
||||
self.assertEqual(dialect.delimiter, '+')
|
||||
self.assertEqual(dialect.quotechar, "'")
|
||||
dialect = sniffer.sniff(self.sample14)
|
||||
self.assertEqual(dialect.delimiter, '\0')
|
||||
|
||||
def test_doublequote(self):
|
||||
sniffer = csv.Sniffer()
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
Add support for null characters in the :mod:`csv` module.
|
|
@ -14,6 +14,9 @@ module instead.
|
|||
#include "structmember.h" // PyMemberDef
|
||||
#include <stdbool.h>
|
||||
|
||||
#define NOT_SET ((Py_UCS4)-1)
|
||||
#define EOL ((Py_UCS4)-2)
|
||||
|
||||
|
||||
typedef struct {
|
||||
PyObject *error_obj; /* CSV exception */
|
||||
|
@ -153,9 +156,9 @@ get_dialect_from_registry(PyObject *name_obj, _csvstate *module_state)
|
|||
}
|
||||
|
||||
static PyObject *
|
||||
get_nullchar_as_None(Py_UCS4 c)
|
||||
get_char_or_None(Py_UCS4 c)
|
||||
{
|
||||
if (c == '\0') {
|
||||
if (c == NOT_SET) {
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
else
|
||||
|
@ -172,19 +175,19 @@ Dialect_get_lineterminator(DialectObj *self, void *Py_UNUSED(ignored))
|
|||
static PyObject *
|
||||
Dialect_get_delimiter(DialectObj *self, void *Py_UNUSED(ignored))
|
||||
{
|
||||
return get_nullchar_as_None(self->delimiter);
|
||||
return get_char_or_None(self->delimiter);
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
Dialect_get_escapechar(DialectObj *self, void *Py_UNUSED(ignored))
|
||||
{
|
||||
return get_nullchar_as_None(self->escapechar);
|
||||
return get_char_or_None(self->escapechar);
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
Dialect_get_quotechar(DialectObj *self, void *Py_UNUSED(ignored))
|
||||
{
|
||||
return get_nullchar_as_None(self->quotechar);
|
||||
return get_char_or_None(self->quotechar);
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
|
@ -235,7 +238,7 @@ _set_char_or_none(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt
|
|||
*target = dflt;
|
||||
}
|
||||
else {
|
||||
*target = '\0';
|
||||
*target = NOT_SET;
|
||||
if (src != Py_None) {
|
||||
if (!PyUnicode_Check(src)) {
|
||||
PyErr_Format(PyExc_TypeError,
|
||||
|
@ -254,7 +257,7 @@ _set_char_or_none(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt
|
|||
return -1;
|
||||
}
|
||||
/* PyUnicode_READY() is called in PyUnicode_GetLength() */
|
||||
else {
|
||||
else if (len > 0) {
|
||||
*target = PyUnicode_READ_CHAR(src, 0);
|
||||
}
|
||||
}
|
||||
|
@ -269,7 +272,7 @@ _set_char(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt)
|
|||
*target = dflt;
|
||||
}
|
||||
else {
|
||||
*target = '\0';
|
||||
*target = NOT_SET;
|
||||
if (!PyUnicode_Check(src)) {
|
||||
PyErr_Format(PyExc_TypeError,
|
||||
"\"%s\" must be string, not %.200s", name,
|
||||
|
@ -287,7 +290,7 @@ _set_char(const char *name, Py_UCS4 *target, PyObject *src, Py_UCS4 dflt)
|
|||
return -1;
|
||||
}
|
||||
/* PyUnicode_READY() is called in PyUnicode_GetLength() */
|
||||
else {
|
||||
else if (len > 0) {
|
||||
*target = PyUnicode_READ_CHAR(src, 0);
|
||||
}
|
||||
}
|
||||
|
@ -481,7 +484,7 @@ dialect_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
|
|||
goto err
|
||||
DIASET(_set_char, "delimiter", &self->delimiter, delimiter, ',');
|
||||
DIASET(_set_bool, "doublequote", &self->doublequote, doublequote, true);
|
||||
DIASET(_set_char_or_none, "escapechar", &self->escapechar, escapechar, 0);
|
||||
DIASET(_set_char_or_none, "escapechar", &self->escapechar, escapechar, NOT_SET);
|
||||
DIASET(_set_str, "lineterminator", &self->lineterminator, lineterminator, "\r\n");
|
||||
DIASET(_set_char_or_none, "quotechar", &self->quotechar, quotechar, '"');
|
||||
DIASET(_set_int, "quoting", &self->quoting, quoting, QUOTE_MINIMAL);
|
||||
|
@ -491,19 +494,19 @@ dialect_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
|
|||
/* validate options */
|
||||
if (dialect_check_quoting(self->quoting))
|
||||
goto err;
|
||||
if (self->delimiter == 0) {
|
||||
if (self->delimiter == NOT_SET) {
|
||||
PyErr_SetString(PyExc_TypeError,
|
||||
"\"delimiter\" must be a 1-character string");
|
||||
goto err;
|
||||
}
|
||||
if (quotechar == Py_None && quoting == NULL)
|
||||
self->quoting = QUOTE_NONE;
|
||||
if (self->quoting != QUOTE_NONE && self->quotechar == 0) {
|
||||
if (self->quoting != QUOTE_NONE && self->quotechar == NOT_SET) {
|
||||
PyErr_SetString(PyExc_TypeError,
|
||||
"quotechar must be set if quoting enabled");
|
||||
goto err;
|
||||
}
|
||||
if (self->lineterminator == 0) {
|
||||
if (self->lineterminator == NULL) {
|
||||
PyErr_SetString(PyExc_TypeError, "lineterminator must be set");
|
||||
goto err;
|
||||
}
|
||||
|
@ -670,7 +673,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
|
|||
switch (self->state) {
|
||||
case START_RECORD:
|
||||
/* start of record */
|
||||
if (c == '\0')
|
||||
if (c == EOL)
|
||||
/* empty line - return [] */
|
||||
break;
|
||||
else if (c == '\n' || c == '\r') {
|
||||
|
@ -682,11 +685,11 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
|
|||
/* fallthru */
|
||||
case START_FIELD:
|
||||
/* expecting field */
|
||||
if (c == '\n' || c == '\r' || c == '\0') {
|
||||
if (c == '\n' || c == '\r' || c == EOL) {
|
||||
/* save empty field - return [fields] */
|
||||
if (parse_save_field(self) < 0)
|
||||
return -1;
|
||||
self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
|
||||
self->state = (c == EOL ? START_RECORD : EAT_CRNL);
|
||||
}
|
||||
else if (c == dialect->quotechar &&
|
||||
dialect->quoting != QUOTE_NONE) {
|
||||
|
@ -722,7 +725,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
|
|||
self->state = AFTER_ESCAPED_CRNL;
|
||||
break;
|
||||
}
|
||||
if (c == '\0')
|
||||
if (c == EOL)
|
||||
c = '\n';
|
||||
if (parse_add_char(self, module_state, c) < 0)
|
||||
return -1;
|
||||
|
@ -730,17 +733,17 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
|
|||
break;
|
||||
|
||||
case AFTER_ESCAPED_CRNL:
|
||||
if (c == '\0')
|
||||
if (c == EOL)
|
||||
break;
|
||||
/*fallthru*/
|
||||
|
||||
case IN_FIELD:
|
||||
/* in unquoted field */
|
||||
if (c == '\n' || c == '\r' || c == '\0') {
|
||||
if (c == '\n' || c == '\r' || c == EOL) {
|
||||
/* end of line - return [fields] */
|
||||
if (parse_save_field(self) < 0)
|
||||
return -1;
|
||||
self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
|
||||
self->state = (c == EOL ? START_RECORD : EAT_CRNL);
|
||||
}
|
||||
else if (c == dialect->escapechar) {
|
||||
/* possible escaped character */
|
||||
|
@ -761,7 +764,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
|
|||
|
||||
case IN_QUOTED_FIELD:
|
||||
/* in quoted field */
|
||||
if (c == '\0')
|
||||
if (c == EOL)
|
||||
;
|
||||
else if (c == dialect->escapechar) {
|
||||
/* Possible escape character */
|
||||
|
@ -786,7 +789,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
|
|||
break;
|
||||
|
||||
case ESCAPE_IN_QUOTED_FIELD:
|
||||
if (c == '\0')
|
||||
if (c == EOL)
|
||||
c = '\n';
|
||||
if (parse_add_char(self, module_state, c) < 0)
|
||||
return -1;
|
||||
|
@ -808,11 +811,11 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
|
|||
return -1;
|
||||
self->state = START_FIELD;
|
||||
}
|
||||
else if (c == '\n' || c == '\r' || c == '\0') {
|
||||
else if (c == '\n' || c == '\r' || c == EOL) {
|
||||
/* end of line - return [fields] */
|
||||
if (parse_save_field(self) < 0)
|
||||
return -1;
|
||||
self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
|
||||
self->state = (c == EOL ? START_RECORD : EAT_CRNL);
|
||||
}
|
||||
else if (!dialect->strict) {
|
||||
if (parse_add_char(self, module_state, c) < 0)
|
||||
|
@ -831,7 +834,7 @@ parse_process_char(ReaderObj *self, _csvstate *module_state, Py_UCS4 c)
|
|||
case EAT_CRNL:
|
||||
if (c == '\n' || c == '\r')
|
||||
;
|
||||
else if (c == '\0')
|
||||
else if (c == EOL)
|
||||
self->state = START_RECORD;
|
||||
else {
|
||||
PyErr_Format(module_state->error_obj,
|
||||
|
@ -909,12 +912,6 @@ Reader_iternext(ReaderObj *self)
|
|||
linelen = PyUnicode_GET_LENGTH(lineobj);
|
||||
while (linelen--) {
|
||||
c = PyUnicode_READ(kind, data, pos);
|
||||
if (c == '\0') {
|
||||
Py_DECREF(lineobj);
|
||||
PyErr_Format(module_state->error_obj,
|
||||
"line contains NUL");
|
||||
goto err;
|
||||
}
|
||||
if (parse_process_char(self, module_state, c) < 0) {
|
||||
Py_DECREF(lineobj);
|
||||
goto err;
|
||||
|
@ -922,7 +919,7 @@ Reader_iternext(ReaderObj *self)
|
|||
pos++;
|
||||
}
|
||||
Py_DECREF(lineobj);
|
||||
if (parse_process_char(self, module_state, 0) < 0)
|
||||
if (parse_process_char(self, module_state, EOL) < 0)
|
||||
goto err;
|
||||
} while (self->state != START_RECORD);
|
||||
|
||||
|
@ -1127,7 +1124,7 @@ join_append_data(WriterObj *self, unsigned int field_kind, const void *field_dat
|
|||
*quoted = 1;
|
||||
}
|
||||
if (want_escape) {
|
||||
if (!dialect->escapechar) {
|
||||
if (dialect->escapechar == NOT_SET) {
|
||||
PyErr_Format(self->error_obj,
|
||||
"need to escape, but no escapechar set");
|
||||
return -1;
|
||||
|
|
Loading…
Reference in New Issue