Moved reader \r and \n processing from the iterator to the state machine -

this allows for better handling of newline characters in quoted fields (and hopefully resolves Bug 967934).
2005-01-13 11:30:54 +00:00 · 2005-01-13 11:30:54 +00:00 · f69d94f6c0
parent a1974c1459
commit f69d94f6c0
2 changed files with 78 additions and 99 deletions
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -48,10 +48,11 @@ Library
    dictates.
  + the parser now removes the escapechar prefix from escaped characters.
  + when quoting=QUOTE_NONNUMERIC, the writer now tests for numeric
-    objects, rather than attempting to cast to float, and using the
-    success of that as the determinator.
+    types, rather than any object that can be represented as a numeric.
  + when quoting=QUOTE_NONNUMERIC, the reader now casts unquoted fields
    to floats.
+  + reader now allows \r characters to be quoted (previously it only allowed
+    \n to be quoted).
  + writer doublequote handling improved.
  + Dialect classes passed to the module are no longer instantiated by
    the module before being parsed (the former validation scheme required
--- a/Modules/_csv.c
+++ b/Modules/_csv.c
@ -48,7 +48,8 @@ static long field_limit = 128 * 1024;	/* max parsed field size */

 typedef enum {
 	START_RECORD, START_FIELD, ESCAPED_CHAR, IN_FIELD, 
-	IN_QUOTED_FIELD, ESCAPE_IN_QUOTED_FIELD, QUOTE_IN_QUOTED_FIELD
+	IN_QUOTED_FIELD, ESCAPE_IN_QUOTED_FIELD, QUOTE_IN_QUOTED_FIELD,
+	EAT_CRNL
 } ParserState;

 typedef enum {
@ -96,7 +97,6 @@ typedef struct {
 	char *field;		/* build current field in here */
 	int field_size;		/* size of allocated buffer */
 	int field_len;		/* length of current field */
-	int had_parse_error;	/* did we have a parse error? */
 	int numeric_field;	/* treat field as numeric */
 	unsigned long line_num;	/* Source-file line number */
 } ReaderObj;
@ -497,6 +497,9 @@ _call_dialect(PyObject *dialect_inst, PyObject *kwargs)
 	return dialect;
 }

+/*
+ * READER
+ */
 static int
 parse_save_field(ReaderObj *self)
 {
@ -543,22 +546,6 @@ parse_grow_buff(ReaderObj *self)
 	return 1;
 }

-static int
-parse_reset(ReaderObj *self)
-{
-	if (self->fields) {
-		Py_DECREF(self->fields);
-	}
-	self->fields = PyList_New(0);
-	if (self->fields == NULL)
-		return -1;
-	self->field_len = 0;
-	self->state = START_RECORD;
-	self->had_parse_error = 0;
-	self->numeric_field = 0;
-	return 0;
-}
-
 static int
 parse_add_char(ReaderObj *self, char c)
 {
@ -581,19 +568,23 @@ parse_process_char(ReaderObj *self, char c)
 	switch (self->state) {
 	case START_RECORD:
 		/* start of record */
-		if (c == '\n')
+		if (c == '\0')
 			/* empty line - return [] */
 			break;
+		else if (c == '\n' || c == '\r') {
+			self->state = EAT_CRNL;
+			break;
+		}
 		/* normal character - handle as START_FIELD */
 		self->state = START_FIELD;
 		/* fallthru */
 	case START_FIELD:
 		/* expecting field */
-		if (c == '\n') {
+		if (c == '\n' || c == '\r' || c == '\0') {
 			/* save empty field - return [fields] */
 			if (parse_save_field(self) < 0)
 				return -1;
-			self->state = START_RECORD;
+			self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
 		}
 		else if (c == dialect->quotechar && 
 			 dialect->quoting != QUOTE_NONE) {
@ -623,6 +614,8 @@ parse_process_char(ReaderObj *self, char c)
 		break;

 	case ESCAPED_CHAR:
+		if (c == '\0')
+			c = '\n';
 		if (parse_add_char(self, c) < 0)
 			return -1;
 		self->state = IN_FIELD;
@ -630,11 +623,11 @@ parse_process_char(ReaderObj *self, char c)

 	case IN_FIELD:
 		/* in unquoted field */
-		if (c == '\n') {
+		if (c == '\n' || c == '\r' || c == '\0') {
 			/* end of line - return [fields] */
 			if (parse_save_field(self) < 0)
 				return -1;
-			self->state = START_RECORD;
+			self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
 		}
 		else if (c == dialect->escapechar) {
 			/* possible escaped character */
@ -655,11 +648,8 @@ parse_process_char(ReaderObj *self, char c)

 	case IN_QUOTED_FIELD:
 		/* in quoted field */
-		if (c == '\n') {
-			/* end of line - save '\n' in field */
-			if (parse_add_char(self, '\n') < 0)
-				return -1;
-		}
+		if (c == '\0')
+			;
 		else if (c == dialect->escapechar) {
 			/* Possible escape character */
 			self->state = ESCAPE_IN_QUOTED_FIELD;
@ -683,6 +673,8 @@ parse_process_char(ReaderObj *self, char c)
 		break;

 	case ESCAPE_IN_QUOTED_FIELD:
+		if (c == '\0')
+			c = '\n';
 		if (parse_add_char(self, c) < 0)
 			return -1;
 		self->state = IN_QUOTED_FIELD;
@ -703,11 +695,11 @@ parse_process_char(ReaderObj *self, char c)
 				return -1;
 			self->state = START_FIELD;
 		}
-		else if (c == '\n') {
+		else if (c == '\n' || c == '\r' || c == '\0') {
 			/* end of line - return [fields] */
 			if (parse_save_field(self) < 0)
 				return -1;
-			self->state = START_RECORD;
+			self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
 		}
 		else if (!dialect->strict) {
 			if (parse_add_char(self, c) < 0)
@ -716,7 +708,6 @@ parse_process_char(ReaderObj *self, char c)
 		}
 		else {
 			/* illegal */
-			self->had_parse_error = 1;
 			PyErr_Format(error_obj, "'%c' expected after '%c'", 
 					dialect->delimiter, 
                                        dialect->quotechar);
@ -724,104 +715,83 @@ parse_process_char(ReaderObj *self, char c)
 		}
 		break;

+	case EAT_CRNL:
+		if (c == '\n' || c == '\r')
+			;
+		else if (c == '\0')
+			self->state = START_RECORD;
+		else {
+			PyErr_Format(error_obj, "new-line character seen in unquoted field - do you need to open the file in universal-newline mode?");
+			return -1;
+		}
+		break;
+
 	}
 	return 0;
 }

-/*
- * READER
- */
-#define R_OFF(x) offsetof(ReaderObj, x)
-
-static struct PyMemberDef Reader_memberlist[] = {
-	{ "dialect", T_OBJECT, R_OFF(dialect), RO },
-	{ "line_num", T_ULONG, R_OFF(line_num), RO },
-	{ NULL }
-};
+static int
+parse_reset(ReaderObj *self)
+{
+	Py_XDECREF(self->fields);
+	self->fields = PyList_New(0);
+	if (self->fields == NULL)
+		return -1;
+	self->field_len = 0;
+	self->state = START_RECORD;
+	self->numeric_field = 0;
+	return 0;
+}

 static PyObject *
 Reader_iternext(ReaderObj *self)
 {
        PyObject *lineobj;
-        PyObject *fields;
-        char *line;
+        PyObject *fields = NULL;
+        char *line, c;
+	int linelen;

+	if (parse_reset(self) < 0)
+		return NULL;
        do {
                lineobj = PyIter_Next(self->input_iter);
                if (lineobj == NULL) {
                        /* End of input OR exception */
                        if (!PyErr_Occurred() && self->field_len != 0)
-                                return PyErr_Format(error_obj,
+                                PyErr_Format(error_obj,
 					     "newline inside string");
                        return NULL;
                }
 		++self->line_num;

-                if (self->had_parse_error)
-			if (parse_reset(self) < 0) {
-				Py_DECREF(lineobj);
-				return NULL;
-			}
                line = PyString_AsString(lineobj);
+		linelen = PyString_Size(lineobj);

-                if (line == NULL) {
+                if (line == NULL || linelen < 0) {
                        Py_DECREF(lineobj);
                        return NULL;
                }
-		if (strlen(line) < (size_t)PyString_GET_SIZE(lineobj)) {
-			self->had_parse_error = 1;
+                while (linelen--) {
+			c = *line++;
+			if (c == '\0') {
 				Py_DECREF(lineobj);
-			return PyErr_Format(error_obj,
-					    "string with NUL bytes");
-		}
-
-                /* Process line of text - send '\n' to processing code to
-                represent end of line.  End of line which is not at end of
-                string is an error. */
-                while (*line) {
-                        char c;
-
-                        c = *line++;
-                        if (c == '\r') {
-                                c = *line++;
-                                if (c == '\0')
-                                        /* macintosh end of line */
-                                        break;
-                                if (c == '\n') {
-                                        c = *line++;
-                                        if (c == '\0')
-                                                /* DOS end of line */
-                                                break;
-                                }
-                                self->had_parse_error = 1;
-                                Py_DECREF(lineobj);
-                                return PyErr_Format(error_obj,
-                                                    "newline inside string");
-                        }
-                        if (c == '\n') {
-                                c = *line++;
-                                if (c == '\0')
-                                        /* unix end of line */
-                                        break;
-                                self->had_parse_error = 1;
-                                Py_DECREF(lineobj);
-                                return PyErr_Format(error_obj, 
-                                                    "newline inside string");
+				PyErr_Format(error_obj,
+					     "line contains NULL byte");
+				goto err;
 			}
 			if (parse_process_char(self, c) < 0) {
 				Py_DECREF(lineobj);
-				return NULL;
+				goto err;
 			}
 		}
-		if (parse_process_char(self, '\n') < 0) {
-			Py_DECREF(lineobj);
-			return NULL;
-		}
                Py_DECREF(lineobj);
+		if (parse_process_char(self, 0) < 0)
+			goto err;
        } while (self->state != START_RECORD);

        fields = self->fields;
-        self->fields = PyList_New(0);
+        self->fields = NULL;
+err:
        return fields;
 }

@ -875,6 +845,14 @@ PyDoc_STRVAR(Reader_Type_doc,
 static struct PyMethodDef Reader_methods[] = {
 	{ NULL, NULL }
 };
+#define R_OFF(x) offsetof(ReaderObj, x)
+
+static struct PyMemberDef Reader_memberlist[] = {
+	{ "dialect", T_OBJECT, R_OFF(dialect), RO },
+	{ "line_num", T_ULONG, R_OFF(line_num), RO },
+	{ NULL }
+};
+

 static PyTypeObject Reader_Type = {
 	PyObject_HEAD_INIT(NULL)