From cb30f97bd3bdea2e884e8faec23751b39db4c0b3 Mon Sep 17 00:00:00 2001 From: David Goodger Date: Tue, 4 Apr 2006 03:05:44 +0000 Subject: [PATCH] added another example of Unicode CSV parsing; reworked the example text a bit; corrected notice in the intro and added a link to the examples --- Doc/lib/libcsv.tex | 57 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 9 deletions(-) diff --git a/Doc/lib/libcsv.tex b/Doc/lib/libcsv.tex index ba0df4fcd52..d2203451250 100644 --- a/Doc/lib/libcsv.tex +++ b/Doc/lib/libcsv.tex @@ -33,8 +33,9 @@ form using the \class{DictReader} and \class{DictWriter} classes. \begin{notice} This version of the \module{csv} module doesn't support Unicode input. Also, there are currently some issues regarding \ASCII{} NUL - characters. Accordingly, all input should generally be printable - \ASCII{} to be safe. These restrictions will be removed in the future. + characters. Accordingly, all input should be UTF-8 or printable + \ASCII{} to be safe; see the examples in section~\ref{csv-examples}. + These restrictions will be removed in the future. \end{notice} \begin{seealso} @@ -365,7 +366,7 @@ A read-only description of the dialect in use by the writer. -\subsection{Examples} +\subsection{Examples\label{csv-examples}} The simplest example of reading a CSV file: @@ -426,14 +427,49 @@ for row in csv.reader(['one,two,three']): \end{verbatim} The \module{csv} module doesn't directly support reading and writing -Unicode, but it is 8-bit clean save for some problems with \ASCII{} NUL -characters, so you can write classes that handle the encoding and decoding -for you as long as you avoid encodings like utf-16 that use NULs: +Unicode, but it is 8-bit-clean save for some problems with \ASCII{} NUL +characters. So you can write functions or classes that handle the +encoding and decoding for you as long as you avoid encodings like +UTF-16 that use NULs. UTF-8 is recommended. + +\function{unicode_csv_reader} below is a generator that wraps +\class{csv.reader} to handle Unicode CSV data (a list of Unicode +strings). \function{utf_8_encoder} is a generator that encodes the +Unicode strings as UTF-8, one string (or row) at a time. The encoded +strings are parsed by the CSV reader, and +\function{unicode_csv_reader} decodes the UTF-8-encoded cells back +into Unicode: + +\begin{verbatim} +import csv + +def unicode_csv_reader(unicode_csv_data, dialect=csv.excel, **kwargs): + # csv.py doesn't do Unicode; encode temporarily as UTF-8: + csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), + dialect=dialect, **kwargs) + for row in csv_reader: + # decode UTF-8 back to Unicode, cell by cell: + yield [unicode(cell, 'utf-8') for cell in row] + +def utf_8_encoder(unicode_csv_data): + for line in unicode_csv_data: + yield line.encode('utf-8') +\end{verbatim} + +The classes below work just like the \class{csv.reader} and +\class{csv.writer} classes, but they add an \var{encoding} parameter +to allow for encoded files: \begin{verbatim} import csv class UnicodeReader: + + """ + A CSV reader which will iterate over lines in the CSV file "f", + which is encoded in the given encoding. + """ + def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): self.reader = csv.reader(f, dialect=dialect, **kwds) self.encoding = encoding @@ -446,6 +482,12 @@ class UnicodeReader: return self class UnicodeWriter: + + """ + A CSV writer which will write rows to CSV file "f", + which is encoded in the given encoding. + """ + def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): self.writer = csv.writer(f, dialect=dialect, **kwds) self.encoding = encoding @@ -457,6 +499,3 @@ class UnicodeWriter: for row in rows: self.writerow(row) \end{verbatim} - -They should work just like the \class{csv.reader} and \class{csv.writer} -classes but add an \var{encoding} parameter.