Fix a delimiter detection problem in sniffer. Sniffing "a|b|c\r\n" was
returning 'a' as the delimiter.  It now returns '|', but not because I
understood better what the code was supposed to do.  Would someone who
understands the idea behind _guess_delimiter() (see its docstring) check
whether my fallback choice is genuinely better than before, or whether it's
just serendipity that I picked the proper delimiter?
This commit is contained in:
Skip Montanaro 2005-12-30 05:09:48 +00:00
parent 0174dddc65
commit 39b29be8a6
2 changed files with 17 additions and 3 deletions

View File

@ -152,10 +152,13 @@ class Sniffer:
quotechar, delimiter, skipinitialspace = \
self._guess_quote_and_delimiter(sample, delimiters)
if delimiter is None:
if not delimiter:
delimiter, skipinitialspace = self._guess_delimiter(sample,
delimiters)
if not delimiter:
raise Error, "Could not determine delimiter"
class dialect(Dialect):
_name = "sniffed"
lineterminator = '\r\n'
@ -329,8 +332,12 @@ class Sniffer:
data[0].count("%c " % d))
return (d, skipinitialspace)
# finally, just return the first damn character in the list
delim = delims.keys()[0]
# nothing else indicates a preference, pick the character that
# dominates(?)
items = [(v,k) for (k,v) in delims.items()]
items.sort()
delim = items[-1][1]
skipinitialspace = (data[0].count(delim) ==
data[0].count("%c " % delim))
return (delim, skipinitialspace)

View File

@ -852,6 +852,8 @@ Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back
'''
sample5 = "aaa\tbbb\r\nAAA\t\r\nBBB\t\r\n"
sample6 = "a|b|c\r\nd|e|f\r\n"
sample7 = "'a'|'b'|'c'\r\n'd'|e|f\r\n"
def test_has_header(self):
sniffer = csv.Sniffer()
@ -882,6 +884,11 @@ Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back
self.assertEqual(dialect.delimiter, ";")
dialect = sniffer.sniff(self.sample5)
self.assertEqual(dialect.delimiter, "\t")
dialect = sniffer.sniff(self.sample6)
self.assertEqual(dialect.delimiter, "|")
dialect = sniffer.sniff(self.sample7)
self.assertEqual(dialect.delimiter, "|")
self.assertEqual(dialect.quotechar, "'")
if not hasattr(sys, "gettotalrefcount"):
if test_support.verbose: print "*** skipping leakage tests ***"