#18155: Regex-escape delimiter, in case it is a regex special char.

Patch by Vajrasky Kok, with slight modification to the tests by me.
This commit is contained in:
R David Murray 2013-06-29 18:43:59 -04:00
parent 1d14246b77
commit 24dc75365e
4 changed files with 48 additions and 6 deletions

View File

@ -261,8 +261,9 @@ class Sniffer:
# if we see an extra quote between delimiters, we've got a
# double quoted format
dq_regexp = re.compile(r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
{'delim':delim, 'quote':quotechar}, re.MULTILINE)
dq_regexp = re.compile(
r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
{'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)

View File

@ -914,7 +914,7 @@ Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back
'Tommy''s Place':'Blue Island':'IL':'12/28/02':'Blue Sunday/White Crow'
'Stonecutters ''Seafood'' and Chop House':'Lemont':'IL':'12/19/02':'Week Back'
"""
header = '''\
header1 = '''\
"venue","city","state","date","performers"
'''
sample3 = '''\
@ -933,10 +933,35 @@ Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back
sample6 = "a|b|c\r\nd|e|f\r\n"
sample7 = "'a'|'b'|'c'\r\n'd'|e|f\r\n"
# Issue 18155: Use a delimiter that is a special char to regex:
header2 = '''\
"venue"+"city"+"state"+"date"+"performers"
'''
sample8 = """\
Harry's+ Arlington Heights+ IL+ 2/1/03+ Kimi Hayes
Shark City+ Glendale Heights+ IL+ 12/28/02+ Prezence
Tommy's Place+ Blue Island+ IL+ 12/28/02+ Blue Sunday/White Crow
Stonecutters Seafood and Chop House+ Lemont+ IL+ 12/19/02+ Week Back
"""
sample9 = """\
'Harry''s'+ Arlington Heights'+ 'IL'+ '2/1/03'+ 'Kimi Hayes'
'Shark City'+ Glendale Heights'+' IL'+ '12/28/02'+ 'Prezence'
'Tommy''s Place'+ Blue Island'+ 'IL'+ '12/28/02'+ 'Blue Sunday/White Crow'
'Stonecutters ''Seafood'' and Chop House'+ 'Lemont'+ 'IL'+ '12/19/02'+ 'Week Back'
"""
def test_has_header(self):
sniffer = csv.Sniffer()
self.assertEqual(sniffer.has_header(self.sample1), False)
self.assertEqual(sniffer.has_header(self.header+self.sample1), True)
self.assertEqual(sniffer.has_header(self.header1 + self.sample1),
True)
def test_has_header_regex_special_delimiter(self):
sniffer = csv.Sniffer()
self.assertEqual(sniffer.has_header(self.sample8), False)
self.assertEqual(sniffer.has_header(self.header2 + self.sample8),
True)
def test_sniff(self):
sniffer = csv.Sniffer()
@ -970,13 +995,24 @@ Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back
dialect = sniffer.sniff(self.sample7)
self.assertEqual(dialect.delimiter, "|")
self.assertEqual(dialect.quotechar, "'")
dialect = sniffer.sniff(self.sample8)
self.assertEqual(dialect.delimiter, '+')
dialect = sniffer.sniff(self.sample9)
self.assertEqual(dialect.delimiter, '+')
self.assertEqual(dialect.quotechar, "'")
def test_doublequote(self):
sniffer = csv.Sniffer()
dialect = sniffer.sniff(self.header)
dialect = sniffer.sniff(self.header1)
self.assertFalse(dialect.doublequote)
dialect = sniffer.sniff(self.header2)
self.assertFalse(dialect.doublequote)
dialect = sniffer.sniff(self.sample2)
self.assertTrue(dialect.doublequote)
dialect = sniffer.sniff(self.sample8)
self.assertFalse(dialect.doublequote)
dialect = sniffer.sniff(self.sample9)
self.assertTrue(dialect.doublequote)
if not hasattr(sys, "gettotalrefcount"):
if test_support.verbose: print "*** skipping leakage tests ***"

View File

@ -545,6 +545,7 @@ Jeff Knupp
Greg Kochanski
Damon Kohler
Marko Kohtala
Vajrasky Kok
Guido Kollerie
Peter A. Koren
Joseph Koshy

View File

@ -24,11 +24,15 @@ Core and Builtins
Library
-------
- Issue #18155: The csv module now correctly handles csv files that use
a delimiter character that has a special meaning in regexes, instead of
throwing an exception.
- Issue #18135: ssl.SSLSocket.write() now raises an OverflowError if the input
string is longer than 2 gigabytes. The ssl module does not support partial
write.
- Issue #18167: cgi.FieldStorage no more fails to handle multipart/form-data
- Issue #18167: cgi.FieldStorage no longer fails to handle multipart/form-data
when \r\n appears at end of 65535 bytes without other newlines.
- Issue #17403: urllib.parse.robotparser normalizes the urls before adding to