bpo-28595: Allow shlex whitespace_split with punctuation_chars (GH-2071)

This commit is contained in:
Evan 2019-06-02 05:09:22 +10:00 committed by Vinay Sajip
parent 2b843ac0ae
commit 56624a99a9
3 changed files with 61 additions and 23 deletions

View File

@ -225,7 +225,8 @@ variables which either control lexical analysis or can be used for debugging:
appear in filename specifications and command line parameters, will also be appear in filename specifications and command line parameters, will also be
included in this attribute, and any characters which appear in included in this attribute, and any characters which appear in
``punctuation_chars`` will be removed from ``wordchars`` if they are present ``punctuation_chars`` will be removed from ``wordchars`` if they are present
there. there. If :attr:`whitespace_split` is set to ``True``, this attribute
will have no effect.
.. attribute:: shlex.whitespace .. attribute:: shlex.whitespace
@ -258,11 +259,13 @@ variables which either control lexical analysis or can be used for debugging:
If ``True``, tokens will only be split on whitespace. This is useful, for If ``True``, tokens will only be split on whitespace. This is useful, for
example, for parsing command lines with :class:`~shlex.shlex`, getting example, for parsing command lines with :class:`~shlex.shlex`, getting
tokens in a similar way to shell arguments. If this attribute is ``True``, tokens in a similar way to shell arguments. When used in combination with
:attr:`punctuation_chars` will have no effect, and splitting will happen :attr:`punctuation_chars`, tokens will be split on whitespace in addition to
only on whitespaces. When using :attr:`punctuation_chars`, which is those characters.
intended to provide parsing closer to that implemented by shells, it is
advisable to leave ``whitespace_split`` as ``False`` (the default value). .. versionchanged:: 3.8
The :attr:`punctuation_chars` attribute was made compatible with the
:attr:`whitespace_split` attribute.
.. attribute:: shlex.infile .. attribute:: shlex.infile
@ -398,12 +401,15 @@ otherwise. To illustrate, you can see the difference in the following snippet:
>>> import shlex >>> import shlex
>>> text = "a && b; c && d || e; f >'abc'; (def \"ghi\")" >>> text = "a && b; c && d || e; f >'abc'; (def \"ghi\")"
>>> list(shlex.shlex(text)) >>> s = shlex.shlex(text, posix=True)
['a', '&', '&', 'b', ';', 'c', '&', '&', 'd', '|', '|', 'e', ';', 'f', '>', >>> s.whitespace_split = True
"'abc'", ';', '(', 'def', '"ghi"', ')'] >>> list(s)
>>> list(shlex.shlex(text, punctuation_chars=True)) ['a', '&&', 'b;', 'c', '&&', 'd', '||', 'e;', 'f', '>abc;', '(def', 'ghi)']
['a', '&&', 'b', ';', 'c', '&&', 'd', '||', 'e', ';', 'f', '>', "'abc'", >>> s = shlex.shlex(text, posix=True, punctuation_chars=True)
';', '(', 'def', '"ghi"', ')'] >>> s.whitespace_split = True
>>> list(s)
['a', '&&', 'b', ';', 'c', '&&', 'd', '||', 'e', ';', 'f', '>', 'abc', ';',
'(', 'def', 'ghi', ')']
Of course, tokens will be returned which are not valid for shells, and you'll Of course, tokens will be returned which are not valid for shells, and you'll
need to implement your own error checks on the returned tokens. need to implement your own error checks on the returned tokens.
@ -428,6 +434,11 @@ which characters constitute punctuation. For example::
>>> list(s) >>> list(s)
['~/a', '&&', 'b-c', '--color=auto', '||', 'd', '*.py?'] ['~/a', '&&', 'b-c', '--color=auto', '||', 'd', '*.py?']
However, to match the shell as closely as possible, it is recommended to
always use ``posix`` and :attr:`~shlex.whitespace_split` when using
:attr:`~shlex.punctuation_chars`; setting :attr:`~shlex.whitespace_split`
negates :attr:`~shlex.wordchars` entirely.
For best effect, ``punctuation_chars`` should be set in conjunction with For best effect, ``punctuation_chars`` should be set in conjunction with
``posix=True``. (Note that ``posix=False`` is the default for ``posix=True``. (Note that ``posix=False`` is the default for
:class:`~shlex.shlex`.) :class:`~shlex.shlex`.)

View File

@ -246,7 +246,8 @@ class shlex:
escapedstate = 'a' escapedstate = 'a'
self.state = nextchar self.state = nextchar
elif (nextchar in self.wordchars or nextchar in self.quotes elif (nextchar in self.wordchars or nextchar in self.quotes
or self.whitespace_split): or (self.whitespace_split and
nextchar not in self.punctuation_chars)):
self.token += nextchar self.token += nextchar
else: else:
if self.punctuation_chars: if self.punctuation_chars:

View File

@ -1,4 +1,5 @@
import io import io
import itertools
import shlex import shlex
import string import string
import unittest import unittest
@ -183,10 +184,12 @@ class ShlexTest(unittest.TestCase):
src = ['echo hi %s echo bye' % delimiter, src = ['echo hi %s echo bye' % delimiter,
'echo hi%secho bye' % delimiter] 'echo hi%secho bye' % delimiter]
ref = ['echo', 'hi', delimiter, 'echo', 'bye'] ref = ['echo', 'hi', delimiter, 'echo', 'bye']
for ss in src: for ss, ws in itertools.product(src, (False, True)):
s = shlex.shlex(ss, punctuation_chars=True) s = shlex.shlex(ss, punctuation_chars=True)
s.whitespace_split = ws
result = list(s) result = list(s)
self.assertEqual(ref, result, "While splitting '%s'" % ss) self.assertEqual(ref, result,
"While splitting '%s' [ws=%s]" % (ss, ws))
def testSyntaxSplitSemicolon(self): def testSyntaxSplitSemicolon(self):
"""Test handling of syntax splitting of ;""" """Test handling of syntax splitting of ;"""
@ -197,10 +200,12 @@ class ShlexTest(unittest.TestCase):
'echo hi%s echo bye' % delimiter, 'echo hi%s echo bye' % delimiter,
'echo hi%secho bye' % delimiter] 'echo hi%secho bye' % delimiter]
ref = ['echo', 'hi', delimiter, 'echo', 'bye'] ref = ['echo', 'hi', delimiter, 'echo', 'bye']
for ss in src: for ss, ws in itertools.product(src, (False, True)):
s = shlex.shlex(ss, punctuation_chars=True) s = shlex.shlex(ss, punctuation_chars=True)
s.whitespace_split = ws
result = list(s) result = list(s)
self.assertEqual(ref, result, "While splitting '%s'" % ss) self.assertEqual(ref, result,
"While splitting '%s' [ws=%s]" % (ss, ws))
def testSyntaxSplitRedirect(self): def testSyntaxSplitRedirect(self):
"""Test handling of syntax splitting of >""" """Test handling of syntax splitting of >"""
@ -211,10 +216,11 @@ class ShlexTest(unittest.TestCase):
'echo hi%s out' % delimiter, 'echo hi%s out' % delimiter,
'echo hi%sout' % delimiter] 'echo hi%sout' % delimiter]
ref = ['echo', 'hi', delimiter, 'out'] ref = ['echo', 'hi', delimiter, 'out']
for ss in src: for ss, ws in itertools.product(src, (False, True)):
s = shlex.shlex(ss, punctuation_chars=True) s = shlex.shlex(ss, punctuation_chars=True)
result = list(s) result = list(s)
self.assertEqual(ref, result, "While splitting '%s'" % ss) self.assertEqual(ref, result,
"While splitting '%s' [ws=%s]" % (ss, ws))
def testSyntaxSplitParen(self): def testSyntaxSplitParen(self):
"""Test handling of syntax splitting of ()""" """Test handling of syntax splitting of ()"""
@ -222,18 +228,25 @@ class ShlexTest(unittest.TestCase):
src = ['( echo hi )', src = ['( echo hi )',
'(echo hi)'] '(echo hi)']
ref = ['(', 'echo', 'hi', ')'] ref = ['(', 'echo', 'hi', ')']
for ss in src: for ss, ws in itertools.product(src, (False, True)):
s = shlex.shlex(ss, punctuation_chars=True) s = shlex.shlex(ss, punctuation_chars=True)
s.whitespace_split = ws
result = list(s) result = list(s)
self.assertEqual(ref, result, "While splitting '%s'" % ss) self.assertEqual(ref, result,
"While splitting '%s' [ws=%s]" % (ss, ws))
def testSyntaxSplitCustom(self): def testSyntaxSplitCustom(self):
"""Test handling of syntax splitting with custom chars""" """Test handling of syntax splitting with custom chars"""
ref = ['~/a', '&', '&', 'b-c', '--color=auto', '||', 'd', '*.py?']
ss = "~/a&&b-c --color=auto||d *.py?" ss = "~/a&&b-c --color=auto||d *.py?"
ref = ['~/a', '&', '&', 'b-c', '--color=auto', '||', 'd', '*.py?']
s = shlex.shlex(ss, punctuation_chars="|") s = shlex.shlex(ss, punctuation_chars="|")
result = list(s) result = list(s)
self.assertEqual(ref, result, "While splitting '%s'" % ss) self.assertEqual(ref, result, "While splitting '%s' [ws=False]" % ss)
ref = ['~/a&&b-c', '--color=auto', '||', 'd', '*.py?']
s = shlex.shlex(ss, punctuation_chars="|")
s.whitespace_split = True
result = list(s)
self.assertEqual(ref, result, "While splitting '%s' [ws=True]" % ss)
def testTokenTypes(self): def testTokenTypes(self):
"""Test that tokens are split with types as expected.""" """Test that tokens are split with types as expected."""
@ -293,6 +306,19 @@ class ShlexTest(unittest.TestCase):
s = shlex.shlex("'')abc", punctuation_chars=True) s = shlex.shlex("'')abc", punctuation_chars=True)
self.assertEqual(list(s), expected) self.assertEqual(list(s), expected)
def testUnicodeHandling(self):
"""Test punctuation_chars and whitespace_split handle unicode."""
ss = "\u2119\u01b4\u2602\u210c\u00f8\u1f24"
# Should be parsed as one complete token (whitespace_split=True).
ref = ['\u2119\u01b4\u2602\u210c\u00f8\u1f24']
s = shlex.shlex(ss, punctuation_chars=True)
s.whitespace_split = True
self.assertEqual(list(s), ref)
# Without whitespace_split, uses wordchars and splits on all.
ref = ['\u2119', '\u01b4', '\u2602', '\u210c', '\u00f8', '\u1f24']
s = shlex.shlex(ss, punctuation_chars=True)
self.assertEqual(list(s), ref)
def testQuote(self): def testQuote(self):
safeunquoted = string.ascii_letters + string.digits + '@%_-+=:,./' safeunquoted = string.ascii_letters + string.digits + '@%_-+=:,./'
unicode_sample = '\xe9\xe0\xdf' # e + acute accent, a + grave, sharp s unicode_sample = '\xe9\xe0\xdf' # e + acute accent, a + grave, sharp s