bpo-28595: Allow shlex whitespace_split with punctuation_chars (GH-2071)
This commit is contained in:
parent
2b843ac0ae
commit
56624a99a9
|
@ -225,7 +225,8 @@ variables which either control lexical analysis or can be used for debugging:
|
||||||
appear in filename specifications and command line parameters, will also be
|
appear in filename specifications and command line parameters, will also be
|
||||||
included in this attribute, and any characters which appear in
|
included in this attribute, and any characters which appear in
|
||||||
``punctuation_chars`` will be removed from ``wordchars`` if they are present
|
``punctuation_chars`` will be removed from ``wordchars`` if they are present
|
||||||
there.
|
there. If :attr:`whitespace_split` is set to ``True``, this will have no
|
||||||
|
effect.
|
||||||
|
|
||||||
|
|
||||||
.. attribute:: shlex.whitespace
|
.. attribute:: shlex.whitespace
|
||||||
|
@ -258,11 +259,13 @@ variables which either control lexical analysis or can be used for debugging:
|
||||||
|
|
||||||
If ``True``, tokens will only be split on whitespace. This is useful, for
|
If ``True``, tokens will only be split on whitespace. This is useful, for
|
||||||
example, for parsing command lines with :class:`~shlex.shlex`, getting
|
example, for parsing command lines with :class:`~shlex.shlex`, getting
|
||||||
tokens in a similar way to shell arguments. If this attribute is ``True``,
|
tokens in a similar way to shell arguments. When used in combination with
|
||||||
:attr:`punctuation_chars` will have no effect, and splitting will happen
|
:attr:`punctuation_chars`, tokens will be split on whitespace in addition to
|
||||||
only on whitespaces. When using :attr:`punctuation_chars`, which is
|
those characters.
|
||||||
intended to provide parsing closer to that implemented by shells, it is
|
|
||||||
advisable to leave ``whitespace_split`` as ``False`` (the default value).
|
.. versionchanged:: 3.8
|
||||||
|
The :attr:`punctuation_chars` attribute was made compatible with the
|
||||||
|
:attr:`whitespace_split` attribute.
|
||||||
|
|
||||||
|
|
||||||
.. attribute:: shlex.infile
|
.. attribute:: shlex.infile
|
||||||
|
@ -398,12 +401,15 @@ otherwise. To illustrate, you can see the difference in the following snippet:
|
||||||
|
|
||||||
>>> import shlex
|
>>> import shlex
|
||||||
>>> text = "a && b; c && d || e; f >'abc'; (def \"ghi\")"
|
>>> text = "a && b; c && d || e; f >'abc'; (def \"ghi\")"
|
||||||
>>> list(shlex.shlex(text))
|
>>> s = shlex.shlex(text, posix=True)
|
||||||
['a', '&', '&', 'b', ';', 'c', '&', '&', 'd', '|', '|', 'e', ';', 'f', '>',
|
>>> s.whitespace_split = True
|
||||||
"'abc'", ';', '(', 'def', '"ghi"', ')']
|
>>> list(s)
|
||||||
>>> list(shlex.shlex(text, punctuation_chars=True))
|
['a', '&&', 'b;', 'c', '&&', 'd', '||', 'e;', 'f', '>abc;', '(def', 'ghi)']
|
||||||
['a', '&&', 'b', ';', 'c', '&&', 'd', '||', 'e', ';', 'f', '>', "'abc'",
|
>>> s = shlex.shlex(text, posix=True, punctuation_chars=True)
|
||||||
';', '(', 'def', '"ghi"', ')']
|
>>> s.whitespace_split = True
|
||||||
|
>>> list(s)
|
||||||
|
['a', '&&', 'b', ';', 'c', '&&', 'd', '||', 'e', ';', 'f', '>', 'abc', ';',
|
||||||
|
'(', 'def', 'ghi', ')']
|
||||||
|
|
||||||
Of course, tokens will be returned which are not valid for shells, and you'll
|
Of course, tokens will be returned which are not valid for shells, and you'll
|
||||||
need to implement your own error checks on the returned tokens.
|
need to implement your own error checks on the returned tokens.
|
||||||
|
@ -428,6 +434,11 @@ which characters constitute punctuation. For example::
|
||||||
>>> list(s)
|
>>> list(s)
|
||||||
['~/a', '&&', 'b-c', '--color=auto', '||', 'd', '*.py?']
|
['~/a', '&&', 'b-c', '--color=auto', '||', 'd', '*.py?']
|
||||||
|
|
||||||
|
However, to match the shell as closely as possible, it is recommended to
|
||||||
|
always use ``posix`` and :attr:`~shlex.whitespace_split` when using
|
||||||
|
:attr:`~shlex.punctuation_chars`; with that combination the contents of
|
||||||
|
:attr:`~shlex.wordchars` are ignored entirely.
|
||||||
|
|
||||||
For best effect, ``punctuation_chars`` should be set in conjunction with
|
For best effect, ``punctuation_chars`` should be set in conjunction with
|
||||||
``posix=True``. (Note that ``posix=False`` is the default for
|
``posix=True``. (Note that ``posix=False`` is the default for
|
||||||
:class:`~shlex.shlex`.)
|
:class:`~shlex.shlex`.)
|
||||||
|
|
|
@ -246,7 +246,8 @@ class shlex:
|
||||||
escapedstate = 'a'
|
escapedstate = 'a'
|
||||||
self.state = nextchar
|
self.state = nextchar
|
||||||
elif (nextchar in self.wordchars or nextchar in self.quotes
|
elif (nextchar in self.wordchars or nextchar in self.quotes
|
||||||
or self.whitespace_split):
|
or (self.whitespace_split and
|
||||||
|
nextchar not in self.punctuation_chars)):
|
||||||
self.token += nextchar
|
self.token += nextchar
|
||||||
else:
|
else:
|
||||||
if self.punctuation_chars:
|
if self.punctuation_chars:
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
import io
|
import io
|
||||||
|
import itertools
|
||||||
import shlex
|
import shlex
|
||||||
import string
|
import string
|
||||||
import unittest
|
import unittest
|
||||||
|
@ -183,10 +184,12 @@ class ShlexTest(unittest.TestCase):
|
||||||
src = ['echo hi %s echo bye' % delimiter,
|
src = ['echo hi %s echo bye' % delimiter,
|
||||||
'echo hi%secho bye' % delimiter]
|
'echo hi%secho bye' % delimiter]
|
||||||
ref = ['echo', 'hi', delimiter, 'echo', 'bye']
|
ref = ['echo', 'hi', delimiter, 'echo', 'bye']
|
||||||
for ss in src:
|
for ss, ws in itertools.product(src, (False, True)):
|
||||||
s = shlex.shlex(ss, punctuation_chars=True)
|
s = shlex.shlex(ss, punctuation_chars=True)
|
||||||
|
s.whitespace_split = ws
|
||||||
result = list(s)
|
result = list(s)
|
||||||
self.assertEqual(ref, result, "While splitting '%s'" % ss)
|
self.assertEqual(ref, result,
|
||||||
|
"While splitting '%s' [ws=%s]" % (ss, ws))
|
||||||
|
|
||||||
def testSyntaxSplitSemicolon(self):
|
def testSyntaxSplitSemicolon(self):
|
||||||
"""Test handling of syntax splitting of ;"""
|
"""Test handling of syntax splitting of ;"""
|
||||||
|
@ -197,10 +200,12 @@ class ShlexTest(unittest.TestCase):
|
||||||
'echo hi%s echo bye' % delimiter,
|
'echo hi%s echo bye' % delimiter,
|
||||||
'echo hi%secho bye' % delimiter]
|
'echo hi%secho bye' % delimiter]
|
||||||
ref = ['echo', 'hi', delimiter, 'echo', 'bye']
|
ref = ['echo', 'hi', delimiter, 'echo', 'bye']
|
||||||
for ss in src:
|
for ss, ws in itertools.product(src, (False, True)):
|
||||||
s = shlex.shlex(ss, punctuation_chars=True)
|
s = shlex.shlex(ss, punctuation_chars=True)
|
||||||
|
s.whitespace_split = ws
|
||||||
result = list(s)
|
result = list(s)
|
||||||
self.assertEqual(ref, result, "While splitting '%s'" % ss)
|
self.assertEqual(ref, result,
|
||||||
|
"While splitting '%s' [ws=%s]" % (ss, ws))
|
||||||
|
|
||||||
def testSyntaxSplitRedirect(self):
|
def testSyntaxSplitRedirect(self):
|
||||||
"""Test handling of syntax splitting of >"""
|
"""Test handling of syntax splitting of >"""
|
||||||
|
@ -211,10 +216,11 @@ class ShlexTest(unittest.TestCase):
|
||||||
'echo hi%s out' % delimiter,
|
'echo hi%s out' % delimiter,
|
||||||
'echo hi%sout' % delimiter]
|
'echo hi%sout' % delimiter]
|
||||||
ref = ['echo', 'hi', delimiter, 'out']
|
ref = ['echo', 'hi', delimiter, 'out']
|
||||||
for ss in src:
|
for ss, ws in itertools.product(src, (False, True)):
|
||||||
s = shlex.shlex(ss, punctuation_chars=True)
|
s = shlex.shlex(ss, punctuation_chars=True)
|
||||||
result = list(s)
|
result = list(s)
|
||||||
self.assertEqual(ref, result, "While splitting '%s'" % ss)
|
self.assertEqual(ref, result,
|
||||||
|
"While splitting '%s' [ws=%s]" % (ss, ws))
|
||||||
|
|
||||||
def testSyntaxSplitParen(self):
|
def testSyntaxSplitParen(self):
|
||||||
"""Test handling of syntax splitting of ()"""
|
"""Test handling of syntax splitting of ()"""
|
||||||
|
@ -222,18 +228,25 @@ class ShlexTest(unittest.TestCase):
|
||||||
src = ['( echo hi )',
|
src = ['( echo hi )',
|
||||||
'(echo hi)']
|
'(echo hi)']
|
||||||
ref = ['(', 'echo', 'hi', ')']
|
ref = ['(', 'echo', 'hi', ')']
|
||||||
for ss in src:
|
for ss, ws in itertools.product(src, (False, True)):
|
||||||
s = shlex.shlex(ss, punctuation_chars=True)
|
s = shlex.shlex(ss, punctuation_chars=True)
|
||||||
|
s.whitespace_split = ws
|
||||||
result = list(s)
|
result = list(s)
|
||||||
self.assertEqual(ref, result, "While splitting '%s'" % ss)
|
self.assertEqual(ref, result,
|
||||||
|
"While splitting '%s' [ws=%s]" % (ss, ws))
|
||||||
|
|
||||||
def testSyntaxSplitCustom(self):
|
def testSyntaxSplitCustom(self):
|
||||||
"""Test handling of syntax splitting with custom chars"""
|
"""Test handling of syntax splitting with custom chars"""
|
||||||
ref = ['~/a', '&', '&', 'b-c', '--color=auto', '||', 'd', '*.py?']
|
|
||||||
ss = "~/a&&b-c --color=auto||d *.py?"
|
ss = "~/a&&b-c --color=auto||d *.py?"
|
||||||
|
ref = ['~/a', '&', '&', 'b-c', '--color=auto', '||', 'd', '*.py?']
|
||||||
s = shlex.shlex(ss, punctuation_chars="|")
|
s = shlex.shlex(ss, punctuation_chars="|")
|
||||||
result = list(s)
|
result = list(s)
|
||||||
self.assertEqual(ref, result, "While splitting '%s'" % ss)
|
self.assertEqual(ref, result, "While splitting '%s' [ws=False]" % ss)
|
||||||
|
ref = ['~/a&&b-c', '--color=auto', '||', 'd', '*.py?']
|
||||||
|
s = shlex.shlex(ss, punctuation_chars="|")
|
||||||
|
s.whitespace_split = True
|
||||||
|
result = list(s)
|
||||||
|
self.assertEqual(ref, result, "While splitting '%s' [ws=True]" % ss)
|
||||||
|
|
||||||
def testTokenTypes(self):
|
def testTokenTypes(self):
|
||||||
"""Test that tokens are split with types as expected."""
|
"""Test that tokens are split with types as expected."""
|
||||||
|
@ -293,6 +306,19 @@ class ShlexTest(unittest.TestCase):
|
||||||
s = shlex.shlex("'')abc", punctuation_chars=True)
|
s = shlex.shlex("'')abc", punctuation_chars=True)
|
||||||
self.assertEqual(list(s), expected)
|
self.assertEqual(list(s), expected)
|
||||||
|
|
||||||
|
def testUnicodeHandling(self):
|
||||||
|
"""Test punctuation_chars and whitespace_split handle unicode."""
|
||||||
|
ss = "\u2119\u01b4\u2602\u210c\u00f8\u1f24"
|
||||||
|
# Should be parsed as one complete token (whitespace_split=True).
|
||||||
|
ref = ['\u2119\u01b4\u2602\u210c\u00f8\u1f24']
|
||||||
|
s = shlex.shlex(ss, punctuation_chars=True)
|
||||||
|
s.whitespace_split = True
|
||||||
|
self.assertEqual(list(s), ref)
|
||||||
|
# Without whitespace_split, only wordchars are joined, so each character here is its own token.
|
||||||
|
ref = ['\u2119', '\u01b4', '\u2602', '\u210c', '\u00f8', '\u1f24']
|
||||||
|
s = shlex.shlex(ss, punctuation_chars=True)
|
||||||
|
self.assertEqual(list(s), ref)
|
||||||
|
|
||||||
def testQuote(self):
|
def testQuote(self):
|
||||||
safeunquoted = string.ascii_letters + string.digits + '@%_-+=:,./'
|
safeunquoted = string.ascii_letters + string.digits + '@%_-+=:,./'
|
||||||
unicode_sample = '\xe9\xe0\xdf' # e + acute accent, a + grave, sharp s
|
unicode_sample = '\xe9\xe0\xdf' # e + acute accent, a + grave, sharp s
|
||||||
|
|
Loading…
Reference in New Issue