From 56624a99a916fd27152d5b23364303acc0d707de Mon Sep 17 00:00:00 2001 From: Evan Date: Sun, 2 Jun 2019 05:09:22 +1000 Subject: [PATCH] bpo-28595: Allow shlex whitespace_split with punctuation_chars (GH-2071) --- Doc/library/shlex.rst | 35 +++++++++++++++++++++----------- Lib/shlex.py | 3 ++- Lib/test/test_shlex.py | 46 +++++++++++++++++++++++++++++++++--------- 3 files changed, 61 insertions(+), 23 deletions(-) diff --git a/Doc/library/shlex.rst b/Doc/library/shlex.rst index 8c5b0239d1f..a8421fdb700 100644 --- a/Doc/library/shlex.rst +++ b/Doc/library/shlex.rst @@ -225,7 +225,8 @@ variables which either control lexical analysis or can be used for debugging: appear in filename specifications and command line parameters, will also be included in this attribute, and any characters which appear in ``punctuation_chars`` will be removed from ``wordchars`` if they are present - there. + there. If :attr:`whitespace_split` is set to ``True``, this will have no + effect. .. attribute:: shlex.whitespace @@ -258,11 +259,13 @@ variables which either control lexical analysis or can be used for debugging: If ``True``, tokens will only be split in whitespaces. This is useful, for example, for parsing command lines with :class:`~shlex.shlex`, getting - tokens in a similar way to shell arguments. If this attribute is ``True``, - :attr:`punctuation_chars` will have no effect, and splitting will happen - only on whitespaces. When using :attr:`punctuation_chars`, which is - intended to provide parsing closer to that implemented by shells, it is - advisable to leave ``whitespace_split`` as ``False`` (the default value). + tokens in a similar way to shell arguments. When used in combination with + :attr:`punctuation_chars`, tokens will be split on whitespace in addition to + those characters. + + .. versionchanged:: 3.8 + The :attr:`punctuation_chars` attribute was made compatible with the + :attr:`whitespace_split` attribute. .. attribute:: shlex.infile @@ -398,12 +401,15 @@ otherwise. 
To illustrate, you can see the difference in the following snippet: >>> import shlex >>> text = "a && b; c && d || e; f >'abc'; (def \"ghi\")" - >>> list(shlex.shlex(text)) - ['a', '&', '&', 'b', ';', 'c', '&', '&', 'd', '|', '|', 'e', ';', 'f', '>', - "'abc'", ';', '(', 'def', '"ghi"', ')'] - >>> list(shlex.shlex(text, punctuation_chars=True)) - ['a', '&&', 'b', ';', 'c', '&&', 'd', '||', 'e', ';', 'f', '>', "'abc'", - ';', '(', 'def', '"ghi"', ')'] + >>> s = shlex.shlex(text, posix=True) + >>> s.whitespace_split = True + >>> list(s) + ['a', '&&', 'b;', 'c', '&&', 'd', '||', 'e;', 'f', '>abc;', '(def', 'ghi)'] + >>> s = shlex.shlex(text, posix=True, punctuation_chars=True) + >>> s.whitespace_split = True + >>> list(s) + ['a', '&&', 'b', ';', 'c', '&&', 'd', '||', 'e', ';', 'f', '>', 'abc', ';', + '(', 'def', 'ghi', ')'] Of course, tokens will be returned which are not valid for shells, and you'll need to implement your own error checks on the returned tokens. @@ -428,6 +434,11 @@ which characters constitute punctuation. For example:: >>> list(s) ['~/a', '&&', 'b-c', '--color=auto', '||', 'd', '*.py?'] + However, to match the shell as closely as possible, it is recommended to + always use ``posix`` and :attr:`~shlex.whitespace_split` when using + :attr:`~shlex.punctuation_chars`, which will negate + :attr:`~shlex.wordchars` entirely. + For best effect, ``punctuation_chars`` should be set in conjunction with ``posix=True``. (Note that ``posix=False`` is the default for :class:`~shlex.shlex`.) 
diff --git a/Lib/shlex.py b/Lib/shlex.py index fb1130d4eac..edea0779486 100644 --- a/Lib/shlex.py +++ b/Lib/shlex.py @@ -246,7 +246,8 @@ class shlex: escapedstate = 'a' self.state = nextchar elif (nextchar in self.wordchars or nextchar in self.quotes - or self.whitespace_split): + or (self.whitespace_split and + nextchar not in self.punctuation_chars)): self.token += nextchar else: if self.punctuation_chars: diff --git a/Lib/test/test_shlex.py b/Lib/test/test_shlex.py index a432610d3af..376c5e88d38 100644 --- a/Lib/test/test_shlex.py +++ b/Lib/test/test_shlex.py @@ -1,4 +1,5 @@ import io +import itertools import shlex import string import unittest @@ -183,10 +184,12 @@ class ShlexTest(unittest.TestCase): src = ['echo hi %s echo bye' % delimiter, 'echo hi%secho bye' % delimiter] ref = ['echo', 'hi', delimiter, 'echo', 'bye'] - for ss in src: + for ss, ws in itertools.product(src, (False, True)): s = shlex.shlex(ss, punctuation_chars=True) + s.whitespace_split = ws result = list(s) - self.assertEqual(ref, result, "While splitting '%s'" % ss) + self.assertEqual(ref, result, + "While splitting '%s' [ws=%s]" % (ss, ws)) def testSyntaxSplitSemicolon(self): """Test handling of syntax splitting of ;""" @@ -197,10 +200,12 @@ class ShlexTest(unittest.TestCase): 'echo hi%s echo bye' % delimiter, 'echo hi%secho bye' % delimiter] ref = ['echo', 'hi', delimiter, 'echo', 'bye'] - for ss in src: + for ss, ws in itertools.product(src, (False, True)): s = shlex.shlex(ss, punctuation_chars=True) + s.whitespace_split = ws result = list(s) - self.assertEqual(ref, result, "While splitting '%s'" % ss) + self.assertEqual(ref, result, + "While splitting '%s' [ws=%s]" % (ss, ws)) def testSyntaxSplitRedirect(self): """Test handling of syntax splitting of >""" @@ -211,10 +216,11 @@ class ShlexTest(unittest.TestCase): 'echo hi%s out' % delimiter, 'echo hi%sout' % delimiter] ref = ['echo', 'hi', delimiter, 'out'] - for ss in src: + for ss, ws in itertools.product(src, (False, True)): s = 
shlex.shlex(ss, punctuation_chars=True) result = list(s) - self.assertEqual(ref, result, "While splitting '%s'" % ss) + self.assertEqual(ref, result, + "While splitting '%s' [ws=%s]" % (ss, ws)) def testSyntaxSplitParen(self): """Test handling of syntax splitting of ()""" @@ -222,18 +228,25 @@ class ShlexTest(unittest.TestCase): src = ['( echo hi )', '(echo hi)'] ref = ['(', 'echo', 'hi', ')'] - for ss in src: + for ss, ws in itertools.product(src, (False, True)): s = shlex.shlex(ss, punctuation_chars=True) + s.whitespace_split = ws result = list(s) - self.assertEqual(ref, result, "While splitting '%s'" % ss) + self.assertEqual(ref, result, + "While splitting '%s' [ws=%s]" % (ss, ws)) def testSyntaxSplitCustom(self): """Test handling of syntax splitting with custom chars""" + ss = "~/a&&b-c --color=auto||d *.py?" ref = ['~/a', '&', '&', 'b-c', '--color=auto', '||', 'd', '*.py?'] - ss = "~/a && b-c --color=auto || d *.py?" s = shlex.shlex(ss, punctuation_chars="|") result = list(s) - self.assertEqual(ref, result, "While splitting '%s'" % ss) + self.assertEqual(ref, result, "While splitting '%s' [ws=False]" % ss) + ref = ['~/a&&b-c', '--color=auto', '||', 'd', '*.py?'] + s = shlex.shlex(ss, punctuation_chars="|") + s.whitespace_split = True + result = list(s) + self.assertEqual(ref, result, "While splitting '%s' [ws=True]" % ss) def testTokenTypes(self): """Test that tokens are split with types as expected.""" @@ -293,6 +306,19 @@ class ShlexTest(unittest.TestCase): s = shlex.shlex("'')abc", punctuation_chars=True) self.assertEqual(list(s), expected) + def testUnicodeHandling(self): + """Test punctuation_chars and whitespace_split handle unicode.""" + ss = "\u2119\u01b4\u2602\u210c\u00f8\u1f24" + # Should be parsed as one complete token (whitespace_split=True). 
+ ref = ['\u2119\u01b4\u2602\u210c\u00f8\u1f24'] + s = shlex.shlex(ss, punctuation_chars=True) + s.whitespace_split = True + self.assertEqual(list(s), ref) + # Without whitespace_split, each non-wordchars char is its own token. + ref = ['\u2119', '\u01b4', '\u2602', '\u210c', '\u00f8', '\u1f24'] + s = shlex.shlex(ss, punctuation_chars=True) + self.assertEqual(list(s), ref) + def testQuote(self): safeunquoted = string.ascii_letters + string.digits + '@%_-+=:,./' unicode_sample = '\xe9\xe0\xdf' # e + acute accent, a + grave, sharp s