bpo-28595: Allow shlex whitespace_split with punctuation_chars (GH-2071)

2019-06-02 05:09:22 +10:00 · 2019-06-02 05:09:22 +10:00 · 56624a99a9
parent 2b843ac0ae
commit 56624a99a9
3 changed files with 61 additions and 23 deletions
--- a/Doc/library/shlex.rst
+++ b/Doc/library/shlex.rst
@ -225,7 +225,8 @@ variables which either control lexical analysis or can be used for debugging:
   appear in filename specifications and command line parameters, will also be
   included in this attribute, and any characters which appear in
   ``punctuation_chars`` will be removed from ``wordchars`` if they are present
-   there.
+   there. If :attr:`whitespace_split` is set to ``True``, this will have no
   effect.
 .. attribute:: shlex.whitespace
@ -258,11 +259,13 @@ variables which either control lexical analysis or can be used for debugging:
   If ``True``, tokens will only be split in whitespaces.  This is useful, for
   example, for parsing command lines with :class:`~shlex.shlex`, getting
-   tokens in a similar way to shell arguments.  If this attribute is ``True``,
+   tokens in a similar way to shell arguments.  When used in combination with
-   :attr:`punctuation_chars` will have no effect, and splitting will happen
+   :attr:`punctuation_chars`, tokens will be split on whitespace in addition to
-   only on whitespaces.  When using :attr:`punctuation_chars`, which is
+   those characters.
-   intended to provide parsing closer to that implemented by shells, it is
+
-   advisable to leave ``whitespace_split`` as ``False`` (the default value).
+   .. versionchanged:: 3.8
      The :attr:`punctuation_chars` attribute was made compatible with the
      :attr:`whitespace_split` attribute.
 .. attribute:: shlex.infile
@ -398,12 +401,15 @@ otherwise.  To illustrate, you can see the difference in the following snippet:
    >>> import shlex
    >>> text = "a && b; c && d || e; f >'abc'; (def \"ghi\")"
-    >>> list(shlex.shlex(text))
+    >>> s = shlex.shlex(text, posix=True)
-    ['a', '&', '&', 'b', ';', 'c', '&', '&', 'd', '|', '|', 'e', ';', 'f', '>',
+    >>> s.whitespace_split = True
-    "'abc'", ';', '(', 'def', '"ghi"', ')']
+    >>> list(s)
-    >>> list(shlex.shlex(text, punctuation_chars=True))
+    ['a', '&&', 'b;', 'c', '&&', 'd', '||', 'e;', 'f', '>abc;', '(def', 'ghi)']
-    ['a', '&&', 'b', ';', 'c', '&&', 'd', '||', 'e', ';', 'f', '>', "'abc'",
+    >>> s = shlex.shlex(text, posix=True, punctuation_chars=True)
-    ';', '(', 'def', '"ghi"', ')']
+    >>> s.whitespace_split = True
    >>> list(s)
    ['a', '&&', 'b', ';', 'c', '&&', 'd', '||', 'e', ';', 'f', '>', 'abc', ';',
    '(', 'def', 'ghi', ')']
 Of course, tokens will be returned which are not valid for shells, and you'll
 need to implement your own error checks on the returned tokens.
@ -428,6 +434,11 @@ which characters constitute punctuation. For example::
      >>> list(s)
      ['~/a', '&&', 'b-c', '--color=auto', '||', 'd', '*.py?']
   However, to match the shell as closely as possible, it is recommended to
   always use ``posix`` and :attr:`~shlex.whitespace_split` when using
   :attr:`~shlex.punctuation_chars`, which will negate
   :attr:`~shlex.wordchars` entirely.
 For best effect, ``punctuation_chars`` should be set in conjunction with
 ``posix=True``. (Note that ``posix=False`` is the default for
 :class:`~shlex.shlex`.)
--- a/Lib/shlex.py
+++ b/Lib/shlex.py
@ -246,7 +246,8 @@ class shlex:
                    escapedstate = 'a'
                    self.state = nextchar
                elif (nextchar in self.wordchars or nextchar in self.quotes
-                      or self.whitespace_split):
+                      or (self.whitespace_split and
                          nextchar not in self.punctuation_chars)):
                    self.token += nextchar
                else:
                    if self.punctuation_chars:
--- a/Lib/test/test_shlex.py
+++ b/Lib/test/test_shlex.py
@ -1,4 +1,5 @@
 import io
 import itertools
 import shlex
 import string
 import unittest
@ -183,10 +184,12 @@ class ShlexTest(unittest.TestCase):
            src = ['echo hi %s echo bye' % delimiter,
                   'echo hi%secho bye' % delimiter]
            ref = ['echo', 'hi', delimiter, 'echo', 'bye']
-            for ss in src:
+            for ss, ws in itertools.product(src, (False, True)):
                s = shlex.shlex(ss, punctuation_chars=True)
                s.whitespace_split = ws
                result = list(s)
-                self.assertEqual(ref, result, "While splitting '%s'" % ss)
+                self.assertEqual(ref, result,
                                 "While splitting '%s' [ws=%s]" % (ss, ws))
    def testSyntaxSplitSemicolon(self):
        """Test handling of syntax splitting of ;"""
@ -197,10 +200,12 @@ class ShlexTest(unittest.TestCase):
                   'echo hi%s echo bye' % delimiter,
                   'echo hi%secho bye' % delimiter]
            ref = ['echo', 'hi', delimiter, 'echo', 'bye']
-            for ss in src:
+            for ss, ws in itertools.product(src, (False, True)):
                s = shlex.shlex(ss, punctuation_chars=True)
                s.whitespace_split = ws
                result = list(s)
-                self.assertEqual(ref, result, "While splitting '%s'" % ss)
+                self.assertEqual(ref, result,
                                 "While splitting '%s' [ws=%s]" % (ss, ws))
    def testSyntaxSplitRedirect(self):
        """Test handling of syntax splitting of >"""
@ -211,10 +216,11 @@ class ShlexTest(unittest.TestCase):
                   'echo hi%s out' % delimiter,
                   'echo hi%sout' % delimiter]
            ref = ['echo', 'hi', delimiter, 'out']
-            for ss in src:
+            for ss, ws in itertools.product(src, (False, True)):
                s = shlex.shlex(ss, punctuation_chars=True)
                result = list(s)
-                self.assertEqual(ref, result, "While splitting '%s'" % ss)
+                self.assertEqual(ref, result,
                                 "While splitting '%s' [ws=%s]" % (ss, ws))
    def testSyntaxSplitParen(self):
        """Test handling of syntax splitting of ()"""
@ -222,18 +228,25 @@ class ShlexTest(unittest.TestCase):
        src = ['( echo hi )',
               '(echo hi)']
        ref = ['(', 'echo', 'hi', ')']
-        for ss in src:
+        for ss, ws in itertools.product(src, (False, True)):
            s = shlex.shlex(ss, punctuation_chars=True)
            s.whitespace_split = ws
            result = list(s)
-            self.assertEqual(ref, result, "While splitting '%s'" % ss)
+            self.assertEqual(ref, result,
                             "While splitting '%s' [ws=%s]" % (ss, ws))
    def testSyntaxSplitCustom(self):
        """Test handling of syntax splitting with custom chars"""
        ref = ['~/a', '&', '&', 'b-c', '--color=auto', '||', 'd', '*.py?']
        ss = "~/a&&b-c --color=auto||d *.py?"
        ref = ['~/a', '&', '&', 'b-c', '--color=auto', '||', 'd', '*.py?']
        s = shlex.shlex(ss, punctuation_chars="|")
        result = list(s)
-        self.assertEqual(ref, result, "While splitting '%s'" % ss)
+        self.assertEqual(ref, result, "While splitting '%s' [ws=False]" % ss)
        ref = ['~/a&&b-c', '--color=auto', '||', 'd', '*.py?']
        s = shlex.shlex(ss, punctuation_chars="|")
        s.whitespace_split = True
        result = list(s)
        self.assertEqual(ref, result, "While splitting '%s' [ws=True]" % ss)
    def testTokenTypes(self):
        """Test that tokens are split with types as expected."""
@ -293,6 +306,19 @@ class ShlexTest(unittest.TestCase):
        s = shlex.shlex("'')abc", punctuation_chars=True)
        self.assertEqual(list(s), expected)
    def testUnicodeHandling(self):
        """Test punctuation_chars and whitespace_split handle unicode.

        The input is six non-ASCII characters with no whitespace between
        them.  It is tokenized twice with ``punctuation_chars=True``: once
        with ``whitespace_split`` set to ``True`` and once without setting
        it, and the two results are expected to differ.
        """
        ss = "\u2119\u01b4\u2602\u210c\u00f8\u1f24"
        # With whitespace_split=True the whole string should come back as a
        # single token: it contains no whitespace to split on.
        ref = ['\u2119\u01b4\u2602\u210c\u00f8\u1f24']
        s = shlex.shlex(ss, punctuation_chars=True)
        s.whitespace_split = True
        self.assertEqual(list(s), ref)
        # Without whitespace_split the lexer relies on wordchars, and every
        # character is expected as its own token (presumably because none of
        # them is in the default wordchars -- confirm against Lib/shlex.py).
        ref = ['\u2119', '\u01b4', '\u2602', '\u210c', '\u00f8', '\u1f24']
        s = shlex.shlex(ss, punctuation_chars=True)
        self.assertEqual(list(s), ref)
    def testQuote(self):
        safeunquoted = string.ascii_letters + string.digits + '@%_-+=:,./'
        unicode_sample = '\xe9\xe0\xdf'  # e + acute accent, a + grave, sharp s