Closes #1521950: Made shlex parsing more shell-like.

This commit is contained in:
Vinay Sajip 2016-07-29 22:35:03 +01:00
parent d2f87472fe
commit c1f974c944
3 changed files with 264 additions and 31 deletions

View File

@ -73,7 +73,7 @@ The :mod:`shlex` module defines the following functions:
The :mod:`shlex` module defines the following class:
.. class:: shlex(instream=None, infile=None, posix=False)
.. class:: shlex(instream=None, infile=None, posix=False, punctuation_chars=False)
A :class:`~shlex.shlex` instance or subclass instance is a lexical analyzer
object. The initialization argument, if present, specifies where to read
@ -87,8 +87,19 @@ The :mod:`shlex` module defines the following class:
when *posix* is not true (default), the :class:`~shlex.shlex` instance will
operate in compatibility mode. When operating in POSIX mode,
:class:`~shlex.shlex` will try to be as close as possible to the POSIX shell
parsing rules.
parsing rules. The *punctuation_chars* argument provides a way to make the
behaviour even closer to how real shells parse. This can take a number of
values: the default value, ``False``, preserves the behaviour seen under
Python 3.5 and earlier. If set to ``True``, then parsing of the characters
``();<>|&`` is changed: any run of these characters (considered punctuation
characters) is returned as a single token. If set to a non-empty string of
characters, those characters will be used as the punctuation characters. Any
characters in the :attr:`wordchars` attribute that appear in
*punctuation_chars* will be removed from :attr:`wordchars`. See
:ref:`improved-shell-compatibility` for more information.
.. versionchanged:: 3.6
The *punctuation_chars* parameter was added.
.. seealso::
@ -191,7 +202,13 @@ variables which either control lexical analysis or can be used for debugging:
.. attribute:: shlex.wordchars
The string of characters that will accumulate into multi-character tokens. By
default, includes all ASCII alphanumerics and underscore.
default, includes all ASCII alphanumerics and underscore. In POSIX mode, the
accented characters in the Latin-1 set are also included. If
:attr:`punctuation_chars` is not empty, the characters ``~-./*?=``, which can
appear in filename specifications and command line parameters, will also be
included in this attribute, and any characters which appear in
``punctuation_chars`` will be removed from ``wordchars`` if they are present
there.
.. attribute:: shlex.whitespace
@ -224,7 +241,11 @@ variables which either control lexical analysis or can be used for debugging:
If ``True``, tokens will only be split in whitespaces. This is useful, for
example, for parsing command lines with :class:`~shlex.shlex`, getting
tokens in a similar way to shell arguments.
tokens in a similar way to shell arguments. If this attribute is ``True``,
:attr:`punctuation_chars` will have no effect, and splitting will happen
only on whitespace. When using :attr:`punctuation_chars`, which is
intended to provide parsing closer to that implemented by shells, it is
advisable to leave ``whitespace_split`` as ``False`` (the default value).
.. attribute:: shlex.infile
@ -245,10 +266,9 @@ variables which either control lexical analysis or can be used for debugging:
This attribute is ``None`` by default. If you assign a string to it, that
string will be recognized as a lexical-level inclusion request similar to the
``source`` keyword in various shells. That is, the immediately following token
will be opened as a filename and input will
be taken from that stream until EOF, at which
point the :meth:`~io.IOBase.close` method of that stream will be called and
the input source will again become the original input stream. Source
will be opened as a filename and input will be taken from that stream until
EOF, at which point the :meth:`~io.IOBase.close` method of that stream will be
called and the input source will again become the original input stream. Source
requests may be stacked any number of levels deep.
@ -275,6 +295,16 @@ variables which either control lexical analysis or can be used for debugging:
(``''``), in non-POSIX mode, and to ``None`` in POSIX mode.
.. attribute:: shlex.punctuation_chars
Characters that will be considered punctuation. Runs of punctuation
characters will be returned as a single token. However, note that no
semantic validity checking will be performed: for example, '>>>' could be
returned as a token, even though it may not be recognised as such by shells.
.. versionadded:: 3.6
.. _shlex-parsing-rules:
Parsing Rules
@ -327,3 +357,62 @@ following parsing rules.
* EOF is signaled with a :const:`None` value;
* Quoted empty strings (``''``) are allowed.
.. _improved-shell-compatibility:
Improved Compatibility with Shells
----------------------------------
.. versionadded:: 3.6
The :class:`~shlex.shlex` class provides compatibility with the parsing
performed by common Unix shells like ``bash``, ``dash``, and ``sh``. To take
advantage of this compatibility, specify the *punctuation_chars* argument in
the constructor. This defaults to ``False``, which preserves pre-3.6 behaviour.
However, if it is set to ``True``, then parsing of the characters ``();<>|&``
is changed: any run of these characters is returned as a single token. While
this is short of a full parser for shells (which would be out of scope for the
standard library, given the multiplicity of shells out there), it does allow
you to perform processing of command lines more easily than you could
otherwise. To illustrate, you can see the difference in the following snippet::
import shlex
for punct in (False, True):
if punct:
message = 'New'
else:
message = 'Old'
text = "a && b; c && d || e; f >'abc'; (def \"ghi\")"
s = shlex.shlex(text, punctuation_chars=punct)
print('%s: %s' % (message, list(s)))
which prints out::
Old: ['a', '&', '&', 'b', ';', 'c', '&', '&', 'd', '|', '|', 'e', ';', 'f', '>', "'abc'", ';', '(', 'def', '"ghi"', ')']
New: ['a', '&&', 'b', ';', 'c', '&&', 'd', '||', 'e', ';', 'f', '>', "'abc'", ';', '(', 'def', '"ghi"', ')']
Of course, tokens will be returned which are not valid for shells, and you'll
need to implement your own error checks on the returned tokens.
Instead of passing ``True`` as the value for the *punctuation_chars* parameter,
you can pass a string with specific characters, which will be used to determine
which characters constitute punctuation. For example::
>>> import shlex
>>> s = shlex.shlex("a && b || c", punctuation_chars="|")
>>> list(s)
['a', '&', '&', 'b', '||', 'c']
.. note:: When ``punctuation_chars`` is specified, the :attr:`~shlex.wordchars`
attribute is augmented with the characters ``~-./*?=``. That is because these
characters can appear in file names (including wildcards) and command-line
arguments (e.g. ``--color=auto``). Hence::
>>> import shlex
>>> s = shlex.shlex('~/a && b-c --color=auto || d *.py?',
... punctuation_chars=True)
>>> list(s)
['~/a', '&&', 'b-c', '--color=auto', '||', 'd', '*.py?']

View File

@ -5,6 +5,7 @@
# push_source() and pop_source() made explicit by ESR, January 2001.
# Posix compliance, split(), string arguments, and
# iterator interface by Gustavo Niemeyer, April 2003.
# changes to tokenize more like Posix shells by Vinay Sajip, July 2016.
import os
import re
@ -17,7 +18,8 @@ __all__ = ["shlex", "split", "quote"]
class shlex:
"A lexical analyzer class for simple shell-like syntaxes."
def __init__(self, instream=None, infile=None, posix=False):
def __init__(self, instream=None, infile=None, posix=False,
punctuation_chars=False):
if isinstance(instream, str):
instream = StringIO(instream)
if instream is not None:
@ -49,6 +51,19 @@ class shlex:
self.token = ''
self.filestack = deque()
self.source = None
if not punctuation_chars:
punctuation_chars = ''
elif punctuation_chars is True:
punctuation_chars = '();<>|&'
self.punctuation_chars = punctuation_chars
if punctuation_chars:
# _pushback_chars is a push back queue used by lookahead logic
self._pushback_chars = deque()
# these chars added because allowed in file names, args, wildcards
self.wordchars += '~-./*?='
#remove any punctuation chars from wordchars
t = self.wordchars.maketrans(dict.fromkeys(punctuation_chars))
self.wordchars = self.wordchars.translate(t)
def push_token(self, tok):
"Push a token onto the stack popped by the get_token method"
@ -115,12 +130,15 @@ class shlex:
quoted = False
escapedstate = ' '
while True:
if self.punctuation_chars and self._pushback_chars:
nextchar = self._pushback_chars.pop()
else:
nextchar = self.instream.read(1)
if nextchar == '\n':
self.lineno = self.lineno + 1
self.lineno += 1
if self.debug >= 3:
print("shlex: in state", repr(self.state), \
"I see character:", repr(nextchar))
print("shlex: in state %r I see character: %r" % (self.state,
nextchar))
if self.state is None:
self.token = '' # past end of file
break
@ -137,13 +155,16 @@ class shlex:
continue
elif nextchar in self.commenters:
self.instream.readline()
self.lineno = self.lineno + 1
self.lineno += 1
elif self.posix and nextchar in self.escape:
escapedstate = 'a'
self.state = nextchar
elif nextchar in self.wordchars:
self.token = nextchar
self.state = 'a'
elif nextchar in self.punctuation_chars:
self.token = nextchar
self.state = 'c'
elif nextchar in self.quotes:
if not self.posix:
self.token = nextchar
@ -166,17 +187,17 @@ class shlex:
raise ValueError("No closing quotation")
if nextchar == self.state:
if not self.posix:
self.token = self.token + nextchar
self.token += nextchar
self.state = ' '
break
else:
self.state = 'a'
elif self.posix and nextchar in self.escape and \
self.state in self.escapedquotes:
elif (self.posix and nextchar in self.escape and self.state
in self.escapedquotes):
escapedstate = self.state
self.state = nextchar
else:
self.token = self.token + nextchar
self.token += nextchar
elif self.state in self.escape:
if not nextchar: # end of file
if self.debug >= 2:
@ -185,12 +206,12 @@ class shlex:
raise ValueError("No escaped character")
# In posix shells, only the quote itself or the escape
# character may be escaped within quotes.
if escapedstate in self.quotes and \
nextchar != self.state and nextchar != escapedstate:
self.token = self.token + self.state
self.token = self.token + nextchar
if (escapedstate in self.quotes and
nextchar != self.state and nextchar != escapedstate):
self.token += self.state
self.token += nextchar
self.state = escapedstate
elif self.state == 'a':
elif self.state in ('a', 'c'):
if not nextchar:
self.state = None # end of file
break
@ -204,7 +225,7 @@ class shlex:
continue
elif nextchar in self.commenters:
self.instream.readline()
self.lineno = self.lineno + 1
self.lineno += 1
if self.posix:
self.state = ' '
if self.token or (self.posix and quoted):
@ -216,15 +237,26 @@ class shlex:
elif self.posix and nextchar in self.escape:
escapedstate = 'a'
self.state = nextchar
elif nextchar in self.wordchars or nextchar in self.quotes \
or self.whitespace_split:
self.token = self.token + nextchar
elif self.state == 'c':
if nextchar in self.punctuation_chars:
self.token += nextchar
else:
if nextchar not in self.whitespace:
self._pushback_chars.append(nextchar)
self.state = ' '
break
elif (nextchar in self.wordchars or nextchar in self.quotes
or self.whitespace_split):
self.token += nextchar
else:
if self.punctuation_chars:
self._pushback_chars.append(nextchar)
else:
self.pushback.appendleft(nextchar)
if self.debug >= 2:
print("shlex: I see punctuation in word state")
self.state = ' '
if self.token:
if self.token or (self.posix and quoted):
break # emit current token
else:
continue

View File

@ -173,6 +173,118 @@ class ShlexTest(unittest.TestCase):
"%s: %s != %s" %
(self.data[i][0], l, self.data[i][1:]))
def testSyntaxSplitAmpersandAndPipe(self):
"""Test handling of syntax splitting of &, |"""
# Could take these forms: &&, &, |&, ;&, ;;&
# of course, the same applies to | and ||
# these should all parse to the same output
for delimiter in ('&&', '&', '|&', ';&', ';;&',
'||', '|', '&|', ';|', ';;|'):
src = ['echo hi %s echo bye' % delimiter,
'echo hi%secho bye' % delimiter]
ref = ['echo', 'hi', delimiter, 'echo', 'bye']
for ss in src:
s = shlex.shlex(ss, punctuation_chars=True)
result = list(s)
self.assertEqual(ref, result, "While splitting '%s'" % ss)
def testSyntaxSplitSemicolon(self):
"""Test handling of syntax splitting of ;"""
# Could take these forms: ;, ;;, ;&, ;;&
# these should all parse to the same output
for delimiter in (';', ';;', ';&', ';;&'):
src = ['echo hi %s echo bye' % delimiter,
'echo hi%s echo bye' % delimiter,
'echo hi%secho bye' % delimiter]
ref = ['echo', 'hi', delimiter, 'echo', 'bye']
for ss in src:
s = shlex.shlex(ss, punctuation_chars=True)
result = list(s)
self.assertEqual(ref, result, "While splitting '%s'" % ss)
def testSyntaxSplitRedirect(self):
"""Test handling of syntax splitting of >"""
# of course, the same applies to <, |
# these should all parse to the same output
for delimiter in ('<', '|'):
src = ['echo hi %s out' % delimiter,
'echo hi%s out' % delimiter,
'echo hi%sout' % delimiter]
ref = ['echo', 'hi', delimiter, 'out']
for ss in src:
s = shlex.shlex(ss, punctuation_chars=True)
result = list(s)
self.assertEqual(ref, result, "While splitting '%s'" % ss)
def testSyntaxSplitParen(self):
"""Test handling of syntax splitting of ()"""
# these should all parse to the same output
src = ['( echo hi )',
'(echo hi)']
ref = ['(', 'echo', 'hi', ')']
for ss in src:
s = shlex.shlex(ss, punctuation_chars=True)
result = list(s)
self.assertEqual(ref, result, "While splitting '%s'" % ss)
def testSyntaxSplitCustom(self):
"""Test handling of syntax splitting with custom chars"""
ref = ['~/a', '&', '&', 'b-c', '--color=auto', '||', 'd', '*.py?']
ss = "~/a && b-c --color=auto || d *.py?"
s = shlex.shlex(ss, punctuation_chars="|")
result = list(s)
self.assertEqual(ref, result, "While splitting '%s'" % ss)
def testTokenTypes(self):
"""Test that tokens are split with types as expected."""
for source, expected in (
('a && b || c',
[('a', 'a'), ('&&', 'c'), ('b', 'a'),
('||', 'c'), ('c', 'a')]),
):
s = shlex.shlex(source, punctuation_chars=True)
observed = []
while True:
t = s.get_token()
if t == s.eof:
break
if t[0] in s.punctuation_chars:
tt = 'c'
else:
tt = 'a'
observed.append((t, tt))
self.assertEqual(observed, expected)
def testPunctuationInWordChars(self):
"""Test that any punctuation chars are removed from wordchars"""
s = shlex.shlex('a_b__c', punctuation_chars='_')
self.assertNotIn('_', s.wordchars)
self.assertEqual(list(s), ['a', '_', 'b', '__', 'c'])
def testPunctuationWithWhitespaceSplit(self):
"""Test that with whitespace_split, behaviour is as expected"""
s = shlex.shlex('a && b || c', punctuation_chars='&')
# whitespace_split is False, so splitting will be based on
# punctuation_chars
self.assertEqual(list(s), ['a', '&&', 'b', '|', '|', 'c'])
s = shlex.shlex('a && b || c', punctuation_chars='&')
s.whitespace_split = True
# whitespace_split is True, so splitting will be based on
# white space
self.assertEqual(list(s), ['a', '&&', 'b', '||', 'c'])
def testEmptyStringHandling(self):
"""Test that parsing of empty strings is correctly handled."""
# see Issue #21999
# POSIX mode: the quoted empty string is expected to come back as an empty
# token, both without and with punctuation_chars.
expected = ['', ')', 'abc']
for punct in (False, True):
s = shlex.shlex("'')abc", posix=True, punctuation_chars=punct)
slist = list(s)
# NOTE(review): indentation is lost in this view; presumably this assertion
# runs once per punct value -- confirm against the real file.
self.assertEqual(slist, expected)
# posix is not passed here, and the expected first token keeps its quotes.
expected = ["''", ')', 'abc']
s = shlex.shlex("'')abc", punctuation_chars=True)
self.assertEqual(list(s), expected)
def testQuote(self):
safeunquoted = string.ascii_letters + string.digits + '@%_-+=:,./'
unicode_sample = '\xe9\xe0\xdf' # e + acute accent, a + grave, sharp s