diff -r e379617b4c4c -r 9e12275eec25 Doc/library/shlex.rst --- a/Doc/library/shlex.rst Thu Jan 05 19:49:27 2012 +0100 +++ b/Doc/library/shlex.rst Fri Jan 06 18:54:26 2012 +0000 @@ -70,7 +70,7 @@ The :mod:`shlex` module defines the following class: -.. class:: shlex(instream=None, infile=None, posix=False) +.. class:: shlex(instream=None, infile=None, posix=False, control=False) A :class:`shlex` instance or subclass instance is a lexical analyzer object. The initialization argument, if present, specifies where to read characters @@ -84,6 +84,9 @@ operate in compatibility mode. When operating in POSIX mode, :class:`shlex` will try to be as close as possible to the POSIX shell parsing rules. + .. versionchanged:: 3.3 + The *control* parameter was added. See + :ref:`improved-shell-compatibility`. .. seealso:: @@ -316,3 +319,47 @@ * EOF is signaled with a :const:`None` value; * Quoted empty strings (``''``) are allowed. + +.. _improved-shell-compatibility: + +Improved Compatibility with Shells +---------------------------------- + +.. versionadded:: 3.3 + +The :class:`shlex` class now provides added compatibility with the parsing +performed by common Unix shells like ``bash``, ``dash``, and ``sh``. In order to take +advantage of this improved compatibility, an additional keyword parameter, +*control*, has been added to the constructor. This defaults to ``False``, which +preserves existing behaviour. However, if it is set to ``True``, then parsing of +the characters ``();<>|&`` is changed: any run of these characters is returned as +a single token. While this is short of a full parser for shells (which would be +out of scope for the standard library, given the multiplicity of shells out +there), it does allow you to perform processing of command lines more easily +than you could before. 
To illustrate, you can see the difference in the +following snippet:: + + import shlex + + for ctrl, message in ((False, 'Old'), (True, 'New')): + text = "a && b; c && d || e; f >'abc'; (def \"ghi\")" + s = shlex.shlex(text, control=ctrl) + print('%s: %s' % (message, list(s))) + +which prints out:: + + Old: ['a', '&', '&', 'b', ';', 'c', '&', '&', 'd', '|', '|', 'e', ';', 'f', '>', "'abc'", ';', '(', 'def', '"ghi"', ')'] + New: ['a', '&&', 'b', ';', 'c', '&&', 'd', '||', 'e', ';', 'f', '>', "'abc'", ';', '(', 'def', '"ghi"', ')'] + +Of course, tokens may be returned which are not valid for shells, and you'll +need to implement your own error checks on the returned tokens. + +Instead of passing ``True`` as the value for the *control* parameter, you can pass +a string with specific characters, which will be used to determine which +characters are treated as control characters. For example:: + + >>> import shlex + >>> s = shlex.shlex("a && b || c", control="|") + >>> list(s) + ['a', '&', '&', 'b', '||', 'c'] + diff -r e379617b4c4c -r 9e12275eec25 Lib/shlex.py --- a/Lib/shlex.py Thu Jan 05 19:49:27 2012 +0100 +++ b/Lib/shlex.py Fri Jan 06 18:54:26 2012 +0000 @@ -17,7 +17,7 @@ class shlex: "A lexical analyzer class for simple shell-like syntaxes." 
- def __init__(self, instream=None, infile=None, posix=False): + def __init__(self, instream=None, infile=None, posix=False, control=False): if isinstance(instream, str): instream = StringIO(instream) if instream is not None: @@ -49,6 +49,14 @@ self.token = '' self.filestack = deque() self.source = None + if not control: + control = '' + elif control is True: + control = '();<>|&' + self.control = control + if control: + self.pbchars = deque() + self.wordchars += '-' if self.debug: print('shlex: reading from %s, line %d' \ % (self.instream, self.lineno)) @@ -118,7 +126,10 @@ quoted = False escapedstate = ' ' while True: - nextchar = self.instream.read(1) + if self.control and self.pbchars: + nextchar = self.pbchars.pop() + else: + nextchar = self.instream.read(1) if nextchar == '\n': self.lineno = self.lineno + 1 if self.debug >= 3: @@ -147,6 +158,9 @@ elif nextchar in self.wordchars: self.token = nextchar self.state = 'a' + elif nextchar in self.control: + self.token = nextchar + self.state = 'c' elif nextchar in self.quotes: if not self.posix: self.token = nextchar @@ -193,7 +207,7 @@ self.token = self.token + self.state self.token = self.token + nextchar self.state = escapedstate - elif self.state == 'a': + elif self.state in ('a', 'c'): if not nextchar: self.state = None # end of file break @@ -219,11 +233,22 @@ elif self.posix and nextchar in self.escape: escapedstate = 'a' self.state = nextchar + elif self.state == 'c': + if nextchar in self.control: + self.token = self.token + nextchar + else: + if nextchar not in self.whitespace: + self.pbchars.append(nextchar) + self.state = ' ' + break elif nextchar in self.wordchars or nextchar in self.quotes \ or self.whitespace_split: self.token = self.token + nextchar else: - self.pushback.appendleft(nextchar) + if self.control: + self.pbchars.append(nextchar) + else: + self.pushback.appendleft(nextchar) if self.debug >= 2: print("shlex: I see punctuation in word state") self.state = ' ' diff -r e379617b4c4c -r 
9e12275eec25 Lib/test/test_shlex.py --- a/Lib/test/test_shlex.py Thu Jan 05 19:49:27 2012 +0100 +++ b/Lib/test/test_shlex.py Fri Jan 06 18:54:26 2012 +0000 @@ -174,6 +174,68 @@ "%s: %s != %s" % (self.data[i][0], l, self.data[i][1:])) + def testSyntaxSplitAmpersandAndPipe(self): + """Test handling of syntax splitting of &, |""" + # Could take these forms: &&, &, |&, ;&, ;;& + # of course, the same applies to | and || + # every spacing variant should parse to the same output + for delimiter in ('&&', '&', '|&', ';&', ';;&', + '||', '|', '&|', ';|', ';;|'): + src = ['echo hi %s echo bye' % delimiter, + 'echo hi%secho bye' % delimiter] + ref = ['echo', 'hi', delimiter, 'echo', 'bye'] + for ss in src: + s = shlex.shlex(ss, control=True) + result = list(s) + self.assertEqual(ref, result, "While splitting '%s'" % ss) + + def testSyntaxSplitSemicolon(self): + """Test handling of syntax splitting of ;""" + # Could take these forms: ;, ;;, ;&, ;;& + # every spacing variant should parse to the same output + for delimiter in (';', ';;', ';&', ';;&'): + src = ['echo hi %s echo bye' % delimiter, + 'echo hi%s echo bye' % delimiter, + 'echo hi%secho bye' % delimiter] + ref = ['echo', 'hi', delimiter, 'echo', 'bye'] + for ss in src: + s = shlex.shlex(ss, control=True) + result = list(s) + self.assertEqual(ref, result, "While splitting '%s'" % ss) + + def testSyntaxSplitRedirect(self): + """Test handling of syntax splitting of >, <, |""" + # '>' is the case named above; the same applies to <, | + # every spacing variant should parse to the same output + for delimiter in ('>', '<', '|'): + src = ['echo hi %s out' % delimiter, + 'echo hi%s out' % delimiter, + 'echo hi%sout' % delimiter] + ref = ['echo', 'hi', delimiter, 'out'] + for ss in src: + s = shlex.shlex(ss, control=True) + result = list(s) + self.assertEqual(ref, result, "While splitting '%s'" % ss) + + def testSyntaxSplitParen(self): + """Test handling of syntax splitting of ()""" + # every spacing variant should parse to the same output + src = ['( echo hi )', + '(echo hi)'] + ref = ['(', 'echo', 
'hi', ')'] + for ss in src: + s = shlex.shlex(ss, control=True) + result = list(s) + self.assertEqual(ref, result, "While splitting '%s'" % ss) + + def testSyntaxSplitCustom(self): + """Test handling of syntax splitting with custom chars""" + ref = ['a', '&', '&', 'b', '||', 'c'] + ss = "a && b || c" + s = shlex.shlex(ss, control="|") + result = list(s) + self.assertEqual(ref, result, "While splitting '%s'" % ss) + def testQuote(self): safeunquoted = string.ascii_letters + string.digits + '@%_-+=:,./' unicode_sample = '\xe9\xe0\xdf' # e + acute accent, a + grave, sharp s