diff -urN old/Lib/shlex.py new/Lib/shlex.py --- old/Lib/shlex.py 2009-12-29 00:02:14.000000000 -0800 +++ new/Lib/shlex.py 2009-12-29 00:02:04.000000000 -0800 @@ -6,6 +6,8 @@ # push_source() and pop_source() made explicit by ESR, January 2001. # Posix compliance, split(), string arguments, and # iterator interface by Gustavo Niemeyer, April 2003. +# conversion to generator, mass optimization and cleanup +# by Brian Harring, December 2009 import os.path import sys @@ -18,28 +20,58 @@ __all__ = ["shlex", "split"] + +class stream_source(object): + + def __init__(self, stream, filename=None): + self.stream = stream + self.lineno = 1 + self.eof = False + self.filename = filename + + def readline(self): + data = self.stream.readline() + self.lineno += 1 + return data + + def __iter__(self): + if self.stream is None: + raise StopIteration() + read = self.stream.read + data = read(1) + while data: + if data in '\n': + self.lineno += 1 + yield data + data = read(1) + self.close() + + def close(self): + self.eof = True + self.stream = None + + def __str__(self): + if self.filename: + return "streamed filename %r" % (self.filename,) + return "stream %r" % (self.stream,) + + class shlex: "A lexical analyzer class for simple shell-like syntaxes." 
- def __init__(self, instream=None, infile=None, posix=False): - if isinstance(instream, basestring): - instream = StringIO(instream) - if instream is not None: - self.instream = instream - self.infile = infile - else: - self.instream = sys.stdin - self.infile = None + def __init__(self, instream=None, infile=None, posix=False, + debug=0): self.posix = posix if posix: self.eof = None else: self.eof = '' self.commenters = '#' - self.wordchars = ('abcdfeghijklmnopqrstuvwxyz' - 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_') + self._wordchars = set(('abcdfeghijklmnopqrstuvwxyz' + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_')) if self.posix: - self.wordchars += ('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ' + self._wordchars.update('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ' 'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ') + self.whitespace = ' \t\r\n' self.whitespace_split = False self.quotes = '\'"' @@ -47,57 +79,81 @@ self.escapedquotes = '"' self.state = ' ' self.pushback = deque() - self.lineno = 1 - self.debug = 0 - self.token = '' + self.debug = debug self.filestack = deque() self.source = None + if self.debug: - print 'shlex: reading from %s, line %d' \ - % (self.instream, self.lineno) + self.emit_debug('reading from %s, line %d', + (instream, 1,)) + + self.instream = None + self.token_stream = None + self.push_source(instream, newfile=infile) + + @property + def infile(self): + return self.instream.filename + + @property + def lineno(self): + return self.instream.lineno + + def emit_debug(self, format, args=(), level=1): + if self.debug >= level: + print "shlex: " + (format % args) + + + # purely for backwards compatibility... annoying. 
+ def _set_wordchars(self, arg): + self._wordchars = set(arg) + + def _get_wordchars(self): + return ''.join(sorted(self._wordchars)) + + wordchars = property(_get_wordchars, _set_wordchars) def push_token(self, tok): "Push a token onto the stack popped by the get_token method" - if self.debug >= 1: - print "shlex: pushing token " + repr(tok) + if self.debug: + self.emit_debug("pushing token %r", (tok,)) self.pushback.appendleft(tok) def push_source(self, newstream, newfile=None): "Push an input source onto the lexer's input source stack." if isinstance(newstream, basestring): newstream = StringIO(newstream) - self.filestack.appendleft((self.infile, self.instream, self.lineno)) - self.infile = newfile - self.instream = newstream - self.lineno = 1 + if self.instream: + self.filestack.appendleft((self.token_stream, self.instream)) + self.instream = stream_source(newstream, filename=newfile) + self.token_stream = self.read_token_stream() if self.debug: - if newfile is not None: - print 'shlex: pushing to file %s' % (self.infile,) - else: - print 'shlex: pushing to stream %s' % (self.instream,) + self.emit_debug("pushing to %s", (self.instream,)) def pop_source(self): "Pop the input source stack." self.instream.close() - (self.infile, self.instream, self.lineno) = self.filestack.popleft() + (self.token_stream, self.instream) = self.filestack.popleft() if self.debug: - print 'shlex: popping to %s, line %d' \ - % (self.instream, self.lineno) + self.emit_debug("popping to %s, line %d", (self.instream, self.lineno)) self.state = ' ' def get_token(self): "Get a token from the input stream (or from stack if it's nonempty)" if self.pushback: tok = self.pushback.popleft() - if self.debug >= 1: - print "shlex: popping token " + repr(tok) + if self.debug: + self.emit_debug("popping token %r", (tok,)) return tok # No pushback. Get a token. 
raw = self.read_token() # Handle inclusions if self.source is not None: while raw == self.source: - spec = self.sourcehook(self.read_token()) + token = self.read_token() + if self.debug: + self.emit_debug("raw token=%r" % (token,)) + spec = self.sourcehook(token) if spec: (newfile, newstream) = spec self.push_source(newstream, newfile) @@ -106,144 +162,179 @@ while raw == self.eof: if not self.filestack: return self.eof - else: - self.pop_source() - raw = self.get_token() + self.pop_source() + raw = self.get_token() # Neither inclusion nor EOF - if self.debug >= 1: - if raw != self.eof: - print "shlex: token=" + repr(raw) - else: - print "shlex: token=EOF" + if self.debug: + self.emit_debug("token=%r", (raw == self.eof and 'EOF' or raw)) return raw def read_token(self): - quoted = False - escapedstate = ' ' - while True: - nextchar = self.instream.read(1) - if nextchar == '\n': - self.lineno = self.lineno + 1 - if self.debug >= 3: - print "shlex: in state", repr(self.state), \ - "I see character:", repr(nextchar) - if self.state is None: - self.token = '' # past end of file + try: + return self.token_stream.next() + except StopIteration: + return self.eof + + def _nonposix_parse_quote(self, stream_i, quoted, token): + token = quoted + for nextchar in stream_i: + token += nextchar + if nextchar == quoted: break - elif self.state == ' ': - if not nextchar: - self.state = None # end of file + else: + raise ValueError("No closing quotation") + # got all of it, yield it; basicall "DO" of "DO"monkey + return token, True + + def _posix_parse_quote(self, stream_i, quoted, token): + if quoted in self.escapedquotes: + escape = self.escape + try: + for nextchar in stream_i: + if nextchar in escape: + escaper = nextchar + nextchar = stream_i.next() + if nextchar != escaper and nextchar != quoted: + token += escaper + elif nextchar == quoted: + break + token += nextchar + else: + raise ValueError("No closing quotation") + except StopIteration: + # escaped char... 
+ raise ValueError("No closing quotation") + else: + for nextchar in stream_i: + if nextchar == quoted: break - elif nextchar in self.whitespace: - if self.debug >= 2: - print "shlex: I see whitespace in whitespace state" - if self.token or (self.posix and quoted): - break # emit current token - else: + token += nextchar + else: + raise ValueError("No closing quotation") + + if not self.whitespace_split: + return token, True + return token, False + + def read_token_stream(self): + if self.state is None: + return + + debug = self.debug + + instream = self.instream + whitespace = self.whitespace + commenters = self.commenters + escape = self.escape + wordchars = self._wordchars + posix = self.posix + if posix: + parse_quote = self._posix_parse_quote + else: + parse_quote = self._nonposix_parse_quote + + quotes = self.quotes + + got_input = False + + assert instream is not None + stream_i = iter(instream) + + while True: + + got_input = False + token = '' + + for nextchar in stream_i: + got_input = True + if nextchar in whitespace: + continue + + if nextchar in commenters: + # literally just like this comment here, ignore everything that follows + instream.readline() + continue + + elif posix and nextchar in escape: + try: + token += stream_i.next() + except StopIteration: + raise ValueError("No escaped character") + + elif nextchar in wordchars: + token = nextchar + + elif nextchar in quotes: + chunk, reset = parse_quote(stream_i, nextchar, token) + token += chunk + if reset: + yield token + token = '' continue - elif nextchar in self.commenters: - self.instream.readline() - self.lineno = self.lineno + 1 - elif self.posix and nextchar in self.escape: - escapedstate = 'a' - self.state = nextchar - elif nextchar in self.wordchars: - self.token = nextchar - self.state = 'a' - elif nextchar in self.quotes: - if not self.posix: - self.token = nextchar - self.state = nextchar + elif self.whitespace_split: - self.token = nextchar - self.state = 'a' + token = nextchar + 
else: - self.token = nextchar - if self.token or (self.posix and quoted): - break # emit current token - else: - continue - elif self.state in self.quotes: - quoted = True - if not nextchar: # end of file - if self.debug >= 2: - print "shlex: I see EOF in quotes state" - # XXX what error should be raised here? - raise ValueError, "No closing quotation" - if nextchar == self.state: - if not self.posix: - self.token = self.token + nextchar - self.state = ' ' + # punctuation... + yield nextchar + continue + # and... we're done processing the whitespace. + break + + # non whitespace appending + for nextchar in stream_i: + got_input = True + + if nextchar in wordchars: + token += nextchar + continue + + if nextchar in whitespace: + if debug: + self.emit_debug("I see whitespace in word state", level=2) + if token or posix: + yield token + break + + elif nextchar in commenters: + instream.readline() + if posix: + if token: + yield token + break + + elif posix and nextchar in quotes: + chunk, reset = parse_quote(stream_i, nextchar, token) + token = chunk + if reset: + yield token break - else: - self.state = 'a' - elif self.posix and nextchar in self.escape and \ - self.state in self.escapedquotes: - escapedstate = self.state - self.state = nextchar + + elif posix and nextchar in escape: + try: + token += stream_i.next() + except StopIteration: + raise ValueError("no escape character") + + elif nextchar in quotes or self.whitespace_split: + token += nextchar + else: - self.token = self.token + nextchar - elif self.state in self.escape: - if not nextchar: # end of file - if self.debug >= 2: - print "shlex: I see EOF in escape state" - # XXX what error should be raised here? - raise ValueError, "No escaped character" - # In posix shells, only the quote itself or the escape - # character may be escaped within quotes. 
- if escapedstate in self.quotes and \ - nextchar != self.state and nextchar != escapedstate: - self.token = self.token + self.state - self.token = self.token + nextchar - self.state = escapedstate - elif self.state == 'a': - if not nextchar: - self.state = None # end of file + if debug: + self.emit_debug("I see punctuation in word state", level=2) + assert token + yield token + yield nextchar # now yield the punctuation... break - elif nextchar in self.whitespace: - if self.debug >= 2: - print "shlex: I see whitespace in word state" - self.state = ' ' - if self.token or (self.posix and quoted): - break # emit current token - else: - continue - elif nextchar in self.commenters: - self.instream.readline() - self.lineno = self.lineno + 1 - if self.posix: - self.state = ' ' - if self.token or (self.posix and quoted): - break # emit current token - else: - continue - elif self.posix and nextchar in self.quotes: - self.state = nextchar - elif self.posix and nextchar in self.escape: - escapedstate = 'a' - self.state = nextchar - elif nextchar in self.wordchars or nextchar in self.quotes \ - or self.whitespace_split: - self.token = self.token + nextchar - else: - self.pushback.appendleft(nextchar) - if self.debug >= 2: - print "shlex: I see punctuation in word state" - self.state = ' ' - if self.token: - break # emit current token - else: - continue - result = self.token - self.token = '' - if self.posix and not quoted and result == '': - result = None - if self.debug > 1: - if result: - print "shlex: raw token=" + repr(result) else: - print "shlex: raw token=EOF" - return result + if not token and not got_input: + token = self.eof + yield token + break + + + self.state = None def sourcehook(self, newfile): "Hook called on a filename to be sourced." 
@@ -263,13 +354,13 @@ return "\"%s\", line %d: " % (infile, lineno) def __iter__(self): - return self + get_token = self.get_token + token = get_token() + eof = self.eof + while token != eof: + yield token + token = get_token() - def next(self): - token = self.get_token() - if token == self.eof: - raise StopIteration - return token def split(s, comments=False, posix=True): lex = shlex(s, posix=posix) @@ -280,13 +371,8 @@ if __name__ == '__main__': if len(sys.argv) == 1: - lexer = shlex() + args = [sys.stdin] else: - file = sys.argv[1] - lexer = shlex(open(file), file) - while 1: - tt = lexer.get_token() - if tt: - print "Token: " + repr(tt) - else: - break + args = [open(sys.argv[1]), sys.argv[1]] + for token in shlex(*args): + print "Token %r" % (token,) diff -urN old/Lib/test/test_shlex.py new/Lib/test/test_shlex.py --- old/Lib/test/test_shlex.py 2009-12-29 00:02:19.000000000 -0800 +++ new/Lib/test/test_shlex.py 2009-12-29 00:02:04.000000000 -0800 @@ -25,6 +25,7 @@ foo \ x bar|foo|\|x|bar| foo \ bar|foo|\|bar| foo "bar" bla|foo|"bar"|bla| +"foo" "bar"|"foo"|"bar"| "foo" "bar" "bla"|"foo"|"bar"|"bla"| "foo" bar "bla"|"foo"|bar|"bla"| "foo" bar bla|"foo"|bar|bla| @@ -139,22 +140,37 @@ áéíóú|áéíóú| """ +# data that is used for shlex direct invocation, instead of split. +# split flips on whitespace_split always... 
not great for testing the +# parser, especially consider x=\ dar # assignments +posix_data_shlex_direct = r"""x=\ dar|x|=| dar| +\ dar| dar| +dar |dar| +""" + + class ShlexTest(unittest.TestCase): def setUp(self): - self.data = [x.split("|")[:-1] - for x in data.splitlines()] - self.posix_data = [x.split("|")[:-1] - for x in posix_data.splitlines()] - for item in self.data: - item[0] = item[0].replace(r"\n", "\n") - for item in self.posix_data: - item[0] = item[0].replace(r"\n", "\n") - - def splitTest(self, data, comments): + for attr, src in (("data", data), ("posix_data", posix_data), + ("posix_data_shlex_direct", posix_data_shlex_direct)): + l = [x.split("|")[:-1] for x in src.splitlines()] + for item in l: + item[0] = item[0].replace(r"\n", "\n") + setattr(self, attr, l) + + def splitTest(self, data, comments, use_split=True): + if use_split: + f = lambda val: shlex.split(val, comments=comments) + else: + def f(val): + parser = shlex.shlex(val, posix=True) + if not comments: + parser.commenters = '' + return list(parser) for i in range(len(data)): - l = shlex.split(data[i][0], comments=comments) + l = f(data[i][0]) self.assertEqual(l, data[i][1:], - "%s: %s != %s" % + "%r: %r != %r" % (data[i][0], l, data[i][1:])) def oldSplit(self, s): @@ -170,12 +186,16 @@ """Test data splitting with posix parser""" self.splitTest(self.posix_data, comments=True) + def testPosix_unsplit(self): + self.splitTest(self.posix_data_shlex_direct, comments=True, + use_split=False) + def testCompat(self): """Test compatibility interface""" for i in range(len(self.data)): l = self.oldSplit(self.data[i][0]) self.assertEqual(l, self.data[i][1:], - "%s: %s != %s" % + "%r: %r != %r" % (self.data[i][0], l, self.data[i][1:])) # Allow this test to be used with old shlex.py