# Patch: derive hotwire/shlex.py from the Python 2.5 standard-library shlex.py,
# adding Unicode-aware tokenizing.  Everything above the "---" header is
# commentary and carries no hunk data.
#
# NOTE: this copy of the patch has had its line breaks and indentation
# collapsed.  Restore the one-line-per-diff-line layout (and the original
# leading whitespace of every context line) before feeding it to patch(1).
#
# What the hunks do:
#   * Source-encoding cookie changes from iso-8859-1 to utf-8, and a credit
#     line is added to the header comments.
#   * `import unicodedata` is added.
#   * shlex.__init__ gains a keyword argument `utf` (default True), stored
#     as self.utf:
#       - posix and not utf: wordchars is extended with accented Latin
#         letters.
#       - posix and utf: wordchars is reset to '_'; letters and digits are
#         recognised by Unicode category instead.
#       - not utf: whitespace is ' \t\r\n'.
#       - utf: whitespace is '\t\r\n'; space-like characters are recognised
#         by Unicode category instead.
#   * Two private helpers are added:
#       - __is_whitespace(c, category): true when c is in self.whitespace,
#         or utf is set and the category starts with 'Z'.
#       - __is_wordchar(c, category): true when c is in self.wordchars, or
#         utf is set and the category starts with 'L' or 'N'.
#   * read_token() sets nextcategory = unicodedata.category(nextchar) for
#     every non-empty character when utf is set (None otherwise).  It calls
#     the helpers in place of the `nextchar in self.whitespace` and
#     `nextchar in self.wordchars` tests in the ' ' and 'a' states.
#   * The `elif` after the end-of-file `break` becomes a plain `if`.  This
#     is equivalent because `break` leaves the loop first, and it keeps the
#     helpers from ever seeing category None while utf is set.
#
# Fix applied in this revision of the patch:
#   The utf branch used to set self.whitespace = ''.  unicodedata.category()
#   reports TAB, CR and LF as 'Cc' (control), not as a 'Z*' separator, so
#   with utf=True those characters no longer split tokens.  The utf branch
#   now lists them explicitly.  Only a literal on an existing '+' line
#   changed, so the @@ line counts are untouched.
#
# NOTE(review): with utf=True, read_token() passes each character to
#   unicodedata.category(), which in Python 2 accepts only unicode objects.
#   Presumably callers must supply a stream that yields unicode characters;
#   confirm, since utf defaults to True.
# NOTE(review): the two removed `self.wordchars += (...)` lines show empty
#   string literals in this copy.  Presumably the original 8-bit characters
#   were lost in transit; verify against the pristine shlex.py, or the
#   @@ -34,13 hunk will not match.
# NOTE(review): the `break` line in the @@ -199,8 hunk is removed and
#   re-added with no visible difference; presumably a whitespace-only edit.
#
--- /usr/lib64/python2.5/shlex.py 2007-10-30 13:45:31.000000000 -0400 +++ hotwire/shlex.py 2007-12-22 13:05:52.000000000 -0500 @@ -1,4 +1,4 @@ -# -*- coding: iso-8859-1 -*- +# -*- coding: utf-8 -*- """A lexical analyzer class for simple shell-like syntaxes.""" # Module and documentation by Eric S. Raymond, 21 Dec 1998 @@ -6,9 +6,11 @@ # push_source() and pop_source() made explicit by ESR, January 2001. # Posix compliance, split(), string arguments, and # iterator interface by Gustavo Niemeyer, April 2003. +# Modified to support Unicode by Colin Walters, Dec 2007 import os.path import sys +import unicodedata from collections import deque try: @@ -20,7 +22,7 @@ class shlex: "A lexical analyzer class for simple shell-like syntaxes." - def __init__(self, instream=None, infile=None, posix=False): + def __init__(self, instream=None, infile=None, posix=False, utf=True): if isinstance(instream, basestring): instream = StringIO(instream) if instream is not None: @@ -34,13 +36,21 @@ self.eof = None else: self.eof = '' + self.utf = utf self.commenters = '#' self.wordchars = ('abcdfeghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_') - if self.posix: - self.wordchars += ('' - '') - self.whitespace = ' \t\r\n' + if self.posix and not self.utf: + self.wordchars += ('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ' + 'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ') + elif self.posix: + # We dynamically determine character classes below, except + # by default _ is a word character + self.wordchars = '_' + if not self.utf: + self.whitespace = ' \t\r\n' + else: + self.whitespace = '\t\r\n' self.whitespace_split = False self.quotes = '\'"' self.escape = '\\' @@ -116,12 +126,22 @@ else: print "shlex: token=EOF" return raw + + def __is_whitespace(self, c, category): + return c in self.whitespace or (self.utf and category[0] == 'Z') + + def __is_wordchar(self, c, category): + return c in self.wordchars or (self.utf and category[0] in ('L', 'N')) def read_token(self): quoted = False escapedstate = ' ' while 
True: nextchar = self.instream.read(1) + if nextchar and self.utf: + nextcategory = unicodedata.category(nextchar) + else: + nextcategory = None if nextchar == '\n': self.lineno = self.lineno + 1 if self.debug >= 3: @@ -134,7 +154,7 @@ if not nextchar: self.state = None # end of file break - elif nextchar in self.whitespace: + if self.__is_whitespace(nextchar, nextcategory): if self.debug >= 2: print "shlex: I see whitespace in whitespace state" if self.token or (self.posix and quoted): @@ -147,7 +167,7 @@ elif self.posix and nextchar in self.escape: escapedstate = 'a' self.state = nextchar - elif nextchar in self.wordchars: + elif self.__is_wordchar(nextchar, nextcategory): self.token = nextchar self.state = 'a' elif nextchar in self.quotes: @@ -199,8 +219,8 @@ elif self.state == 'a': if not nextchar: self.state = None # end of file - break - elif nextchar in self.whitespace: + break + if self.__is_whitespace(nextchar, nextcategory): if self.debug >= 2: print "shlex: I see whitespace in word state" self.state = ' ' @@ -222,7 +242,7 @@ elif self.posix and nextchar in self.escape: escapedstate = 'a' self.state = nextchar - elif nextchar in self.wordchars or nextchar in self.quotes \ + elif self.__is_wordchar(nextchar, nextcategory) or nextchar in self.quotes \ or self.whitespace_split: self.token = self.token + nextchar else: