import collections
import re

# A lexical token: its type name, the matched text, and its 1-based line
# number and 0-based column within that line.
Token = collections.namedtuple('Token', 'typ value line column')

# Identifiers that are reported with their own text as the token type.
_KEYWORDS = frozenset({'IF', 'THEN', 'FOR', 'NEXT', 'GOSUB', 'RETURN'})

# Order matters: alternatives are tried left to right at each position.
_TOKEN_SPEC = [
    ('NUMBER',  r'\d+(\.\d*)?'),  # Integer or decimal number
    ('ASSIGN',  r':='),           # Assignment operator
    ('END',     ';'),             # Statement terminator
    ('ID',      r'[A-Za-z]+'),    # Identifiers
    ('OP',      r'[+*\/\-]'),     # Arithmetic operators
    ('NEWLINE', r'\n'),           # Line endings
    ('SKIP',    r'[ \t]'),        # Skip over spaces and tabs
]
_TOKEN_RE = re.compile(
    '|'.join('(?P<%s>%s)' % pair for pair in _TOKEN_SPEC)
)


def tokenize(s):
    """Yield the ``Token`` objects found in the source string *s*.

    Spaces, tabs and newlines are consumed but not yielded.  An identifier
    whose text is one of the keywords (IF, THEN, FOR, NEXT, GOSUB, RETURN)
    is yielded with that keyword as its ``typ`` instead of ``'ID'``.
    ``value`` is always the matched text (numbers are not converted).

    ``line`` starts at 1; ``column`` is the 0-based offset of the token
    from the start of its line, on every line.

    Raises:
        RuntimeError: when a character that matches no token pattern is
            reached.  Because this is a generator, the error surfaces
            while iterating, after all preceding tokens were yielded.
    """
    gettok = _TOKEN_RE.match
    line = 1
    pos = line_start = 0
    mo = gettok(s)
    while mo is not None:
        typ = mo.lastgroup
        if typ == 'NEWLINE':
            # The next line begins *after* the newline character; using
            # the newline's own index here would shift every column on
            # subsequent lines by one.
            line_start = mo.end()
            line += 1
        elif typ != 'SKIP':
            val = mo.group(typ)
            if typ == 'ID' and val in _KEYWORDS:
                typ = val
            yield Token(typ, val, line, mo.start() - line_start)
        pos = mo.end()
        mo = gettok(s, pos)
    # The loop ends at the first position where nothing matches; if that is
    # not the end of the input, the character there is invalid.
    if pos != len(s):
        raise RuntimeError('Unexpected character %r on line %d' %
                           (s[pos], line))


statements = '''\
total := total + price * quantity;
tax := price * 0.05;
'''

for token in tokenize(statements):
    print(token)