diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -26,6 +26,11 @@
 - Issue #19995: %c, %o, %x, and %X now raise TypeError on non-integer input.
 
+- Issue #19655: The ASDL parser - used by the build process to generate code
+  for managing the Python AST in C - was rewritten. The new parser is
+  self-contained and no longer requires carrying along the spark.py
+  parser-generator library; spark.py was removed from the source tree.
+
 Library
 -------
 
diff --git a/Parser/asdl.py b/Parser/asdl.py
--- a/Parser/asdl.py
+++ b/Parser/asdl.py
@@ -1,255 +1,53 @@
-"""An implementation of the Zephyr Abstract Syntax Definition Language.
+#-------------------------------------------------------------------------------
+# Parser for ASDL [1] definition files. Reads in an ASDL description and parses
+# it into an AST that describes it.
+#
+# The EBNF we're parsing here: Figure 1 of the paper [1]. Extended to support
+# modules and attributes after a product. Words starting with capital letters
+# are terminals. Literal tokens are in "double quotes". Others are
+# non-terminals. Id is either TypeId or ConstructorId.
+#
+# module ::= "module" Id "{" [definitions] "}"
+# definitions ::= { TypeId "=" type }
+# type ::= product | sum
+# product ::= fields ["attributes" fields]
+# fields ::= "(" { field "," } field ")"
+# field ::= TypeId ["?" | "*"] [Id]
+# sum ::= constructor { "|" constructor } ["attributes" fields]
+# constructor ::= ConstructorId [fields]
+#
+# [1] "The Zephyr Abstract Syntax Description Language" by Wang, et al. See
+#     http://asdl.sourceforge.net/
+#-------------------------------------------------------------------------------
+from collections import namedtuple
+import re
 
-See http://asdl.sourceforge.net/ and
-http://www.cs.princeton.edu/research/techreps/TR-554-97
+__all__ = [
+    'builtin_types', 'parse', 'AST', 'Module', 'Type', 'Constructor',
+    'Field', 'Sum', 'Product', 'VisitorBase', 'Check', 'check']
 
-Only supports top level module decl, not view. I'm guessing that view
-is intended to support the browser and I'm not interested in the
-browser.
+# The following classes define nodes into which the ASDL description is parsed.
+# Note: this is a "meta-AST". ASDL files (such as Python.asdl) describe the AST
+# structure used by a programming language. But ASDL files themselves need to be
+# parsed. This module parses ASDL files and uses a simple AST to represent them.
+# See the EBNF at the top of the file to understand the logical connection
+# between the various node types.
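+#
+# For illustration, here is a tiny module that fits the EBNF above (an invented
+# example for this comment, not taken from Python.asdl):
+#
+#   module Example
+#   {
+#       stmt = Break | Continue attributes (int lineno)
+#       pair = (int first, int second)
+#   }
+#
+# Here 'stmt' is a sum of two constructors with an attributes clause, and
+# 'pair' is a product of two fields.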
-Changes for Python: Add support for module versions -""" +builtin_types = set( + ['identifier', 'string', 'bytes', 'int', 'object', 'singleton']) -import os -import sys -import traceback - -import spark - -def output(*strings): - for s in strings: - sys.stdout.write(str(s) + "\n") - - -class Token(object): - # spark seems to dispatch in the parser based on a token's - # type attribute - def __init__(self, type, lineno): - self.type = type - self.lineno = lineno - - def __str__(self): - return self.type - +class AST: def __repr__(self): - return str(self) - -class Id(Token): - def __init__(self, value, lineno): - self.type = 'Id' - self.value = value - self.lineno = lineno - - def __str__(self): - return self.value - -class String(Token): - def __init__(self, value, lineno): - self.type = 'String' - self.value = value - self.lineno = lineno - -class ASDLSyntaxError(Exception): - - def __init__(self, lineno, token=None, msg=None): - self.lineno = lineno - self.token = token - self.msg = msg - - def __str__(self): - if self.msg is None: - return "Error at '%s', line %d" % (self.token, self.lineno) - else: - return "%s, line %d" % (self.msg, self.lineno) - -class ASDLScanner(spark.GenericScanner, object): - - def tokenize(self, input): - self.rv = [] - self.lineno = 1 - super(ASDLScanner, self).tokenize(input) - return self.rv - - def t_id(self, s): - r"[\w\.]+" - # XXX doesn't distinguish upper vs. lower, which is - # significant for ASDL. - self.rv.append(Id(s, self.lineno)) - - def t_string(self, s): - r'"[^"]*"' - self.rv.append(String(s, self.lineno)) - - def t_xxx(self, s): # not sure what this production means - r"<=" - self.rv.append(Token(s, self.lineno)) - - def t_punctuation(self, s): - r"[\{\}\*\=\|\(\)\,\?\:]" - self.rv.append(Token(s, self.lineno)) - - def t_comment(self, s): - r"\-\-[^\n]*" - pass - - def t_newline(self, s): - r"\n" - self.lineno += 1 - - def t_whitespace(self, s): - r"[ \t]+" - pass - - def t_default(self, s): - r" . 
+" - raise ValueError("unmatched input: %r" % s) - -class ASDLParser(spark.GenericParser, object): - def __init__(self): - super(ASDLParser, self).__init__("module") - - def typestring(self, tok): - return tok.type - - def error(self, tok): - raise ASDLSyntaxError(tok.lineno, tok) - - def p_module_0(self, info): - " module ::= Id Id { } " - module, name, _0, _1 = info - if module.value != "module": - raise ASDLSyntaxError(module.lineno, - msg="expected 'module', found %s" % module) - return Module(name, None) - - def p_module(self, info): - " module ::= Id Id { definitions } " - module, name, _0, definitions, _1 = info - if module.value != "module": - raise ASDLSyntaxError(module.lineno, - msg="expected 'module', found %s" % module) - return Module(name, definitions) - - def p_definition_0(self, definition): - " definitions ::= definition " - return definition[0] - - def p_definition_1(self, definitions): - " definitions ::= definition definitions " - return definitions[0] + definitions[1] - - def p_definition(self, info): - " definition ::= Id = type " - id, _, type = info - return [Type(id, type)] - - def p_type_0(self, product): - " type ::= product " - return product[0] - - def p_type_1(self, sum): - " type ::= sum " - return Sum(sum[0]) - - def p_type_2(self, info): - " type ::= sum Id ( fields ) " - sum, id, _0, attributes, _1 = info - if id.value != "attributes": - raise ASDLSyntaxError(id.lineno, - msg="expected attributes, found %s" % id) - return Sum(sum, attributes) - - def p_product_0(self, info): - " product ::= ( fields ) " - _0, fields, _1 = info - return Product(fields) - - def p_product_1(self, info): - " product ::= ( fields ) Id ( fields ) " - _0, fields, _1, id, _2, attributes, _3 = info - if id.value != "attributes": - raise ASDLSyntaxError(id.lineno, - msg="expected attributes, found %s" % id) - return Product(fields, attributes) - - def p_sum_0(self, constructor): - " sum ::= constructor " - return [constructor[0]] - - def p_sum_1(self, info): - " sum ::= constructor | sum " - constructor, _, sum = info - return [constructor] + sum - - def p_sum_2(self, info): - " sum ::= constructor | sum " - constructor, _, sum = info - return [constructor] + sum - - def p_constructor_0(self, id): - " constructor ::= Id " - return Constructor(id[0]) - - def p_constructor_1(self, info): - " constructor ::= Id ( fields ) " - id, _0, fields, _1 = info - return Constructor(id, fields) - - def p_fields_0(self, field): - " fields ::= field " - return [field[0]] - - def p_fields_1(self, info): - " fields ::= fields , field " - fields, _, field = info - return fields + [field] - - def p_field_0(self, type_): - " field ::= Id " - return Field(type_[0]) - - def p_field_1(self, info): - " field ::= Id Id " - type, name = info - return Field(type, name) - - def p_field_2(self, info): - " field ::= Id * Id " - type, _, name = info - return Field(type, name, seq=True) - - def p_field_3(self, info): - " field ::= Id ? Id " - type, _, name = info - return Field(type, name, opt=True) - - def p_field_4(self, type_): - " field ::= Id * " - return Field(type_[0], seq=True) - - def p_field_5(self, type_): - " field ::= Id ? 
" - return Field(type[0], opt=True) - -builtin_types = ("identifier", "string", "bytes", "int", "object", "singleton") - -# below is a collection of classes to capture the AST of an AST :-) -# not sure if any of the methods are useful yet, but I'm adding them -# piecemeal as they seem helpful - -class AST(object): - pass # a marker class + raise NotImplementedError class Module(AST): def __init__(self, name, dfns): self.name = name self.dfns = dfns - self.types = {} # maps type name to value (from dfns) - for type in dfns: - self.types[type.name.value] = type.value + self.types = {type.name: type.value for type in dfns} def __repr__(self): - return "Module(%s, %s)" % (self.name, self.dfns) + return 'Module({0.name}, {0.dfns})'.format(self) class Type(AST): def __init__(self, name, value): @@ -257,7 +55,7 @@ self.value = value def __repr__(self): - return "Type(%s, %s)" % (self.name, self.value) + return 'Type({0.name}, {0.value})'.format(self) class Constructor(AST): def __init__(self, name, fields=None): @@ -265,7 +63,7 @@ self.fields = fields or [] def __repr__(self): - return "Constructor(%s, %s)" % (self.name, self.fields) + return 'Constructor({0.name}, {0.fields})'.format(self) class Field(AST): def __init__(self, type, name=None, seq=False, opt=False): @@ -282,9 +80,9 @@ else: extra = "" if self.name is None: - return "Field(%s%s)" % (self.type, extra) + return 'Field({0.type}{1})'.format(self, extra) else: - return "Field(%s, %s%s)" % (self.type, self.name, extra) + return 'Field({0.type}, {0.name}{1})'.format(self, extra) class Sum(AST): def __init__(self, types, attributes=None): @@ -292,10 +90,10 @@ self.attributes = attributes or [] def __repr__(self): - if self.attributes is None: - return "Sum(%s)" % self.types + if self.attributes: + return 'Sum({0.types}, {0.attributes})'.format(self) else: - return "Sum(%s, %s)" % (self.types, self.attributes) + return 'Sum({0.types})'.format(self) class Product(AST): def __init__(self, fields, attributes=None): @@ -303,49 +101,43 @@ self.attributes = attributes or [] def __repr__(self): - if self.attributes is None: - return "Product(%s)" % self.fields + if self.attributes: + return 'Product({0.fields}, {0.attributes})'.format(self) else: - return "Product(%s, %s)" % (self.fields, self.attributes) + return 'Product({0.fields})'.format(self) -class VisitorBase(object): +# A generic visitor for the meta-AST that describes ASDL. This can be used by +# emitters. Note that this visitor does not provide a generic visit method, so a +# subclass needs to define visit methods from visitModule to as deep as the +# interesting node. +# We also define a Check visitor that makes sure the parsed ASDL is well-formed. 
-    def __init__(self, skip=False):
+class VisitorBase:
+    """Generic tree visitor for ASTs."""
+    def __init__(self):
         self.cache = {}
-        self.skip = skip
 
-    def visit(self, object, *args):
-        meth = self._dispatch(object)
-        if meth is None:
-            return
-        try:
-            meth(object, *args)
-        except Exception:
-            output("Error visiting" + repr(object))
-            output(str(sys.exc_info()[1]))
-            traceback.print_exc()
-            # XXX hack
-            if hasattr(self, 'file'):
-                self.file.flush()
-            os._exit(1)
-
-    def _dispatch(self, object):
-        assert isinstance(object, AST), repr(object)
-        klass = object.__class__
+    def visit(self, obj, *args):
+        klass = obj.__class__
         meth = self.cache.get(klass)
         if meth is None:
             methname = "visit" + klass.__name__
-            if self.skip:
-                meth = getattr(self, methname, None)
-            else:
-                meth = getattr(self, methname)
+            meth = getattr(self, methname, None)
             self.cache[klass] = meth
-        return meth
+        if meth:
+            try:
+                meth(obj, *args)
+            except Exception as e:
+                print("Error visiting %r: %s" % (obj, e))
+                raise
 
 class Check(VisitorBase):
+    """A visitor that checks a parsed ASDL tree for correctness.
+
+    Errors are printed and accumulated.
+    """
     def __init__(self):
-        super(Check, self).__init__(skip=True)
+        super().__init__()
         self.cons = {}
         self.errors = 0
         self.types = {}
@@ -367,8 +159,8 @@
         if conflict is None:
             self.cons[key] = name
         else:
-            output("Redefinition of constructor %s" % key)
-            output("Defined in %s and %s" % (conflict, name))
+            print('Redefinition of constructor {}'.format(key))
+            print('Defined in {} and {}'.format(conflict, name))
             self.errors += 1
         for f in cons.fields:
             self.visit(f, key)
@@ -383,6 +175,11 @@
         self.visit(f, name)
 
 def check(mod):
+    """Check the parsed ASDL tree for correctness.
+
+    Return True on success. On failure, the errors are printed out and False
+    is returned.
+    """
     v = Check()
     v.visit(mod)
 
@@ -390,47 +187,190 @@
         if t not in mod.types and not t in builtin_types:
             v.errors += 1
             uses = ", ".join(v.types[t])
-            output("Undefined type %s, used in %s" % (t, uses))
-
+            print('Undefined type {}, used in {}'.format(t, uses))
     return not v.errors
 
-def parse(file):
-    scanner = ASDLScanner()
-    parser = ASDLParser()
+# The ASDL parser itself comes next. The only interesting external interface
+# here is the top-level parse function.
 
-    f = open(file)
-    try:
-        buf = f.read()
-    finally:
-        f.close()
-    tokens = scanner.tokenize(buf)
-    try:
-        return parser.parse(tokens)
-    except ASDLSyntaxError:
-        err = sys.exc_info()[1]
-        output(str(err))
-        lines = buf.split("\n")
-        output(lines[err.lineno - 1]) # lines starts at 0, files at 1
+def parse(filename):
+    """Parse ASDL from the given file and return a Module node describing it."""
+    with open(filename) as f:
+        parser = ASDLParser()
+        return parser.parse(f.read())
 
-if __name__ == "__main__":
-    import glob
-    import sys
+# Types for describing tokens in an ASDL specification.
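+# For example, the ASDL fragment "stmt = Break | Continue" should tokenize,
+# under the scheme below, to the stream:
+#
+#   TypeId('stmt'), Equals('='), ConstructorId('Break'), Pipe('|'),
+#   ConstructorId('Continue')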
+class TokenKind:
+    """TokenKind provides a scope for enumerated token kinds."""
+    (ConstructorId, TypeId, Equals, Comma, Question, Pipe, Asterisk,
+     LParen, RParen, LBrace, RBrace) = range(11)
 
-    if len(sys.argv) > 1:
-        files = sys.argv[1:]
-    else:
-        testdir = "tests"
-        files = glob.glob(testdir + "/*.asdl")
+    operator_table = {
+        '=': Equals, ',': Comma, '?': Question, '|': Pipe, '(': LParen,
+        ')': RParen, '*': Asterisk, '{': LBrace, '}': RBrace}
 
-    for file in files:
-        output(file)
-        mod = parse(file)
-        if not mod:
-            break
-        output("module", mod.name)
-        output(len(mod.dfns), "definitions")
-        if not check(mod):
-            output("Check failed")
+Token = namedtuple('Token', 'kind value lineno')
+
+class ASDLSyntaxError(Exception):
+    def __init__(self, msg, lineno=None):
+        self.msg = msg
+        self.lineno = lineno or ''
+
+    def __str__(self):
+        return 'Syntax error on line {0.lineno}: {0.msg}'.format(self)
+
+def tokenize_asdl(buf):
+    """Tokenize the given buffer. Yield Token objects."""
+    for lineno, line in enumerate(buf.splitlines(), 1):
+        for m in re.finditer(r'\s*(\w+|--.*|.)', line.strip()):
+            c = m.group(1)
+            if c[0].isalpha():
+                # Some kind of identifier
+                if c[0].isupper():
+                    yield Token(TokenKind.ConstructorId, c, lineno)
+                else:
+                    yield Token(TokenKind.TypeId, c, lineno)
+            elif c[:2] == '--':
+                # Comment
+                break
+            else:
+                # Operators
+                try:
+                    op_kind = TokenKind.operator_table[c]
+                except KeyError:
+                    raise ASDLSyntaxError('Invalid operator %s' % c, lineno)
+                yield Token(op_kind, c, lineno)
+
+class ASDLParser:
+    """Parser for ASDL files.
+
+    Create, then call the parse method on a buffer containing ASDL.
+    This is a simple recursive descent parser that uses tokenize_asdl for the
+    lexing.
+    """
+    def __init__(self):
+        self._tokenizer = None
+        self.cur_token = None
+
+    def parse(self, buf):
+        """Parse the ASDL in the buffer and return an AST with a Module root.
+        """
+        self._tokenizer = tokenize_asdl(buf)
+        self._advance()
+        return self._parse_module()
+
+    def _parse_module(self):
+        if self._at_keyword('module'):
+            self._advance()
         else:
-            for dfn in mod.dfns:
-                output(dfn.name, dfn.value)
+            raise ASDLSyntaxError(
+                'Expected "module" (found {})'.format(self.cur_token.value),
+                self.cur_token.lineno)
+        name = self._match(self._id_kinds)
+        self._match(TokenKind.LBrace)
+        defs = self._parse_definitions()
+        self._match(TokenKind.RBrace)
+        return Module(name, defs)
+
+    def _parse_definitions(self):
+        defs = []
+        while self.cur_token.kind == TokenKind.TypeId:
+            typename = self._advance()
+            self._match(TokenKind.Equals)
+            type = self._parse_type()
+            defs.append(Type(typename, type))
+        return defs
+
+    def _parse_type(self):
+        if self.cur_token.kind == TokenKind.LParen:
+            # If we see a (, it's a product
+            return self._parse_product()
+        else:
+            # Otherwise it's a sum. Look for ConstructorId
+            sumlist = [Constructor(self._match(TokenKind.ConstructorId),
+                                   self._parse_optional_fields())]
+            while self.cur_token.kind == TokenKind.Pipe:
+                # More constructors
+                self._advance()
+                sumlist.append(Constructor(
+                                self._match(TokenKind.ConstructorId),
+                                self._parse_optional_fields()))
+            return Sum(sumlist, self._parse_optional_attributes())
+
+    def _parse_product(self):
+        return Product(self._parse_fields(), self._parse_optional_attributes())
+
+    def _parse_fields(self):
+        fields = []
+        self._match(TokenKind.LParen)
+        while self.cur_token.kind == TokenKind.TypeId:
+            typename = self._advance()
+            is_seq, is_opt = self._parse_optional_field_quantifier()
+            id = (self._advance() if self.cur_token.kind in self._id_kinds
+                  else None)
+            fields.append(Field(typename, id, seq=is_seq, opt=is_opt))
+            if self.cur_token.kind == TokenKind.RParen:
+                break
+            elif self.cur_token.kind == TokenKind.Comma:
+                self._advance()
+        self._match(TokenKind.RParen)
+        return fields
+
+    def _parse_optional_fields(self):
+        if self.cur_token.kind == TokenKind.LParen:
+            return self._parse_fields()
+        else:
+            return None
+
+    def _parse_optional_attributes(self):
+        if self._at_keyword('attributes'):
+            self._advance()
+            return self._parse_fields()
+        else:
+            return None
+
+    def _parse_optional_field_quantifier(self):
+        is_seq, is_opt = False, False
+        if self.cur_token.kind == TokenKind.Asterisk:
+            is_seq = True
+            self._advance()
+        elif self.cur_token.kind == TokenKind.Question:
+            is_opt = True
+            self._advance()
+        return is_seq, is_opt
+
+    def _advance(self):
+        """Return the value of the current token and read the next one into
+        self.cur_token.
+        """
+        cur_val = None if self.cur_token is None else self.cur_token.value
+        try:
+            self.cur_token = next(self._tokenizer)
+        except StopIteration:
+            self.cur_token = None
+        return cur_val
+
+    _id_kinds = (TokenKind.ConstructorId, TokenKind.TypeId)
+
+    def _match(self, kind):
+        """The 'match' primitive of RD parsers.
+
+        * Verifies that the current token is of the given kind (kind can
+          be a tuple, in which case the kind must match one of its members).
+        * Returns the value of the current token
+        * Reads in the next token
+        """
+        if (isinstance(kind, tuple) and self.cur_token.kind in kind or
+            self.cur_token.kind == kind
+           ):
+            value = self.cur_token.value
+            self._advance()
+            return value
+        else:
+            raise ASDLSyntaxError(
+                'Unmatched {} (found {})'.format(kind, self.cur_token.kind),
+                self.cur_token.lineno)
+
+    def _at_keyword(self, keyword):
+        return (self.cur_token.kind == TokenKind.TypeId and
+                self.cur_token.value == keyword)
diff --git a/Parser/asdl_c.py b/Parser/asdl_c.py
--- a/Parser/asdl_c.py
+++ b/Parser/asdl_c.py
@@ -1,9 +1,6 @@
 #! /usr/bin/env python
 """Generate C code from an ASDL description."""
 
-# TO DO
-# handle fields that have a type but no name
-
 import os, sys
 
 import asdl
@@ -14,12 +11,8 @@
 def get_c_type(name):
     """Return a string for the C name of the type.
 
-    This function special cases the default types provided by asdl:
-    identifier, string, int.
+    This function special-cases the default types provided by asdl.
     """
-    # XXX ack!
need to figure out where Id is useful and where string - if isinstance(name, asdl.Id): - name = name.value if name in asdl.builtin_types: return name else: @@ -144,7 +137,7 @@ class StructVisitor(EmitVisitor): - """Visitor to generate typdefs for AST.""" + """Visitor to generate typedefs for AST.""" def visitModule(self, mod): for dfn in mod.dfns: @@ -188,9 +181,6 @@ self.visit(f, depth + 1) self.emit("} %s;" % cons.name, depth) self.emit("", depth) - else: - # XXX not sure what I want here, nothing is probably fine - pass def visitField(self, field, depth): # XXX need to lookup field.type, because it might be something @@ -198,7 +188,7 @@ ctype = get_c_type(field.type) name = field.name if field.seq: - if field.type.value in ('cmpop',): + if field.type == 'cmpop': self.emit("asdl_int_seq *%(name)s;" % locals(), depth) else: self.emit("asdl_seq *%(name)s;" % locals(), depth) @@ -253,7 +243,7 @@ name = f.name # XXX should extend get_c_type() to handle this if f.seq: - if f.type.value in ('cmpop',): + if f.type == 'cmpop': ctype = "asdl_int_seq *" else: ctype = "asdl_seq *" @@ -437,7 +427,7 @@ self.emit("", 0) for f in t.fields: self.visitField(f, t.name, sum=sum, depth=2) - args = [f.name.value for f in t.fields] + [a.name.value for a in sum.attributes] + args = [f.name for f in t.fields] + [a.name for a in sum.attributes] self.emit("*out = %s(%s);" % (t.name, self.buildArgs(args)), 2) self.emit("if (*out == NULL) goto failed;", 2) self.emit("return 0;", 2) @@ -465,7 +455,7 @@ self.emit("", 0) for f in prod.fields: self.visitField(f, name, prod=prod, depth=1) - args = [f.name.value for f in prod.fields] + args = [f.name for f in prod.fields] self.emit("*out = %s(%s);" % (name, self.buildArgs(args)), 1) self.emit("return 0;", 1) self.emit("failed:", 0) @@ -487,8 +477,8 @@ def isSimpleSum(self, field): # XXX can the members of this list be determined automatically? - return field.type.value in ('expr_context', 'boolop', 'operator', - 'unaryop', 'cmpop') + return field.type in ('expr_context', 'boolop', 'operator', + 'unaryop', 'cmpop') def isNumeric(self, field): return get_c_type(field.type) in ("int", "bool") @@ -960,7 +950,7 @@ def visitProduct(self, prod, name): if prod.fields: - fields = name.value+"_fields" + fields = name+"_fields" else: fields = "NULL" self.emit('%s_type = make_type("%s", &AST_type, %s, %d);' % @@ -987,7 +977,7 @@ def visitConstructor(self, cons, name, simple): if cons.fields: - fields = cons.name.value+"_fields" + fields = cons.name+"_fields" else: fields = "NULL" self.emit('%s_type = make_type("%s", %s_type, %s, %d);' % @@ -1170,7 +1160,7 @@ def set(self, field, value, depth): if field.seq: # XXX should really check for is_simple, but that requires a symbol table - if field.type.value == "cmpop": + if field.type == "cmpop": # While the sequence elements are stored as void*, # ast2obj_cmpop expects an enum self.emit("{", depth) @@ -1249,12 +1239,15 @@ common_msg = "/* File automatically generated by %s. 
*/\n\n" -def main(srcfile): +def main(srcfile, dump_module=False): argv0 = sys.argv[0] components = argv0.split(os.sep) argv0 = os.sep.join(components[-2:]) auto_gen_msg = common_msg % argv0 mod = asdl.parse(srcfile) + if dump_module: + print('Parsed Module:') + print(mod) if not asdl.check(mod): sys.exit(1) if INC_DIR: @@ -1301,16 +1294,19 @@ INC_DIR = '' SRC_DIR = '' - opts, args = getopt.getopt(sys.argv[1:], "h:c:") - if len(opts) != 1: - sys.stdout.write("Must specify exactly one output file\n") - sys.exit(1) + dump_module = False + opts, args = getopt.getopt(sys.argv[1:], "dh:c:") for o, v in opts: if o == '-h': INC_DIR = v if o == '-c': SRC_DIR = v - if len(args) != 1: - sys.stdout.write("Must specify single input file\n") + if o == '-d': + dump_module = True + if INC_DIR and SRC_DIR: + print('Must specify exactly one output file') sys.exit(1) - main(args[0]) + elif len(args) != 1: + print('Must specify single input file') + sys.exit(1) + main(args[0], dump_module) diff --git a/Parser/spark.py b/Parser/spark.py deleted file mode 100644 --- a/Parser/spark.py +++ /dev/null @@ -1,849 +0,0 @@ -# Copyright (c) 1998-2002 John Aycock -# -# Permission is hereby granted, free of charge, to any person obtaining -# a copy of this software and associated documentation files (the -# "Software"), to deal in the Software without restriction, including -# without limitation the rights to use, copy, modify, merge, publish, -# distribute, sublicense, and/or sell copies of the Software, and to -# permit persons to whom the Software is furnished to do so, subject to -# the following conditions: -# -# The above copyright notice and this permission notice shall be -# included in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -__version__ = 'SPARK-0.7 (pre-alpha-5)' - -import re - -# Compatibility with older pythons. 
-def output(string='', end='\n'): - sys.stdout.write(string + end) - -try: - sorted -except NameError: - def sorted(seq): - seq2 = seq[:] - seq2.sort() - return seq2 - -def _namelist(instance): - namelist, namedict, classlist = [], {}, [instance.__class__] - for c in classlist: - for b in c.__bases__: - classlist.append(b) - for name in c.__dict__.keys(): - if name not in namedict: - namelist.append(name) - namedict[name] = 1 - return namelist - -class GenericScanner: - def __init__(self, flags=0): - pattern = self.reflect() - self.re = re.compile(pattern, re.VERBOSE|flags) - - self.index2func = {} - for name, number in self.re.groupindex.items(): - self.index2func[number-1] = getattr(self, 't_' + name) - - def makeRE(self, name): - doc = getattr(self, name).__doc__ - rv = '(?P<%s>%s)' % (name[2:], doc) - return rv - - def reflect(self): - rv = [] - for name in _namelist(self): - if name[:2] == 't_' and name != 't_default': - rv.append(self.makeRE(name)) - - rv.append(self.makeRE('t_default')) - return '|'.join(rv) - - def error(self, s, pos): - output("Lexical error at position %s" % pos) - raise SystemExit - - def tokenize(self, s): - pos = 0 - n = len(s) - while pos < n: - m = self.re.match(s, pos) - if m is None: - self.error(s, pos) - - groups = m.groups() - for i in range(len(groups)): - if groups[i] and i in self.index2func: - self.index2func[i](groups[i]) - pos = m.end() - - def t_default(self, s): - r'( . | \n )+' - output("Specification error: unmatched input") - raise SystemExit - -# -# Extracted from GenericParser and made global so that [un]picking works. -# -class _State: - def __init__(self, stateno, items): - self.T, self.complete, self.items = [], [], items - self.stateno = stateno - -class GenericParser: - # - # An Earley parser, as per J. Earley, "An Efficient Context-Free - # Parsing Algorithm", CACM 13(2), pp. 94-102. Also J. C. Earley, - # "An Efficient Context-Free Parsing Algorithm", Ph.D. thesis, - # Carnegie-Mellon University, August 1968. New formulation of - # the parser according to J. Aycock, "Practical Earley Parsing - # and the SPARK Toolkit", Ph.D. thesis, University of Victoria, - # 2001, and J. Aycock and R. N. Horspool, "Practical Earley - # Parsing", unpublished paper, 2001. - # - - def __init__(self, start): - self.rules = {} - self.rule2func = {} - self.rule2name = {} - self.collectRules() - self.augment(start) - self.ruleschanged = 1 - - _NULLABLE = '\e_' - _START = 'START' - _BOF = '|-' - - # - # When pickling, take the time to generate the full state machine; - # some information is then extraneous, too. Unfortunately we - # can't save the rule2func map. - # - def __getstate__(self): - if self.ruleschanged: - # - # XXX - duplicated from parse() - # - self.computeNull() - self.newrules = {} - self.new2old = {} - self.makeNewRules() - self.ruleschanged = 0 - self.edges, self.cores = {}, {} - self.states = { 0: self.makeState0() } - self.makeState(0, self._BOF) - # - # XXX - should find a better way to do this.. - # - changes = 1 - while changes: - changes = 0 - for k, v in self.edges.items(): - if v is None: - state, sym = k - if state in self.states: - self.goto(state, sym) - changes = 1 - rv = self.__dict__.copy() - for s in self.states.values(): - del s.items - del rv['rule2func'] - del rv['nullable'] - del rv['cores'] - return rv - - def __setstate__(self, D): - self.rules = {} - self.rule2func = {} - self.rule2name = {} - self.collectRules() - start = D['rules'][self._START][0][1][1] # Blech. 
- self.augment(start) - D['rule2func'] = self.rule2func - D['makeSet'] = self.makeSet_fast - self.__dict__ = D - - # - # A hook for GenericASTBuilder and GenericASTMatcher. Mess - # thee not with this; nor shall thee toucheth the _preprocess - # argument to addRule. - # - def preprocess(self, rule, func): return rule, func - - def addRule(self, doc, func, _preprocess=1): - fn = func - rules = doc.split() - - index = [] - for i in range(len(rules)): - if rules[i] == '::=': - index.append(i-1) - index.append(len(rules)) - - for i in range(len(index)-1): - lhs = rules[index[i]] - rhs = rules[index[i]+2:index[i+1]] - rule = (lhs, tuple(rhs)) - - if _preprocess: - rule, fn = self.preprocess(rule, func) - - if lhs in self.rules: - self.rules[lhs].append(rule) - else: - self.rules[lhs] = [ rule ] - self.rule2func[rule] = fn - self.rule2name[rule] = func.__name__[2:] - self.ruleschanged = 1 - - def collectRules(self): - for name in _namelist(self): - if name[:2] == 'p_': - func = getattr(self, name) - doc = func.__doc__ - self.addRule(doc, func) - - def augment(self, start): - rule = '%s ::= %s %s' % (self._START, self._BOF, start) - self.addRule(rule, lambda args: args[1], 0) - - def computeNull(self): - self.nullable = {} - tbd = [] - - for rulelist in self.rules.values(): - lhs = rulelist[0][0] - self.nullable[lhs] = 0 - for rule in rulelist: - rhs = rule[1] - if len(rhs) == 0: - self.nullable[lhs] = 1 - continue - # - # We only need to consider rules which - # consist entirely of nonterminal symbols. - # This should be a savings on typical - # grammars. - # - for sym in rhs: - if sym not in self.rules: - break - else: - tbd.append(rule) - changes = 1 - while changes: - changes = 0 - for lhs, rhs in tbd: - if self.nullable[lhs]: - continue - for sym in rhs: - if not self.nullable[sym]: - break - else: - self.nullable[lhs] = 1 - changes = 1 - - def makeState0(self): - s0 = _State(0, []) - for rule in self.newrules[self._START]: - s0.items.append((rule, 0)) - return s0 - - def finalState(self, tokens): - # - # Yuck. 
- # - if len(self.newrules[self._START]) == 2 and len(tokens) == 0: - return 1 - start = self.rules[self._START][0][1][1] - return self.goto(1, start) - - def makeNewRules(self): - worklist = [] - for rulelist in self.rules.values(): - for rule in rulelist: - worklist.append((rule, 0, 1, rule)) - - for rule, i, candidate, oldrule in worklist: - lhs, rhs = rule - n = len(rhs) - while i < n: - sym = rhs[i] - if sym not in self.rules or \ - not self.nullable[sym]: - candidate = 0 - i = i + 1 - continue - - newrhs = list(rhs) - newrhs[i] = self._NULLABLE+sym - newrule = (lhs, tuple(newrhs)) - worklist.append((newrule, i+1, - candidate, oldrule)) - candidate = 0 - i = i + 1 - else: - if candidate: - lhs = self._NULLABLE+lhs - rule = (lhs, rhs) - if lhs in self.newrules: - self.newrules[lhs].append(rule) - else: - self.newrules[lhs] = [ rule ] - self.new2old[rule] = oldrule - - def typestring(self, token): - return None - - def error(self, token): - output("Syntax error at or near `%s' token" % token) - raise SystemExit - - def parse(self, tokens): - sets = [ [(1,0), (2,0)] ] - self.links = {} - - if self.ruleschanged: - self.computeNull() - self.newrules = {} - self.new2old = {} - self.makeNewRules() - self.ruleschanged = 0 - self.edges, self.cores = {}, {} - self.states = { 0: self.makeState0() } - self.makeState(0, self._BOF) - - for i in range(len(tokens)): - sets.append([]) - - if sets[i] == []: - break - self.makeSet(tokens[i], sets, i) - else: - sets.append([]) - self.makeSet(None, sets, len(tokens)) - - #_dump(tokens, sets, self.states) - - finalitem = (self.finalState(tokens), 0) - if finalitem not in sets[-2]: - if len(tokens) > 0: - self.error(tokens[i-1]) - else: - self.error(None) - - return self.buildTree(self._START, finalitem, - tokens, len(sets)-2) - - def isnullable(self, sym): - # - # For symbols in G_e only. If we weren't supporting 1.5, - # could just use sym.startswith(). - # - return self._NULLABLE == sym[0:len(self._NULLABLE)] - - def skip(self, hs, pos=0): - n = len(hs[1]) - while pos < n: - if not self.isnullable(hs[1][pos]): - break - pos = pos + 1 - return pos - - def makeState(self, state, sym): - assert sym is not None - # - # Compute \epsilon-kernel state's core and see if - # it exists already. - # - kitems = [] - for rule, pos in self.states[state].items: - lhs, rhs = rule - if rhs[pos:pos+1] == (sym,): - kitems.append((rule, self.skip(rule, pos+1))) - core = kitems - - core.sort() - tcore = tuple(core) - if tcore in self.cores: - return self.cores[tcore] - # - # Nope, doesn't exist. Compute it and the associated - # \epsilon-nonkernel state together; we'll need it right away. - # - k = self.cores[tcore] = len(self.states) - K, NK = _State(k, kitems), _State(k+1, []) - self.states[k] = K - predicted = {} - - edges = self.edges - rules = self.newrules - for X in K, NK: - worklist = X.items - for item in worklist: - rule, pos = item - lhs, rhs = rule - if pos == len(rhs): - X.complete.append(rule) - continue - - nextSym = rhs[pos] - key = (X.stateno, nextSym) - if nextSym not in rules: - if key not in edges: - edges[key] = None - X.T.append(nextSym) - else: - edges[key] = None - if nextSym not in predicted: - predicted[nextSym] = 1 - for prule in rules[nextSym]: - ppos = self.skip(prule) - new = (prule, ppos) - NK.items.append(new) - # - # Problem: we know K needs generating, but we - # don't yet know about NK. Can't commit anything - # regarding NK to self.edges until we're sure. Should - # we delay committing on both K and NK to avoid this - # hacky code? 
This creates other problems.. - # - if X is K: - edges = {} - - if NK.items == []: - return k - - # - # Check for \epsilon-nonkernel's core. Unfortunately we - # need to know the entire set of predicted nonterminals - # to do this without accidentally duplicating states. - # - core = sorted(predicted.keys()) - tcore = tuple(core) - if tcore in self.cores: - self.edges[(k, None)] = self.cores[tcore] - return k - - nk = self.cores[tcore] = self.edges[(k, None)] = NK.stateno - self.edges.update(edges) - self.states[nk] = NK - return k - - def goto(self, state, sym): - key = (state, sym) - if key not in self.edges: - # - # No transitions from state on sym. - # - return None - - rv = self.edges[key] - if rv is None: - # - # Target state isn't generated yet. Remedy this. - # - rv = self.makeState(state, sym) - self.edges[key] = rv - return rv - - def gotoT(self, state, t): - return [self.goto(state, t)] - - def gotoST(self, state, st): - rv = [] - for t in self.states[state].T: - if st == t: - rv.append(self.goto(state, t)) - return rv - - def add(self, set, item, i=None, predecessor=None, causal=None): - if predecessor is None: - if item not in set: - set.append(item) - else: - key = (item, i) - if item not in set: - self.links[key] = [] - set.append(item) - self.links[key].append((predecessor, causal)) - - def makeSet(self, token, sets, i): - cur, next = sets[i], sets[i+1] - - ttype = token is not None and self.typestring(token) or None - if ttype is not None: - fn, arg = self.gotoT, ttype - else: - fn, arg = self.gotoST, token - - for item in cur: - ptr = (item, i) - state, parent = item - add = fn(state, arg) - for k in add: - if k is not None: - self.add(next, (k, parent), i+1, ptr) - nk = self.goto(k, None) - if nk is not None: - self.add(next, (nk, i+1)) - - if parent == i: - continue - - for rule in self.states[state].complete: - lhs, rhs = rule - for pitem in sets[parent]: - pstate, pparent = pitem - k = self.goto(pstate, lhs) - if k is not None: - why = (item, i, rule) - pptr = (pitem, parent) - self.add(cur, (k, pparent), - i, pptr, why) - nk = self.goto(k, None) - if nk is not None: - self.add(cur, (nk, i)) - - def makeSet_fast(self, token, sets, i): - # - # Call *only* when the entire state machine has been built! - # It relies on self.edges being filled in completely, and - # then duplicates and inlines code to boost speed at the - # cost of extreme ugliness. 
- # - cur, next = sets[i], sets[i+1] - ttype = token is not None and self.typestring(token) or None - - for item in cur: - ptr = (item, i) - state, parent = item - if ttype is not None: - k = self.edges.get((state, ttype), None) - if k is not None: - #self.add(next, (k, parent), i+1, ptr) - #INLINED --v - new = (k, parent) - key = (new, i+1) - if new not in next: - self.links[key] = [] - next.append(new) - self.links[key].append((ptr, None)) - #INLINED --^ - #nk = self.goto(k, None) - nk = self.edges.get((k, None), None) - if nk is not None: - #self.add(next, (nk, i+1)) - #INLINED --v - new = (nk, i+1) - if new not in next: - next.append(new) - #INLINED --^ - else: - add = self.gotoST(state, token) - for k in add: - if k is not None: - self.add(next, (k, parent), i+1, ptr) - #nk = self.goto(k, None) - nk = self.edges.get((k, None), None) - if nk is not None: - self.add(next, (nk, i+1)) - - if parent == i: - continue - - for rule in self.states[state].complete: - lhs, rhs = rule - for pitem in sets[parent]: - pstate, pparent = pitem - #k = self.goto(pstate, lhs) - k = self.edges.get((pstate, lhs), None) - if k is not None: - why = (item, i, rule) - pptr = (pitem, parent) - #self.add(cur, (k, pparent), - # i, pptr, why) - #INLINED --v - new = (k, pparent) - key = (new, i) - if new not in cur: - self.links[key] = [] - cur.append(new) - self.links[key].append((pptr, why)) - #INLINED --^ - #nk = self.goto(k, None) - nk = self.edges.get((k, None), None) - if nk is not None: - #self.add(cur, (nk, i)) - #INLINED --v - new = (nk, i) - if new not in cur: - cur.append(new) - #INLINED --^ - - def predecessor(self, key, causal): - for p, c in self.links[key]: - if c == causal: - return p - assert 0 - - def causal(self, key): - links = self.links[key] - if len(links) == 1: - return links[0][1] - choices = [] - rule2cause = {} - for p, c in links: - rule = c[2] - choices.append(rule) - rule2cause[rule] = c - return rule2cause[self.ambiguity(choices)] - - def deriveEpsilon(self, nt): - if len(self.newrules[nt]) > 1: - rule = self.ambiguity(self.newrules[nt]) - else: - rule = self.newrules[nt][0] - #output(rule) - - rhs = rule[1] - attr = [None] * len(rhs) - - for i in range(len(rhs)-1, -1, -1): - attr[i] = self.deriveEpsilon(rhs[i]) - return self.rule2func[self.new2old[rule]](attr) - - def buildTree(self, nt, item, tokens, k): - state, parent = item - - choices = [] - for rule in self.states[state].complete: - if rule[0] == nt: - choices.append(rule) - rule = choices[0] - if len(choices) > 1: - rule = self.ambiguity(choices) - #output(rule) - - rhs = rule[1] - attr = [None] * len(rhs) - - for i in range(len(rhs)-1, -1, -1): - sym = rhs[i] - if sym not in self.newrules: - if sym != self._BOF: - attr[i] = tokens[k-1] - key = (item, k) - item, k = self.predecessor(key, None) - #elif self.isnullable(sym): - elif self._NULLABLE == sym[0:len(self._NULLABLE)]: - attr[i] = self.deriveEpsilon(sym) - else: - key = (item, k) - why = self.causal(key) - attr[i] = self.buildTree(sym, why[0], - tokens, why[1]) - item, k = self.predecessor(key, why) - return self.rule2func[self.new2old[rule]](attr) - - def ambiguity(self, rules): - # - # XXX - problem here and in collectRules() if the same rule - # appears in >1 method. Also undefined results if rules - # causing the ambiguity appear in the same method. 
- # - sortlist = [] - name2index = {} - for i in range(len(rules)): - lhs, rhs = rule = rules[i] - name = self.rule2name[self.new2old[rule]] - sortlist.append((len(rhs), name)) - name2index[name] = i - sortlist.sort() - list = [b for a, b in sortlist] - return rules[name2index[self.resolve(list)]] - - def resolve(self, list): - # - # Resolve ambiguity in favor of the shortest RHS. - # Since we walk the tree from the top down, this - # should effectively resolve in favor of a "shift". - # - return list[0] - -# -# GenericASTBuilder automagically constructs a concrete/abstract syntax tree -# for a given input. The extra argument is a class (not an instance!) -# which supports the "__setslice__" and "__len__" methods. -# -# XXX - silently overrides any user code in methods. -# - -class GenericASTBuilder(GenericParser): - def __init__(self, AST, start): - GenericParser.__init__(self, start) - self.AST = AST - - def preprocess(self, rule, func): - rebind = lambda lhs, self=self: \ - lambda args, lhs=lhs, self=self: \ - self.buildASTNode(args, lhs) - lhs, rhs = rule - return rule, rebind(lhs) - - def buildASTNode(self, args, lhs): - children = [] - for arg in args: - if isinstance(arg, self.AST): - children.append(arg) - else: - children.append(self.terminal(arg)) - return self.nonterminal(lhs, children) - - def terminal(self, token): return token - - def nonterminal(self, type, args): - rv = self.AST(type) - rv[:len(args)] = args - return rv - -# -# GenericASTTraversal is a Visitor pattern according to Design Patterns. For -# each node it attempts to invoke the method n_, falling -# back onto the default() method if the n_* can't be found. The preorder -# traversal also looks for an exit hook named n__exit (no default -# routine is called if it's not found). To prematurely halt traversal -# of a subtree, call the prune() method -- this only makes sense for a -# preorder traversal. Node type is determined via the typestring() method. -# - -class GenericASTTraversalPruningException: - pass - -class GenericASTTraversal: - def __init__(self, ast): - self.ast = ast - - def typestring(self, node): - return node.type - - def prune(self): - raise GenericASTTraversalPruningException - - def preorder(self, node=None): - if node is None: - node = self.ast - - try: - name = 'n_' + self.typestring(node) - if hasattr(self, name): - func = getattr(self, name) - func(node) - else: - self.default(node) - except GenericASTTraversalPruningException: - return - - for kid in node: - self.preorder(kid) - - name = name + '_exit' - if hasattr(self, name): - func = getattr(self, name) - func(node) - - def postorder(self, node=None): - if node is None: - node = self.ast - - for kid in node: - self.postorder(kid) - - name = 'n_' + self.typestring(node) - if hasattr(self, name): - func = getattr(self, name) - func(node) - else: - self.default(node) - - - def default(self, node): - pass - -# -# GenericASTMatcher. AST nodes must have "__getitem__" and "__cmp__" -# implemented. -# -# XXX - makes assumptions about how GenericParser walks the parse tree. 
-# - -class GenericASTMatcher(GenericParser): - def __init__(self, start, ast): - GenericParser.__init__(self, start) - self.ast = ast - - def preprocess(self, rule, func): - rebind = lambda func, self=self: \ - lambda args, func=func, self=self: \ - self.foundMatch(args, func) - lhs, rhs = rule - rhslist = list(rhs) - rhslist.reverse() - - return (lhs, tuple(rhslist)), rebind(func) - - def foundMatch(self, args, func): - func(args[-1]) - return args[-1] - - def match_r(self, node): - self.input.insert(0, node) - children = 0 - - for child in node: - if children == 0: - self.input.insert(0, '(') - children = children + 1 - self.match_r(child) - - if children > 0: - self.input.insert(0, ')') - - def match(self, ast=None): - if ast is None: - ast = self.ast - self.input = [] - - self.match_r(ast) - self.parse(self.input) - - def resolve(self, list): - # - # Resolve ambiguity in favor of the longest RHS. - # - return list[-1] - -def _dump(tokens, sets, states): - for i in range(len(sets)): - output('set %d' % i) - for item in sets[i]: - output('\t', item) - for (lhs, rhs), pos in states[item[0]].items: - output('\t\t', lhs, '::=', end='') - output(' '.join(rhs[:pos]), end='') - output('.', end='') - output(' '.join(rhs[pos:])) - if i < len(tokens): - output() - output('token %s' % str(tokens[i])) - output()