# simplepcp.py - a simple demo precedence climbing parser
# =======================================================

# Licence (2-clause BSD License) (see minimalpcp).
# Copyright 2017 JoeCreu

# Python versions 3.* should work.
# Python versions 2.7.* will probably work after slight modifications.

# JoeCreu, 2017-06-15; last revised 2018-06-05.

# See the description of the minimalpcp parser in minimalpcp.pdf;
# minimalpcp is similar, but even simpler. In particular, binding
# powers for unary operators are used here as described there.
# Parsing of operator expressions is controlled by LBP and RBP
# (left and right binding powers).

# Additional features of simplepcp (compared to minimalpcp)
# ---------------------------------------------------------
# 1. Parentheses '( ... )' override precedence in the usual way.
# 2. Function invocation as in f(x), or f(x,y); the arguments can be
#    expressions. The syntax f() is used to invoke f without arguments.
# 3. The same operator symbol can be used for prefix and infix, or for
#    prefix and postfix, depending on the position. So it is possible
#    to use '+' and '-' as prefix and infix, or '++' and '--' as prefix
#    and postfix. The tokenizer replaces the operator symbol by another
#    symbol if it is in prefix operator position. It is not allowed to
#    denote an infix and a postfix operator by the same symbol.
# 4. Control structures of the form
#    'begin_keyword expr1, ..., exprn end_keyword'.
#    Control structures can have a pre-operand or a post-operand, or
#    both (examples: the 'ternary' operator '... ? ... : ...', or
#    'do ... while ...'; the keywords here are '?' ':' 'do' 'while').
#    Index operators also fit in here: in 'a[i]', the brackets are the
#    keywords, and 'a' is a pre-operand.
# 5. A real tokenizer (module simplerawtokenizer and class Tokenizer).
# 6. For pre- and postfix operators, only the "real" binding power has
#    to be specified. Examples: "not":("pre",20), or "!":("post",80).
#    The "very high" binding power of unary operators towards their
#    dummies is computed when the tokenizer is initialized. It is set
#    to 100 if all other binding powers are less than 100.
# 7. "Flat" infix operators (e.g., +, *, and, ...) are supported: this
#    means that '3 + 4 + 5' can be made to parse to (+ 3 4 5) instead
#    of (+ (+ 3 4) 5). Flat operators must be left associative infix
#    operators (i.e., their RBP must not be less than their LBP).

# Tokens
# ------
# Dollar sign ($) and underscore (_) are considered alphabetic. Identi-
# fiers can consist, as usual, of alphabetic characters and digits;
# they must start with an alphabetic character. Moreover, user created
# identifiers must not start with a dollar sign unless otherwise stated.
# The following six characters have a fixed meaning:  , ( ) # " \
# Comma and parentheses have the usual meaning. Not implemented here,
# but meant to be used in future extensions, are '#' (comment sign),
# '"' (for string literals) and '\' (escape character in string
# literals). The remaining 24 printable special characters in the
# ASCII range are
#     ! % & / = ? { [ ] } ' ` @ + * ~ | < > ; . : - ^
# These characters have no fixed meaning to the parser. They can be
# used to form operator or keyword tokens, alone or together with other
# special characters.
# Operators and keywords may also be formed like identifiers (letters,
# digits, dollar, underscore, starting with a letter or underscore).
# They must not consist of a mix of special and alphanumeric characters.
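
# For example (an illustrative sketch; synA is the example syntax
# defined at the end of this module):
#     parse("3+4+5",synA)   parses to "(+ 3 4 5)"      ('+' is flat),
#     parse("a-b-c",synA)   parses to "(- (- a b) c)"  ('-' is not flat).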

# Token recognition in the input string
# -------------------------------------
# Token recognition is performed by the "raw tokenizer" module.
# Whitespace always separates tokens. Two consecutive tokens that are
# formed like identifiers must be separated by at least one whitespace.
# A special character always ends an identifier-like token. A digit or
# an alphanumeric character always ends a token consisting of special
# characters. Comma and parentheses (',', '(', ')') are one-character
# tokens; they do not need whitespace before or after them.
# Two or more consecutive tokens consisting of special characters
# without whitespace in between are split by a 'greedy' rule: the first
# token is the longest starting character sequence that forms a valid
# token. This rule is then applied to the rest of the special character
# sequence to get the next token, and so on.
# Number literals are recognized as usual. A plus or minus sign before
# a number is initially considered as an operator (if these operators
# are defined in the syntax), not as part of the number literal. E.g.,
# '-12' can be 'prefix minus', followed by the number 12; the tokenizer
# does not see it immediately as the number 'minus 12'. Combined with
# appropriate binding powers, this ensures that, e.g., '-2^a' is parsed
# as (- (^ 2 a)). If, after honouring binding powers, subexpressions
# like ($PRE- $PREDUMMY 12) remain, they will be converted to simple
# numbers. Here, '$PRE-' denotes the prefix operator replacement of '-'.
# A single leading zero before a decimal point, or a single trailing
# zero after a decimal point, can be omitted in numbers (but not both).
# Exponential parts of number literals must start with 'e' or 'E'.

# Missing features in simplepcp
# -----------------------------
# - Control structures with "inner keywords":
#       if ... then ... else ...
#   or
#       if ... then ... else ... endif
#   cannot be defined as part of the syntax.
# - Comparison chaining is not implemented (something like a < x <= b).
# - Comment recognition (from # to end of line) is not implemented yet.
# - String literals and Boolean literals (true, false) are not allowed.
# - Almost no error reporting.

# Syntax definition
# -----------------
# The syntax is not defined in the code. It is defined as Python data.
# See synA and the first two tests in the source of simplepcp_test.
# A very simple syntax definition is
#     {"+":(6,7)}
# The invocation
#     parse("a+b+c",{"+":(6,7)})      parses to "(+ (+ a b) c)", and
#     parse("a+b+c",{"+":(6,7,True)}) parses to "(+ a b c)".
# Here, '+' is the only operator (a left associative infix operator).
# The 'True' value in the second example means the operator is "flat".
#
# The comma is parsed as a low precedence flat infix operator. For real
# operators, use binding powers between 6 and 94. The binding power of
# unary operators towards their "dummy" is set greater than any "real"
# binding power (usually it is set to 100). Binding powers between 95
# and 99 should be used for operators inserted by the tokenizer before
# or after keywords (only towards their respective keyword). The defini-
# tion of control structures in the syntax is shown at the end of this
# module (see also minimalpcp.pdf).
# The begin_keyword and end_keyword can be (or be composed of) any
# special character (other than ( ) , # " \ $ _). Even brackets or
# curly braces or the semicolon can be used. Different control
# structures can have the same end_keyword. In this case, all uses of
# the end_keyword must be with the same post-operand (or all without
# post-operand).
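
# For example, a minimal syntax with one infix operator and a curly
# brace control structure (an illustrative sketch, analogous to synA
# at the end of this module):
#     synMini = [ {"+" : (6,7)},
#                 {"{" : ("}",False,False)} ]
#     parse("{a+b, c}",synMini)    should parse to "({ (+ a b) c)"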

# Usage examples (see also simplepcp_test.py)
# -------------------------------------------
#     import simplepcp as sp
#     sp.parse("a+b+c",{"+":(6,7)})
# or
#     sp.parse("a+b+c",synA)
# Semantically meaningless expressions are allowed by the parser. It is
# up to the next processing step to deal with this.

import simplerawtokenizer

class Tokenizer:
    # An instance of the Tokenizer class performs the high-level
    # tokenization tasks (token insertion, token replacement).
    # Methods whose names start with "_" are meant to be private.

    def __init__(self,ts,syn) :
        # Prepare syntax
        if type(syn) is list :
            self.iops     = syn[0].copy()
            self.Controls = syn[1].copy()
        else :
            self.iops     = syn.copy()
            self.Controls = {}
        self.prePlusOp  = False
        self.preMinusOp = False
        hdict = {}                       # hdict is temporary
        for op in self.iops :
            if len(self.iops[op]) > 3 :
                hdict[self.iops[op][3][0]] = self.iops[op][3][1:]
                if op == "+" : self.prePlusOp  = self.iops[op][3][0]
                if op == "-" : self.preMinusOp = self.iops[op][3][0]
        self.iops.update(hdict)
        self.EndingWords = {}
        for c in self.Controls :
            cv = self.Controls[c]
            self.EndingWords[cv[0]] = False
            if cv[1] :
                self.iops[cv[1][0]] = cv[1][1:]
            if cv[2] :
                self.iops[cv[2][0]] = cv[2][1:]
                self.EndingWords[cv[0]] = cv[2][0]
        # Get maximal LBP and RBP, determine binding power for dummy ops
        maxlbp = max(self.iops[op][0] for op in self.iops
                     if type(self.iops[op][0]) is int)
        maxrbp = max(self.iops[op][1] for op in self.iops
                     if type(self.iops[op][1]) is int)
        self.dummyBP = max(max(maxlbp,maxrbp)+1,100)
        for op in self.iops :
            bp = self.iops[op]
            if bp[0] == "pre" :
                self.iops[op] = (self.dummyBP,bp[1]) + bp[1:]
            elif bp[0] == "post" :
                self.iops[op] = (bp[1],self.dummyBP) + bp[2:]
        # Hard-coded operators and keywords: comma, $apply, $END, parens
        self.iops[","]      = (3,4,True)  # the comma is a flat infix op
        self.iops["$apply"] = (self.dummyBP-4,self.dummyBP-3,False)
        self.Controls["("]  = (")",False,False)
        self.EndingWords["$END"] = False
        self.EndingWords[")"]    = False
        # Create raw tokenizer instance, prepare tokenization
        self.srt = simplerawtokenizer.RawTokenizer(
            self.iops,self.Controls,self.EndingWords,ts)
        self.Current = "$BEGIN"
        self.Buffer  = ""
        # End of tokenizer initialization
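
    # Example (an illustrative sketch): with syn = {"+":(6,7),
    # "not":("pre",20)}, dummyBP becomes 100 and iops["not"] ends up
    # as (100,20,20), i.e. "not" binds with the dummy strength 100
    # towards its inserted "$PREDUMMY" operand.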

    def LBP(self,t) : return self.iops[t][0]    # Left binding power

    def RBP(self,t) : return self.iops[t][1]    # Right binding power

    # If an operator follows the current token, is it a prefix operator?
    def _inPrefixPos(self) :
        a = self.Current
        return (a == "$BEGIN" or a in self.Controls or
                (a in self.iops and self.RBP(a) < self.dummyBP))

    # 'An operator o is flat' means: parse A o A o A ... to (o A A A ...)
    def _isflat(self,n) :
        return (n in self.iops and len(self.iops[n]) > 2
                and self.iops[n][2])

    # Next: Advance to next token, return it; take care of token insertion
    def Next(self) :
        if self.Buffer != "" :
            self.Current = self.Buffer
            self.Buffer  = ""
        elif (self.Current in self.iops and
              self.RBP(self.Current) == self.dummyBP) :
            self.Current = "$POSTDUMMY"
        elif self.EndingWords.get(self.Current) :
            self.Current = self.EndingWords[self.Current]
        else :
            atok = self.srt.nextRaw()
            if (self._inPrefixPos() and atok in self.iops and
                len(self.iops[atok]) > 3) :
                atok = self.iops[atok][3][0]
            if atok == "$END" :
                self.Current = atok
            else :
                if atok in self.Controls and self.Controls[atok][1] :
                    self.Current = self.Controls[atok][1][0]
                    self.Buffer  = atok
                elif (atok in self.iops and
                      self.LBP(atok) == self.dummyBP) :
                    self.Current = "$PREDUMMY"
                    self.Buffer  = atok
                elif (atok == "(" and self.Current not in self.Controls
                      and self.Current not in self.iops
                      and self.Current != "$BEGIN") :
                    self.Current = "$apply"
                    self.Buffer  = atok
                elif (self.Current in self.Controls and
                      self.Controls[self.Current][0] == atok) :
                    self.Current = "$EMPTYDUMMY"
                    self.Buffer  = atok
                else :
                    self.Current = atok
        return self.Current

    # TokSeq and RawTokSeq can be used to check the token sequence.
    # These methods are not used for parsing.
    def TokSeq(self) :           # Token sequence as seen by the parser
        s = self.Current
        while self.Current != "$END" :
            s += (" " + str(self.Next()))
        return s

    def RawTokSeq(self) :            # Token sequence as provided by
        self.srt.printRawTokens()    # the raw tokenizer

# End of class Tokenizer

# Three functions are used for the actual parsing:
# - parsePrimary: parse a primary expression
# - parseInfix:   parse an infix expression
# - makeNode:     create a node (from atoms or already existing nodes)
# The functions parseInfix and parsePrimary are mutually recursive.
# If tok is a fresh tokenizer, parseInfix(tok) will return the parse
# tree as a Python list.
# 'parse' combines tokenization, parsing and output formatting.
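
# A minimal sketch of driving these functions directly (illustrative;
# it assumes the raw tokenizer delivers number literals as Python
# numbers, which is what makeNode expects):
#     tok  = Tokenizer("4*3+2",{"+":(6,7),"*":(8,9)})
#     tree = parseInfix(tok)     # => ['+', ['*', 4, 3], 2]
#     print(PL2LL(tree))         # prints (+ (* 4 3) 2)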

def parsePrimary(tok) :
    s = tok.Next()
    if s == "$END" :
        print("Unexpected end of string")
        return "$Error"
    elif s in tok.iops or s in tok.EndingWords :
        print("Atom or start keyword expected instead of ",s)
        return "$Error"
    elif s not in tok.Controls :
        return s
    else :
        t = parseInfix(tok)
        if tok.Current != tok.Controls[s][0] :
            print("Invalid ending keyword or unexpected end: ",
                  tok.Current)
            return "$Error"
        else :
            if s == "(" :
                return t
            else :
                if (type(t) is list and len(t) > 0
                        and t[0] == ",") :          # resolve commas
                    return [s] + t[1:]
                elif t == "$EMPTYDUMMY" :
                    return [s]
                else :
                    return [s, t]

def parseInfix(tok,minrbp=0) :
    s = parsePrimary(tok)
    n = tok.Next()
    while n not in tok.EndingWords and minrbp < tok.LBP(n) :
        s = makeNode(tok,n,s,parseInfix(tok,tok.RBP(n)))
        n = tok.Current
    return s

def makeNode(tok,n,s,t) :
    if t == "$EMPTYDUMMY" and n == "$apply" :
        return ["$apply",s]
    elif s == "$EMPTYDUMMY" or t == "$EMPTYDUMMY" :
        print("Error - empty subexpression in parentheses: ()")
        return "$ERROR"
    elif (type(s) is list and len(s) > 1 and n == s[0]
            and tok._isflat(n)) :
        return s + [t]
    elif (n == "$apply" and type(t) is list and len(t) > 1
            and t[0] == ",") :
        return ["$apply",s] + t[1:]
    # Create simple numbers from subexpressions consisting of prefix-plus
    # or prefix-minus, followed by a number (after honoring binding powers)
    elif (s == "$PREDUMMY" and (type(t) is int or type(t) is float)
            and (n == tok.prePlusOp or n == tok.preMinusOp)) :
        if n == tok.prePlusOp :
            return t
        else :
            return -t
    else :
        return [n,s,t]

def PL2LL(pl) :                     # Create a string containing
    if type(pl) is list :           # a Lisp-like list.
        s = "("
        if len(pl) > 0 :
            s += PL2LL(pl[0])
            for t in pl[1:] :
                s += " " + PL2LL(t)
        s += ")"
    else :
        s = str(pl)
    return s

def parse(ts,syn) :                 # tokenize, parse, format
    t = Tokenizer(ts,syn)
    r = parseInfix(t)
    if t.Current != "$END" :
        print("Parsing stopped at misplaced ending keyword: ",t.Current)
    return PL2LL(r)

# Test data (see also simplepcp_test.py).
ts1 = "12 * x"
ts2 = "3 * not a !"
ts3 = "res = 4 * 3 + n ^ 12 - x2"
ts4 = "res = 4 * 3 * a + n ^ 3 ^ 2 - x2"
ts5 = "f := x -> A and B"
ts6 = "f := x -> not A and not B !"

# synA is an example syntax. It is used in the test (simplepcp_test).
synA = [{
    "^"   : (81,80),
    "**"  : (81,80),
    "+"   : (70,71,True,("$PRE+","pre",71)),       # If in prefix posi-
    "-"   : (70,71,False,("$PRE-","pre",71)),      # tion, the data in
    "++"  : ("post",79,False,("$PRE++","pre",78)), # ("$PRE...",...) are
    "--"  : ("post",79,False,("$PRE--","pre",78)), # used, otherwise the
    "*"   : (75,76,True),                          # data at the start
    "/"   : (75,76),                               # are used.
    "%"   : (75,76),                               # If the Boolean value
    "="   : (50,49),                               # is present and True,
    "<"   : (50,51),                               # the operator is flat
    ">"   : (50,51),                               # (e.g., '3 + 4 + a'
    "->"  : (90,25),                               # is parsed as
    ":="  : (41,40),                               # (+ 3 4 a) ).
    "+="  : (41,40),
    "not" : ("pre",33),                            # "not" is a prefix,
    "!"   : ("post",78,False,("$PRE!","pre",40)),  # "!" may be postfix
    "and" : (30,31,True)},                         # or prefix.
   {"["   : ("]",("$index",80,96),False),
    "?"   : (":",("$pre?",48,95),("$post?",94,48)),
    "do"  : ("while",False,("$postwhile",95,35)),
    "begin" : ("end",False,False),
    "[:"  : (":]",False,False)     # This line is all you need to
   }]                              # define a [: ... :] construct.

# The control structure definition scheme is
#     start_keyword : (end_keyword, pre_operator, post_operator)
# False means here: the corresponding operator does not exist.
# Operator entries: (name, left binding power, right binding power).
# pre_operator and post_operator connect the control structure with the
# corresponding pre-operand and post-operand, respectively.
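
# A small runnable demo (a sketch, not part of simplepcp_test): running
# this module directly parses the test strings above with synA.
if __name__ == "__main__" :
    for ts in (ts1,ts2,ts3,ts4,ts5,ts6) :
        print(ts, "  parses to  ", parse(ts,synA))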