# minimalpcp: Minimal demo precedence climbing parser for infix, prefix
# and postfix operator expressions. A rudimentary tokenizer is included.
# Python version 3.* (or 2.7.*); no imports required.
# ---------------------------------------------------------------------
# Notes on this Python module can be found in 'minimalpcp.pdf'
#
# Licence (2-clause BSD License)
# Copyright 2017 JoeCreu
# Redistribution and use in source and binary forms, with or without
# modification, are permitted, provided that the following conditions are met:
# 1. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# JoeCreu, 2017-04-23.
#
# Example usage (see test data ts1, ts2, ..., iopsA below):
#   Import this module:      from minimalpcp import *
#   Display data, e.g.
#                            ts4; iopsA
#   Display token sequence:  Tokenizer(ts4,iopsA).TokSeq()
#   Create fresh tokenizer:  tok = Tokenizer(ts4,iopsA)
#   Parse (using tokenizer): parseExpr(tok)
#   Parse (using 'parse'):   parse(ts4,iopsA)
#   Use literal arguments:   parse("not 3 < 4",{"not":(100,5),"<":(6,7)})
# There is a unittest; run it from shell: python3 minimalpcp_test.py -v
#
# 'Tokenizer' and 'parse' take the string to be parsed as first and the
# dictionary of binding powers as second argument. A tokenizer can't be
# used twice; create a fresh one if required. Tokens in the string must
# be space-separated. Pre- and postfix operators are recognized by
# their special left (resp. right) binding power. The tokenizer inserts
# dummy operands before pre- and after postfix operators.
# 'TokSeq', 'PL2LL' and 'parse' are not needed for the actual parsing.


class Tokenizer:
    """Whitespace tokenizer feeding the precedence climbing parser.

    Create a fresh instance per parse, e.g. tok = Tokenizer(ts6, iopsA),
    then drive it with tok.Current / tok.Next(), or hand it directly to
    parseExpr(tok). Prefix operators are marked by LBP == 100, postfix
    operators by RBP == 100; the tokenizer inserts the dummy operands
    "$PREDUMMY" / "$POSTDUMMY" around them, so the parser only ever has
    to deal with infix operators.
    """

    def __init__(self, ts, iops):
        """ts: space-separated token string; iops: {op: (LBP, RBP)}."""
        self.tl = ts.split()          # list of tokens
        self.length = len(self.tl)
        self.pos = -1                 # index of the last consumed token
        self.iops = dict(iops)        # copy, so the caller's dict is not
        self.iops["$END"] = (-1, 0)   # polluted with the "$END" marker
        self.Current = "$BEGIN"       # pseudo token before the first real one

    def LBP(self, t):
        """Left binding power of token t."""
        return self.iops[t][0]

    def RBP(self, t):
        """Right binding power of token t."""
        return self.iops[t][1]

    def Next(self):
        """Advance to the next (possibly dummy) token and return it."""
        if self.Current == "$PREDUMMY":
            # The dummy operand was just emitted; now emit the prefix
            # operator itself (pos already points at it).
            self.Current = self.tl[self.pos]
        elif self.Current in self.iops and self.RBP(self.Current) == 100:
            # A postfix operator was just emitted; follow it with a dummy.
            self.Current = "$POSTDUMMY"
        else:
            self.pos += 1
            if self.pos >= self.length:
                self.Current = "$END"
            else:
                atok = self.tl[self.pos]
                if atok in self.iops and self.LBP(atok) == 100:
                    # Prefix operator ahead: emit a dummy operand first.
                    self.Current = "$PREDUMMY"
                else:
                    self.Current = atok
        return self.Current

    def TokSeq(self):
        """Return the token sequence as seen by the parser, as one string.

        Exhausts the tokenizer; it cannot be used for parsing afterwards.
        """
        s = self.Current
        while self.Current != "$END":
            s += " " + self.Next()
        return s


# The function parseExpr is the actual infix parser.
# If tok is a fresh tokenizer, parseExpr(tok) will return the parse tree
# as a Python list. 'parse' combines tokenization, parsing and output
# formatting.


def parseExpr(tok, minrbp=0):
    """Parse an expression by precedence climbing.

    tok    : a fresh Tokenizer instance
    minrbp : minimal right binding power; start with minrbp = 0.
    Returns the parse tree as a nested Python list [op, left, right]
    (a bare token string for a plain operand).
    If LBP(o) <= RBP(o) for an operator o, o is parsed left associative.
    """
    s = tok.Next()          # operand (possibly a dummy)
    n = tok.Next()          # operator candidate
    while minrbp < tok.LBP(n):
        s = [n, s, parseExpr(tok, tok.RBP(n))]
        n = tok.Current
    return s


def PL2LL(pl):
    """Format a Python list as a string that looks like a Lisp list.

    PL2LL is a utility function for 'parse'; non-list values are
    rendered with str().
    """
    if type(pl) is list:
        s = "("
        if len(pl) > 0:
            s += PL2LL(pl[0])
            for t in pl[1:]:
                s += " " + PL2LL(t)
        s += ")"
    else:
        s = str(pl)
    return s


def parse(ts, iops):
    """Tokenize, parse and format token string ts with operator table iops."""
    return PL2LL(parseExpr(Tokenizer(ts, iops)))


# Test data
ts1 = "12 * x"                          # ts1, ..., ts6 are test strings for
ts2 = "3 * not a !"                     # parsing. Tokens space-separated.
ts3 = "res = 4 * 3 + n ^ 12 - x2"
ts4 = "res = 4 * 3 * a + n ^ 3 ^ 2 - x2"
ts5 = "f := x -> A and B"               # Try different binding powers,
ts6 = "f := x -> not A and not B !"     # e.g., for '->' (see iopsA below).

iopsA = {               # iopsA maps operators to (LBP, RBP) (left and right
    "^"  : (81, 80),    # binding power). Prefix operators have LBP 100,
    "+"  : (70, 71),    # postfix operators have RBP 100 ("not" is prefix,
    "-"  : (70, 71),    # "!" is postfix). "Real" binding powers should be
    "*"  : (75, 76),    # integers in range 10 to 90.
    "/"  : (75, 76),
    "="  : (51, 50),
    "->" : (90, 25),    # LBP and RBP of "->" are intentionally "very"
    ":=" : (41, 40),    # different. See token strings ts5 and ts6.
    "not": (100, 33),
    "and": (60, 61),    # NOTE(review): source was truncated after "not";
    "!"  : (33, 100),   # "and" and "!" reconstructed from the comments and
}                       # test strings above -- confirm the binding powers.