# minimalpcp: Minimal demo precedence climbing parser for infix, prefix
# and postfix operator expressions. A rudimentary tokenizer is included.
# Python version 3.* (or 2.7.*); no imports required.
# ---------------------------------------------------------------------
# Notes on this Python module can be found in 'minimalpcp.pdf'
#
# Licence (2-clause BSD License)
# Copyright 2017 JoeCreu
# Redistribution and use in source and binary forms, with or without
# modification, are permitted, provided that the following conditions are met:
# 1. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# JoeCreu, 2017-04-23.
#
# Example usage (see test data ts1, ts2, ..., iopsA below):
#   Import this module:      from minimalpcp import *
#   Display data, e.g.
#                            ts4; iopsA
#   Display token sequence:  Tokenizer(ts4,iopsA).TokSeq()
#   Create fresh tokenizer:  tok = Tokenizer(ts4,iopsA)
#   Parse (using tokenizer): parseExpr(tok)
#   Parse (using 'parse'):   parse(ts4,iopsA)
#   Use literal arguments:   parse("not 3 < 4",{"not":(100,5),"<":(6,7)})
# There is a unittest; run it from shell: python3 minimalpcp_test.py -v
#
# 'Tokenizer' and 'parse' take the string to be parsed as first and the
# dictionary of binding powers as second argument. A tokenizer can't be
# used twice; create a fresh one if required. Tokens in the string must
# be space-separated. Pre- and postfix operators are recognized by
# their special left (resp. right) binding power. The tokenizer inserts
# dummy operands before pre- and after postfix operators.
# 'TokSeq', 'PL2LL' and 'parse' are not needed for the actual parsing.


class Tokenizer:
    """Whitespace tokenizer feeding the precedence climbing parser.

    Create a fresh instance per parse, e.g. tok = Tokenizer(ts6, iopsA),
    then drive it with tok.Current / tok.Next(), or hand it directly to
    parseExpr(tok). Prefix operators are marked by LBP == 100, postfix
    operators by RBP == 100; the tokenizer inserts the dummy operands
    "$PREDUMMY" / "$POSTDUMMY" around them, so the parser only ever has
    to deal with infix operators.
    """

    def __init__(self, ts, iops):
        """ts: space-separated token string; iops: {op: (LBP, RBP)}."""
        self.tl = ts.split()          # list of tokens
        self.length = len(self.tl)
        self.pos = -1                 # index of the last consumed token
        self.iops = dict(iops)        # copy, so the caller's dict is not
        self.iops["$END"] = (-1, 0)   # polluted with the "$END" marker
        self.Current = "$BEGIN"       # pseudo token before the first real one

    def LBP(self, t):
        """Left binding power of token t."""
        return self.iops[t][0]

    def RBP(self, t):
        """Right binding power of token t."""
        return self.iops[t][1]

    def Next(self):
        """Advance to the next (possibly dummy) token and return it."""
        if self.Current == "$PREDUMMY":
            # The dummy operand was just emitted; now emit the prefix
            # operator itself (pos already points at it).
            self.Current = self.tl[self.pos]
        elif self.Current in self.iops and self.RBP(self.Current) == 100:
            # A postfix operator was just emitted; follow it with a dummy.
            self.Current = "$POSTDUMMY"
        else:
            self.pos += 1
            if self.pos >= self.length:
                self.Current = "$END"
            else:
                atok = self.tl[self.pos]
                if atok in self.iops and self.LBP(atok) == 100:
                    # Prefix operator ahead: emit a dummy operand first.
                    self.Current = "$PREDUMMY"
                else:
                    self.Current = atok
        return self.Current

    def TokSeq(self):
        """Return the token sequence as seen by the parser, as one string.

        Exhausts the tokenizer; it cannot be used for parsing afterwards.
        """
        s = self.Current
        while self.Current != "$END":
            s += " " + self.Next()
        return s


# The function parseExpr is the actual infix parser.
# If tok is a fresh tokenizer, parseExpr(tok) will return the parse tree
# as a Python list. 'parse' combines tokenization, parsing and output
# formatting.


def parseExpr(tok, minrbp=0):
    """Parse an expression by precedence climbing.

    tok    : a fresh Tokenizer instance
    minrbp : minimal right binding power; start with minrbp = 0.
    Returns the parse tree as a nested Python list [op, left, right]
    (a bare token string for a plain operand).
    If LBP(o) <= RBP(o) for an operator o, o is parsed left associative.
    """
    s = tok.Next()          # operand (possibly a dummy)
    n = tok.Next()          # operator candidate
    while minrbp < tok.LBP(n):
        s = [n, s, parseExpr(tok, tok.RBP(n))]
        n = tok.Current
    return s


def PL2LL(pl):
    """Format a Python list as a string that looks like a Lisp list.

    PL2LL is a utility function for 'parse'; non-list values are
    rendered with str().
    """
    if type(pl) is list:
        s = "("
        if len(pl) > 0:
            s += PL2LL(pl[0])
            for t in pl[1:]:
                s += " " + PL2LL(t)
        s += ")"
    else:
        s = str(pl)
    return s


def parse(ts, iops):
    """Tokenize, parse and format token string ts with operator table iops."""
    return PL2LL(parseExpr(Tokenizer(ts, iops)))


# Test data
ts1 = "12 * x"                          # ts1, ..., ts6 are test strings for
ts2 = "3 * not a !"                     # parsing. Tokens space-separated.
ts3 = "res = 4 * 3 + n ^ 12 - x2"
ts4 = "res = 4 * 3 * a + n ^ 3 ^ 2 - x2"
ts5 = "f := x -> A and B"               # Try different binding powers,
ts6 = "f := x -> not A and not B !"     # e.g., for '->' (see iopsA below).

iopsA = {               # iopsA maps operators to (LBP, RBP) (left and right
    "^"  : (81, 80),    # binding power). Prefix operators have LBP 100,
    "+"  : (70, 71),    # postfix operators have RBP 100 ("not" is prefix,
    "-"  : (70, 71),    # "!" is postfix). "Real" binding powers should be
    "*"  : (75, 76),    # integers in range 10 to 90.
    "/"  : (75, 76),
    "="  : (51, 50),
    "->" : (90, 25),    # LBP and RBP of "->" are intentionally "very"
    ":=" : (41, 40),    # different. See token strings ts5 and ts6.
    "not": (100, 33),
    "and": (60, 61),    # NOTE(review): source was truncated after "not";
    "!"  : (33, 100),   # "and" and "!" reconstructed from the comments and
}                       # test strings above -- confirm the binding powers.