# simplepcp.py - a simple demo precedence climbing parser
# =======================================================

# Licence (2-clause BSD License) (see minimalpcp).
# Copyright 2017 JoeCreu

# Python versions 3.* should work.
# Python versions 2.7.* will probably work after slight modifications.

# JoeCreu, 2017-06-15; last revised 2018-06-05.

# See the description of the minimalpcp parser in minimalpcp.pdf;
# minimalpcp is similar, but even simpler. In particular, binding
# powers for unary operators are used here as described there.
# Parsing of operator expressions is controlled by LBP and RBP
# (left and right binding powers).

# Additional features of simplepcp (compared to minimalpcp)
# ---------------------------------------------------------
# 1. Parentheses '( ... )' override precedence in the usual way.
# 2. Function invocation as in f(x), or f(x,y); the arguments can be
#    expressions. The syntax f() is used to invoke f without arguments.
# 3. The same operator symbol can be used for prefix and infix, or for
#    prefix and postfix, depending on the position. So it is possible
#    to use '+' and '-' as prefix and infix, or '++' and '--' as prefix
#    and postfix. The tokenizer replaces the operator symbol by another
#    symbol if it is in prefix operator position. It is not allowed to
#    denote an infix and a postfix operator by the same symbol.
# 4. Control structures of the form
#    'begin_keyword expr1, ..., exprn end_keyword'.
#    Control structures can have a pre-operand or a post-operand, or
#    both (examples: the 'ternary' operator '... ? ... : ...', or
#    'do ... while ...'; the keywords here are '?' ':' 'do' 'while').
#    Index operators also fit in here: in 'a[i]', the brackets are the
#    keywords, and 'a' is a pre-operand.
# 5. A real tokenizer (module simplerawtokenizer and class Tokenizer).
# 6. For pre- and postfix operators, only the "real" binding power has
#    to be specified. Examples: "not":("pre",20), or "!":("post",80).
#    The "very high" binding power of unary operators towards their
#    dummies is computed when the tokenizer is initialized. It is set
#    to 100 if all other binding powers are less than 100.
# 7. "Flat" infix operators (e.g., +, *, and, ...) are supported: this
#    means that '3 + 4 + 5' can be made to parse to (+ 3 4 5) instead
#    of (+ (+ 3 4) 5). Flat operators must be left associative infix
#    operators (i.e., their RBP must not be less than their LBP).

# Tokens
# ------
# Dollar sign ($) and underscore (_) are considered alphabetic. Identi-
# fiers can consist, as usual, of alphabetic characters and digits;
# they must start with an alphabetic character. Moreover, user created
# identifiers must not start with a dollar sign unless otherwise stated.
# The following six characters have a fixed meaning:  , ( ) # " \
# Comma and parentheses have the usual meaning. Not implemented here,
# but meant to be used in future extensions, are '#' (comment sign),
# '"' (for string literals) and '\' (escape character in string
# literals). The remaining 24 printable special characters in the
# ASCII range are
#     ! % & / = ? { [ ] } ' ` @ + * ~ | < > ; . : - ^
# These characters have no fixed meaning to the parser. They can be
# used to form operator or keyword tokens, alone or together with other
# special characters.
# Operators and keywords may also be formed like identifiers (letters,
# digits, dollar, underscore, starting with a letter or underscore).
# They must not consist of a mix of special and alphanumeric characters.
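
# For example (an illustrative sketch; synA is the example syntax
# defined at the end of this module):
#     parse("3+4+5",synA)   parses to "(+ 3 4 5)"      ('+' is flat),
#     parse("a-b-c",synA)   parses to "(- (- a b) c)"  ('-' is not flat).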

# Token recognition in the input string
# -------------------------------------
# Token recognition is performed by the "raw tokenizer" module.
# Whitespace always separates tokens. Two consecutive tokens that are
# formed like identifiers must be separated by at least one whitespace.
# A special character always ends an identifier-like token. A digit or
# an alphanumeric character always ends a token consisting of special
# characters. Comma and parentheses (',', '(', ')') are one-character
# tokens; they do not need whitespace before or after them.
# Two or more consecutive tokens consisting of special characters
# without whitespace in between are split by a 'greedy' rule: the first
# token is the longest starting character sequence that forms a valid
# token. This rule is then applied to the rest of the special character
# sequence to get the next token, and so on.
# Number literals are recognized as usual. A plus or minus sign before
# a number is initially considered as an operator (if these operators
# are defined in the syntax), not as part of the number literal. E.g.,
# '-12' can be 'prefix minus', followed by the number 12; the tokenizer
# does not see it immediately as the number 'minus 12'. Combined with
# appropriate binding powers, this ensures that, e.g., '-2^a' is parsed
# as (- (^ 2 a)). If, after honouring binding powers, subexpressions
# like ($PRE- $PREDUMMY 12) remain, they will be converted to simple
# numbers. Here, '$PRE-' denotes the prefix operator replacement of '-'.
# A single leading zero before a decimal point, or a single trailing
# zero after a decimal point, can be omitted in numbers (but not both).
# Exponential parts of number literals must start with 'e' or 'E'.

# Missing features in simplepcp
# -----------------------------
# - Control structures with "inner keywords":
#       if ... then ... else ...
#   or
#       if ... then ... else ... endif
#   cannot be defined as part of the syntax.
# - Comparison chaining is not implemented (something like a < x <= b).
# - Comment recognition (from # to end of line) is not implemented yet.
# - String literals and Boolean literals (true, false) are not allowed.
# - Almost no error reporting.

# Syntax definition
# -----------------
# The syntax is not defined in the code. It is defined as Python data.
# See synA and the first two tests in the source of simplepcp_test.
# A very simple syntax definition is
#     {"+":(6,7)}
# The invocation
#     parse("a+b+c",{"+":(6,7)})      parses to "(+ (+ a b) c)", and
#     parse("a+b+c",{"+":(6,7,True)}) parses to "(+ a b c)".
# Here, '+' is the only operator (a left associative infix operator).
# The 'True' value in the second example means the operator is "flat".
#
# The comma is parsed as a low precedence flat infix operator. For real
# operators, use binding powers between 6 and 94. The binding power of
# unary operators towards their "dummy" is set greater than any "real"
# binding power (usually it is set to 100). Binding powers between 95
# and 99 should be used for operators inserted by the tokenizer before
# or after keywords (only towards their respective keyword). The defini-
# tion of control structures in the syntax is shown at the end of this
# module (see also minimalpcp.pdf).
# The begin_keyword and end_keyword can be (or be composed of) any
# special character (other than ( ) , # " \ $ _). Even brackets or
# curly braces or the semicolon can be used. Different control
# structures can have the same end_keyword. In this case, all uses of
# the end_keyword must be with the same post-operand (or all without
# post-operand).
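
# For example, a minimal syntax with one infix operator and a curly
# brace control structure (an illustrative sketch, analogous to synA
# at the end of this module):
#     synMini = [ {"+" : (6,7)},
#                 {"{" : ("}",False,False)} ]
#     parse("{a+b, c}",synMini)    should parse to "({ (+ a b) c)"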

# Usage examples (see also simplepcp_test.py)
# -------------------------------------------
#     import simplepcp as sp
#     sp.parse("a+b+c",{"+":(6,7)})
# or
#     sp.parse("a+b+c",synA)
# Semantically meaningless expressions are allowed by the parser. It is
# up to the next processing step to deal with this.

import simplerawtokenizer

class Tokenizer:
    # An instance of the Tokenizer class performs the high-level
    # tokenization tasks (token insertion, token replacement).
    # Methods whose names start with "_" are meant to be private.

    def __init__(self,ts,syn) :
        # Prepare syntax
        if type(syn) is list :
            self.iops     = syn[0].copy()
            self.Controls = syn[1].copy()
        else :
            self.iops     = syn.copy()
            self.Controls = {}
        self.prePlusOp  = False
        self.preMinusOp = False
        hdict = {}                       # hdict is temporary
        for op in self.iops :
            if len(self.iops[op]) > 3 :
                hdict[self.iops[op][3][0]] = self.iops[op][3][1:]
                if op == "+" : self.prePlusOp  = self.iops[op][3][0]
                if op == "-" : self.preMinusOp = self.iops[op][3][0]
        self.iops.update(hdict)
        self.EndingWords = {}
        for c in self.Controls :
            cv = self.Controls[c]
            self.EndingWords[cv[0]] = False
            if cv[1] :
                self.iops[cv[1][0]] = cv[1][1:]
            if cv[2] :
                self.iops[cv[2][0]] = cv[2][1:]
                self.EndingWords[cv[0]] = cv[2][0]
        # Get maximal LBP and RBP, determine binding power for dummy ops
        maxlbp = max(self.iops[op][0] for op in self.iops
                     if type(self.iops[op][0]) is int)
        maxrbp = max(self.iops[op][1] for op in self.iops
                     if type(self.iops[op][1]) is int)
        self.dummyBP = max(max(maxlbp,maxrbp)+1,100)
        for op in self.iops :
            bp = self.iops[op]
            if bp[0] == "pre" :
                self.iops[op] = (self.dummyBP,bp[1]) + bp[1:]
            elif bp[0] == "post" :
                self.iops[op] = (bp[1],self.dummyBP) + bp[2:]
        # Hard-coded operators and keywords: comma, $apply, $END, parens
        self.iops[","]      = (3,4,True)  # the comma is a flat infix op
        self.iops["$apply"] = (self.dummyBP-4,self.dummyBP-3,False)
        self.Controls["("]  = (")",False,False)
        self.EndingWords["$END"] = False
        self.EndingWords[")"]    = False
        # Create raw tokenizer instance, prepare tokenization
        self.srt = simplerawtokenizer.RawTokenizer(
            self.iops,self.Controls,self.EndingWords,ts)
        self.Current = "$BEGIN"
        self.Buffer  = ""
        # End of tokenizer initialization
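
    # Example (an illustrative sketch): with syn = {"+":(6,7),
    # "not":("pre",20)}, dummyBP becomes 100 and iops["not"] ends up
    # as (100,20,20), i.e. "not" binds with the dummy strength 100
    # towards its inserted "$PREDUMMY" operand.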

    def LBP(self,t) : return self.iops[t][0]    # Left binding power

    def RBP(self,t) : return self.iops[t][1]    # Right binding power

    # If an operator follows the current token, is it a prefix operator?
    def _inPrefixPos(self) :
        a = self.Current
        return (a == "$BEGIN" or a in self.Controls or
                (a in self.iops and self.RBP(a) < self.dummyBP))

    # 'An operator o is flat' means: parse A o A o A ... to (o A A A ...)
    def _isflat(self,n) :
        return (n in self.iops and len(self.iops[n]) > 2
                and self.iops[n][2])

    # Next: Advance to next token, return it; take care of token insertion
    def Next(self) :
        if self.Buffer != "" :
            self.Current = self.Buffer
            self.Buffer  = ""
        elif (self.Current in self.iops and
              self.RBP(self.Current) == self.dummyBP) :
            self.Current = "$POSTDUMMY"
        elif self.EndingWords.get(self.Current) :
            self.Current = self.EndingWords[self.Current]
        else :
            atok = self.srt.nextRaw()
            if (self._inPrefixPos() and atok in self.iops and
                len(self.iops[atok]) > 3) :
                atok = self.iops[atok][3][0]
            if atok == "$END" :
                self.Current = atok
            else :
                if atok in self.Controls and self.Controls[atok][1] :
                    self.Current = self.Controls[atok][1][0]
                    self.Buffer  = atok
                elif (atok in self.iops and
                      self.LBP(atok) == self.dummyBP) :
                    self.Current = "$PREDUMMY"
                    self.Buffer  = atok
                elif (atok == "(" and self.Current not in self.Controls
                      and self.Current not in self.iops
                      and self.Current != "$BEGIN") :
                    self.Current = "$apply"
                    self.Buffer  = atok
                elif (self.Current in self.Controls and
                      self.Controls[self.Current][0] == atok) :
                    self.Current = "$EMPTYDUMMY"
                    self.Buffer  = atok
                else :
                    self.Current = atok
        return self.Current

    # TokSeq and RawTokSeq can be used to check the token sequence.
    # These methods are not used for parsing.
    def TokSeq(self) :           # Token sequence as seen by the parser
        s = self.Current
        while self.Current != "$END" :
            s += (" " + str(self.Next()))
        return s

    def RawTokSeq(self) :            # Token sequence as provided by
        self.srt.printRawTokens()    # the raw tokenizer

# End of class Tokenizer

# Three functions are used for the actual parsing:
# - parsePrimary: parse a primary expression
# - parseInfix:   parse an infix expression
# - makeNode:     create a node (from atoms or already existing nodes)
# The functions parseInfix and parsePrimary are mutually recursive.
# If tok is a fresh tokenizer, parseInfix(tok) will return the parse
# tree as a Python list.
# 'parse' combines tokenization, parsing and output formatting.
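
# A minimal sketch of driving these functions directly (illustrative;
# it assumes the raw tokenizer delivers number literals as Python
# numbers, which is what makeNode expects):
#     tok  = Tokenizer("4*3+2",{"+":(6,7),"*":(8,9)})
#     tree = parseInfix(tok)     # => ['+', ['*', 4, 3], 2]
#     print(PL2LL(tree))         # prints (+ (* 4 3) 2)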

def parsePrimary(tok) :
    s = tok.Next()
    if s == "$END" :
        print("Unexpected end of string")
        return "$Error"
    elif s in tok.iops or s in tok.EndingWords :
        print("Atom or start keyword expected instead of ",s)
        return "$Error"
    elif s not in tok.Controls :
        return s
    else :
        t = parseInfix(tok)
        if tok.Current != tok.Controls[s][0] :
            print("Invalid ending keyword or unexpected end: ",
                  tok.Current)
            return "$Error"
        else :
            if s == "(" :
                return t
            else :
                if (type(t) is list and len(t) > 0
                        and t[0] == ",") :          # resolve commas
                    return [s] + t[1:]
                elif t == "$EMPTYDUMMY" :
                    return [s]
                else :
                    return [s, t]

def parseInfix(tok,minrbp=0) :
    s = parsePrimary(tok)
    n = tok.Next()
    while n not in tok.EndingWords and minrbp < tok.LBP(n) :
        s = makeNode(tok,n,s,parseInfix(tok,tok.RBP(n)))
        n = tok.Current
    return s

def makeNode(tok,n,s,t) :
    if t == "$EMPTYDUMMY" and n == "$apply" :
        return ["$apply",s]
    elif s == "$EMPTYDUMMY" or t == "$EMPTYDUMMY" :
        print("Error - empty subexpression in parentheses: ()")
        return "$ERROR"
    elif (type(s) is list and len(s) > 1 and n == s[0]
            and tok._isflat(n)) :
        return s + [t]
    elif (n == "$apply" and type(t) is list and len(t) > 1
            and t[0] == ",") :
        return ["$apply",s] + t[1:]
    # Create simple numbers from subexpressions consisting of prefix-plus
    # or prefix-minus, followed by a number (after honoring binding powers)
    elif (s == "$PREDUMMY" and (type(t) is int or type(t) is float)
            and (n == tok.prePlusOp or n == tok.preMinusOp)) :
        if n == tok.prePlusOp :
            return t
        else :
            return -t
    else :
        return [n,s,t]

def PL2LL(pl) :                     # Create a string containing
    if type(pl) is list :           # a Lisp-like list.
        s = "("
        if len(pl) > 0 :
            s += PL2LL(pl[0])
            for t in pl[1:] :
                s += " " + PL2LL(t)
        s += ")"
    else :
        s = str(pl)
    return s

def parse(ts,syn) :                 # tokenize, parse, format
    t = Tokenizer(ts,syn)
    r = parseInfix(t)
    if t.Current != "$END" :
        print("Parsing stopped at misplaced ending keyword: ",t.Current)
    return PL2LL(r)

# Test data (see also simplepcp_test.py).
ts1 = "12 * x"
ts2 = "3 * not a !"
ts3 = "res = 4 * 3 + n ^ 12 - x2"
ts4 = "res = 4 * 3 * a + n ^ 3 ^ 2 - x2"
ts5 = "f := x -> A and B"
ts6 = "f := x -> not A and not B !"

# synA is an example syntax. It is used in the test (simplepcp_test).
synA = [{
    "^"   : (81,80),
    "**"  : (81,80),
    "+"   : (70,71,True,("$PRE+","pre",71)),       # If in prefix posi-
    "-"   : (70,71,False,("$PRE-","pre",71)),      # tion, the data in
    "++"  : ("post",79,False,("$PRE++","pre",78)), # ("$PRE...",...) are
    "--"  : ("post",79,False,("$PRE--","pre",78)), # used, otherwise the
    "*"   : (75,76,True),                          # data at the start
    "/"   : (75,76),                               # are used.
    "%"   : (75,76),                               # If the Boolean value
    "="   : (50,49),                               # is present and True,
    "<"   : (50,51),                               # the operator is flat
    ">"   : (50,51),                               # (e.g., '3 + 4 + a'
    "->"  : (90,25),                               # is parsed as
    ":="  : (41,40),                               # (+ 3 4 a) ).
    "+="  : (41,40),
    "not" : ("pre",33),                            # "not" is a prefix,
    "!"   : ("post",78,False,("$PRE!","pre",40)),  # "!" may be postfix
    "and" : (30,31,True)},                         # or prefix.
   {"["   : ("]",("$index",80,96),False),
    "?"   : (":",("$pre?",48,95),("$post?",94,48)),
    "do"  : ("while",False,("$postwhile",95,35)),
    "begin" : ("end",False,False),
    "[:"  : (":]",False,False)     # This line is all you need to
   }]                              # define a [: ... :] construct.

# The control structure definition scheme is
#     start_keyword : (end_keyword, pre_operator, post_operator)
# False means here: the corresponding operator does not exist.
# Operator entries: (name, left binding power, right binding power).
# pre_operator and post_operator connect the control structure with the
# corresponding pre-operand and post-operand, respectively.
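
# A small runnable demo (a sketch, not part of simplepcp_test): running
# this module directly parses the test strings above with synA.
if __name__ == "__main__" :
    for ts in (ts1,ts2,ts3,ts4,ts5,ts6) :
        print(ts, "  parses to  ", parse(ts,synA))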