# Raw tokenizer
# -------------
# The raw tokenizer extracts tokens one by one from a string that is
# meant to contain an expression.
# The word 'raw' is used to emphasize that this tokenizer does not take
# care of token insertion, token replacements or other high level tasks.
# The raw tokenizer consists of the class RawTokenizer. Initialization
# requires preprocessed syntax data (ops, cons, ends) and the string to
# be tokenized. The method nextRaw() will proceed to the next raw token
# and return this token. The utility method printRawTokens() can be
# used to display the token sequence as created by the raw tokenizer.
# The raw tokenizer is meant to be used by the class Tokenizer in the
# module simplepcp.
# Regexps (regular expressions) are not used for tokenization.
# See also documentation in simplepcp.
# Licence (2-clause BSD License) (see minimalpcp).
# Copyright 2017 JoeCreu
# Python versions 3.* should work.
# JoeCreu, 2017-06-07


class RawTokenizer:
    """Extract raw tokens one by one from an expression string.

    'Raw' means that no token insertion, token replacement or other
    high-level processing happens here; that is the job of the class
    Tokenizer in the simplepcp module.  Regular expressions are not
    used for tokenization.
    """

    def __init__(self, ops, cons, ends, exstr):
        # Methods whose names start with an underscore are considered
        # private.
        #
        # ops        -- operator names from the preprocessed syntax
        # cons, ends -- keywords from the preprocessed syntax
        #               (presumably construct openers and closers;
        #               confirm against simplepcp)
        # exstr      -- the string to be tokenized
        self.exstr = exstr
        self.length = len(self.exstr)
        self.Operators = set(ops)
        self.Keywords = set(cons).union(set(ends))
        self.allopskeys = self.Operators.union(self.Keywords)
        # Operators and keywords made of special characters (i.e. NOT
        # starting like an identifier: letter, "_" or "$") are matched
        # longest-first, so collect them and sort by length, descending.
        # specialChars is the set of all characters occurring in any of
        # these special tokens.
        self.specialopskeys = [ok for ok in self.allopskeys
                               if not (ok[0].isalpha() or ok[0] in ("_", "$"))]
        self.specialopskeys.sort(key=len, reverse=True)
        self.specialChars = set("".join(self.specialopskeys))
        # Initial scan state: current position in exstr, and the
        # not-yet-consumed remainder of a run of special characters.
        self.pos = 0
        self.specialbuf = ""

    @staticmethod
    def _isIdChar(c):
        # Return True if c may appear inside an identifier (after the
        # first character).
        return c.isalpha() or c.isdigit() or c in ("_", "$")

    def _nextSpecial(self):
        # Split the next operator/keyword off the buffered run of
        # special characters.  Longest match wins, thanks to the
        # descending length sort done in __init__.
        for s in self.specialopskeys:
            if self.specialbuf.startswith(s):
                self.specialbuf = self.specialbuf[len(s):]
                return s
        # No known token is a prefix of the buffer: report and discard.
        print("error in substring of specialchars: ", self.specialbuf)
        self.specialbuf = ""
        return "error"

    def _parseNumber(self):
        # Scan an int or float literal starting at self.pos.  Accepts
        # an optional decimal point and an optional exponent part
        # ('e'/'E' with optional sign).  Returns an int when neither a
        # point nor an exponent is present, otherwise a float.
        bpos = self.pos
        pointfound = (self.exstr[bpos] == ".")
        expfound = False
        expdigitfound = False
        self.pos += 1
        while self.pos < self.length:
            nc = self.exstr[self.pos]
            if nc.isdigit():
                expdigitfound = expdigitfound or expfound
            elif nc == "." and not pointfound and not expfound:
                pointfound = True
            elif nc in ("e", "E") and not expfound:
                expfound = True
            elif nc in ("+", "-") and self.exstr[self.pos - 1] in ("e", "E"):
                pass  # exponent sign, valid only directly after the 'e'
            else:
                break
            self.pos += 1
        # A trailing 'e'/'E' (possibly followed by a sign) is not part
        # of the number: back up so those characters are tokenized
        # separately.  This also keeps float() from failing on inputs
        # like "3e" or "3e+".
        if self.exstr[self.pos - 1] in ("e", "E"):
            self.pos -= 1
            expfound = False
        elif (self.exstr[self.pos - 1] in ("+", "-")
              and self.exstr[self.pos - 2] in ("e", "E")):
            self.pos -= 2
            expfound = False
        if pointfound or expfound:
            if expfound and not expdigitfound:
                # Defensive: the backtracking above should make this
                # unreachable.
                print("error in number")
            return float(self.exstr[bpos:self.pos])
        return int(self.exstr[bpos:self.pos])

    def nextRaw(self):
        """Proceed to the next raw token and return it.

        Possible return values: an identifier/keyword string, an int or
        float, an operator/keyword made of special characters, "$END"
        once the input is exhausted, or "error" on invalid input.
        """
        # A pending run of special characters is consumed first.
        if self.specialbuf != "":
            return self._nextSpecial()
        while self.pos < self.length and self.exstr[self.pos].isspace():
            self.pos += 1
        if self.pos >= self.length:
            return "$END"
        apos = self.pos
        sc = self.exstr[apos]
        if sc.isalpha() or sc in ("_", "$"):
            # Identifier or keyword.  "_" and "$" are valid start
            # characters, consistent with _isIdChar and with __init__,
            # which treats tokens starting with them as identifier-like.
            self.pos += 1
            while self.pos < self.length and self._isIdChar(self.exstr[self.pos]):
                self.pos += 1
            return self.exstr[apos:self.pos]
        if sc.isdigit() or (sc == "." and apos + 1 < self.length
                            and self.exstr[apos + 1].isdigit()):
            return self._parseNumber()
        if sc in self.specialChars:
            # Collect the whole run of special characters; _nextSpecial
            # then splits it into operators/keywords, longest first.
            self.pos += 1
            while self.pos < self.length and self.exstr[self.pos] in self.specialChars:
                self.pos += 1
            self.specialbuf = self.exstr[apos:self.pos]
            return self._nextSpecial()
        print("Invalid character", sc)
        self.pos += 1
        return "error"

    def printRawTokens(self):
        # Utility method; prints all raw tokens up to and including
        # the final "$END".
        t = self.nextRaw()
        while t != "$END":
            print(t, end=" ")
            t = self.nextRaw()
        print(t)