# Raw tokenizer
# -------------
# The raw tokenizer extracts tokens one by one from a string that is
# meant to contain an expression.
# The word 'raw' is used to emphasize that this tokenizer does not take
# care of token insertion, token replacements or other high level tasks.
# The raw tokenizer consists of the class RawTokenizer. Initialization
# requires preprocessed syntax data (ops, cons, ends) and the string to
# be tokenized. The method nextRaw() will proceed to the next raw token
# and return this token. The utility method printRawTokens() can be
# used to display the token sequence as created by the raw tokenizer.
# The raw tokenizer is meant to be used by the class Tokenizer in the
# module simplepcp.
# Regexps (regular expressions) are not used for tokenization.
# See also documentation in simplepcp.
# Licence (2-clause BSD License) (see minimalpcp).
# Copyright 2017 JoeCreu
# Python versions 3.* should work.
# JoeCreu, 2017-06-07


class RawTokenizer:
    """Extract raw tokens one by one from an expression string.

    'Raw' means that no token insertion, token replacement or other
    high-level processing happens here; that is the job of the class
    Tokenizer in the simplepcp module.  Regular expressions are not
    used for tokenization.
    """

    def __init__(self, ops, cons, ends, exstr):
        # Methods whose names start with an underscore are considered
        # private.
        #
        # ops        -- operator names from the preprocessed syntax
        # cons, ends -- keywords from the preprocessed syntax
        #               (presumably construct openers and closers;
        #               confirm against simplepcp)
        # exstr      -- the string to be tokenized
        self.exstr = exstr
        self.length = len(self.exstr)
        self.Operators = set(ops)
        self.Keywords = set(cons).union(set(ends))
        self.allopskeys = self.Operators.union(self.Keywords)
        # Operators and keywords made of special characters (i.e. NOT
        # starting like an identifier: letter, "_" or "$") are matched
        # longest-first, so collect them and sort by length, descending.
        # specialChars is the set of all characters occurring in any of
        # these special tokens.
        self.specialopskeys = [ok for ok in self.allopskeys
                               if not (ok[0].isalpha() or ok[0] in ("_", "$"))]
        self.specialopskeys.sort(key=len, reverse=True)
        self.specialChars = set("".join(self.specialopskeys))
        # Initial scan state: current position in exstr, and the
        # not-yet-consumed remainder of a run of special characters.
        self.pos = 0
        self.specialbuf = ""

    @staticmethod
    def _isIdChar(c):
        # Return True if c may appear inside an identifier (after the
        # first character).
        return c.isalpha() or c.isdigit() or c in ("_", "$")

    def _nextSpecial(self):
        # Split the next operator/keyword off the buffered run of
        # special characters.  Longest match wins, thanks to the
        # descending length sort done in __init__.
        for s in self.specialopskeys:
            if self.specialbuf.startswith(s):
                self.specialbuf = self.specialbuf[len(s):]
                return s
        # No known token is a prefix of the buffer: report and discard.
        print("error in substring of specialchars: ", self.specialbuf)
        self.specialbuf = ""
        return "error"

    def _parseNumber(self):
        # Scan an int or float literal starting at self.pos.  Accepts
        # an optional decimal point and an optional exponent part
        # ('e'/'E' with optional sign).  Returns an int when neither a
        # point nor an exponent is present, otherwise a float.
        bpos = self.pos
        pointfound = (self.exstr[bpos] == ".")
        expfound = False
        expdigitfound = False
        self.pos += 1
        while self.pos < self.length:
            nc = self.exstr[self.pos]
            if nc.isdigit():
                expdigitfound = expdigitfound or expfound
            elif nc == "." and not pointfound and not expfound:
                pointfound = True
            elif nc in ("e", "E") and not expfound:
                expfound = True
            elif nc in ("+", "-") and self.exstr[self.pos - 1] in ("e", "E"):
                pass  # exponent sign, valid only directly after the 'e'
            else:
                break
            self.pos += 1
        # A trailing 'e'/'E' (possibly followed by a sign) is not part
        # of the number: back up so those characters are tokenized
        # separately.  This also keeps float() from failing on inputs
        # like "3e" or "3e+".
        if self.exstr[self.pos - 1] in ("e", "E"):
            self.pos -= 1
            expfound = False
        elif (self.exstr[self.pos - 1] in ("+", "-")
              and self.exstr[self.pos - 2] in ("e", "E")):
            self.pos -= 2
            expfound = False
        if pointfound or expfound:
            if expfound and not expdigitfound:
                # Defensive: the backtracking above should make this
                # unreachable.
                print("error in number")
            return float(self.exstr[bpos:self.pos])
        return int(self.exstr[bpos:self.pos])

    def nextRaw(self):
        """Proceed to the next raw token and return it.

        Possible return values: an identifier/keyword string, an int or
        float, an operator/keyword made of special characters, "$END"
        once the input is exhausted, or "error" on invalid input.
        """
        # A pending run of special characters is consumed first.
        if self.specialbuf != "":
            return self._nextSpecial()
        while self.pos < self.length and self.exstr[self.pos].isspace():
            self.pos += 1
        if self.pos >= self.length:
            return "$END"
        apos = self.pos
        sc = self.exstr[apos]
        if sc.isalpha() or sc in ("_", "$"):
            # Identifier or keyword.  "_" and "$" are valid start
            # characters, consistent with _isIdChar and with __init__,
            # which treats tokens starting with them as identifier-like.
            self.pos += 1
            while self.pos < self.length and self._isIdChar(self.exstr[self.pos]):
                self.pos += 1
            return self.exstr[apos:self.pos]
        if sc.isdigit() or (sc == "." and apos + 1 < self.length
                            and self.exstr[apos + 1].isdigit()):
            return self._parseNumber()
        if sc in self.specialChars:
            # Collect the whole run of special characters; _nextSpecial
            # then splits it into operators/keywords, longest first.
            self.pos += 1
            while self.pos < self.length and self.exstr[self.pos] in self.specialChars:
                self.pos += 1
            self.specialbuf = self.exstr[apos:self.pos]
            return self._nextSpecial()
        print("Invalid character", sc)
        self.pos += 1
        return "error"

    def printRawTokens(self):
        # Utility method; prints all raw tokens up to and including
        # the final "$END".
        t = self.nextRaw()
        while t != "$END":
            print(t, end=" ")
            t = self.nextRaw()
        print(t)