# lexer for perl
#
# Layout: a code section (Python helpers shared by the rule actions), a
# definitions section (named sub-patterns), then one section per lexer state.
# Actions read the matched text from self.value and may replace it with a
# Token; a bare return yields no token.

code:
    from positions import PositionFactory
    from tokens import Token
    from sys import exit
    from perlnumber import convert as perlNumber

    # Position tracker shared by every rule action below.
    pos = PositionFactory()

    # Report a fatal lexing error at the given position and stop the process
    # with exit status 1.  The pos parameter shadows the module-level tracker.
    def Error(msg, pos):
        # Single parenthesised argument: prints the same text on Python 2
        # and also parses on Python 3.
        print('ERROR: %s: %s' % (pos, msg))
        exit(1)

    # Return True when end is the closing delimiter that pairs with the
    # opening delimiter begin ('/' pairs with itself), False otherwise.
    def DelimMatch(begin, end):
        return (begin, end) in (('/', '/'), ('(', ')'), ('{', '}'), ('[', ']'))

    # Build the token for a numeric literal: a NUMBER token holding the
    # converted value, or an ERROR token holding the raw text when
    # perlNumber raises ValueError.
    def _NumberToken(text):
        try:
            return Token('NUMBER', pos.get(), perlNumber(text))
        except ValueError:
            return Token('ERROR', pos.get(), text)

definitions:
    NUMBER "0|([1-9][[:digit:]_\.]*)"
    HEXA "(0x)[[:xdigit:]_]+"
    OCTAL "0[0-7_]+"
    BINARY "0b[01_]+"
    OPERATORS "[\-\+\*/=\<\>\|]|<=|>=|\!="
    # NOTE(review): the bare $ alternative is unescaped; confirm the generator
    # treats it as a literal dollar sign and not as an anchor.
    SYMBOLS ":|;|,|\.|==|=>|::|\(|\)|\{|\}|\[|\]|\?|\\|$|@|%|&"
    NAME "[[:alpha:]_][[:alnum:]_]*"

INITIAL:
    # NOTE(review): '<>' is presumably the end-of-input pattern (the LITERAL
    # and STRING states report EOF from it); confirm the spelling against the
    # lexer generator's documentation.
    "<>":
        pos.mark()
        self.value = Token('$EOF$', pos.get(), self.value)
        return '$EOF$'
    "{OPERATORS}":
        pos.colAdd(len(self.value))
        self.value = Token('OPERATORS', pos.get(), self.value)
        return self.value.token
    # q, qq, qw or qx followed by an opening delimiter: start collecting a
    # literal.  The delimiter is the last matched character.
    "q(q|w|x)?[\(\{\[\/]":
        pos.colAdd(len(self.value))
        self.stringbuf = ''
        self.nestlvl = 0
        self.delim = self.value[-1]
        self.PUSHSTATE(LITERAL)
        return
    "{NAME}":
        pos.colAdd(len(self.value))
        self.value = Token('NAME', pos.get(), self.value)
        # When the token type comes back as 'END', the rest of the input is
        # consumed by the FOOTER state.
        # NOTE(review): this relies on Token changing the type it was given;
        # verify in tokens.Token.
        if self.value.token == 'END':
            self.PUSHSTATE(FOOTER)
        return self.value.token
    "{NUMBER}":
        pos.colAdd(len(self.value))
        self.value = _NumberToken(self.value)
        return self.value.token
    "{HEXA}":
        pos.colAdd(len(self.value))
        self.value = _NumberToken(self.value)
        return self.value.token
    "{OCTAL}":
        pos.colAdd(len(self.value))
        self.value = _NumberToken(self.value)
        return self.value.token
    "{BINARY}":
        pos.colAdd(len(self.value))
        self.value = _NumberToken(self.value)
        return self.value.token
    "[[:blank:]]+":
        pos.colAdd(len(self.value))
        return
    "\#":
        pos.colAdd(len(self.value))
        self.stringbuf = ''
        self.PUSHSTATE(COMMENT)
        return
    # Opening quote: remember which quote character must close the string.
    "['\"]":
        pos.colAdd(len(self.value))
        self.stringsep = self.value
        self.stringbuf = ''
        self.PUSHSTATE(STRING)
        return
    # Newlines only update the position; no NEWLINE token is emitted.
    "[\n]+":
        pos.lineAdd(len(self.value))
        pos.mark()
        return
    "{SYMBOLS}":
        pos.colAdd(len(self.value))
        self.value = Token('SYMBOLS', pos.get(), self.value)
        return self.value.token

# state entered after an END token: everything up to end of input is discarded
FOOTER:
    "<>":
        self.POPSTATE()
        return
    "[^\n]+":
        return
    "[\n]+":
        return

# state used to record a literal
LITERAL:
    "<>":
        Error('EOF reached within literal', pos.get())
        return
    "[\n]+":
        pos.lineAdd(len(self.value))
        self.stringbuf += self.value
        return
    # A backslash-escaped character is copied verbatim (backslash included,
    # as the STRING state does) and never counts as an opening or closing
    # delimiter.  Listed first so it takes precedence over the rules below.
    "\\.":
        pos.colAdd(len(self.value))
        self.stringbuf += self.value
        return
    # Another occurrence of the opening delimiter deepens the nesting.
    "[\(\[\{]":
        pos.colAdd(len(self.value))
        self.stringbuf += self.value
        if self.delim == self.value:
            self.nestlvl += 1
        return
    "[\)\]\}\/]":
        pos.colAdd(len(self.value))
        if DelimMatch(self.delim, self.value):
            if self.nestlvl == 0:
                # Matching closer at the outermost level ends the literal;
                # the closer itself is not part of the token value.
                self.value = Token('LITERAL', pos.get(), self.stringbuf)
                del self.stringbuf, self.nestlvl, self.delim
                self.POPSTATE()
                return 'LITERAL'
            self.nestlvl -= 1
        # Nested or non-matching closers stay in the literal text.
        self.stringbuf += self.value
        return
    ".":
        pos.colAdd(len(self.value))
        self.stringbuf += self.value
        return

# state used to record a comment up to the end of its line
COMMENT:
    # NOTE(review): at end of input the buffered comment text is dropped
    # without producing a COMMENT token.
    "<>":
        self.POPSTATE()
        return
    "\n":
        pos.lineAdd(1)
        pos.mark()
        self.value = Token('COMMENT', pos.get(), self.stringbuf)
        del(self.stringbuf)
        self.POPSTATE()
        return 'COMMENT'
    "[^\n]+":
        pos.colAdd(len(self.value))
        self.stringbuf += self.value
        return

# state used to record a quoted string
STRING:
    "<>":
        Error('EOF reached within string', pos.get())
        return
    "\n":
        Error('New line reached within string', pos.get())
        return
    # Backslash plus the following character (if any) is kept verbatim.
    "\\.?":
        pos.colAdd(len(self.value))
        self.stringbuf += self.value
        return
    "[\'\"]":
        pos.colAdd(len(self.value))
        # The other kind of quote is ordinary text inside the string.
        if self.value != self.stringsep:
            self.stringbuf += self.value
            return
        self.value = Token('STRING', pos.get(), self.stringbuf)
        del(self.stringbuf)
        self.POPSTATE()
        return 'STRING'
    "[^\\\"\'\n]+":
        pos.colAdd(len(self.value))
        self.stringbuf += self.value
        return