import re
import enum


class TokenType(enum.Enum):
    """Kinds of lexical tokens the lexer can emit."""
    RESERVED = enum.auto()
    IDENTIFIER = enum.auto()


class LexerError(Exception):
    """Raised when the input contains text no token definition matches."""
    pass


# Token definitions: (regex pattern, token type), tried in order.
TOKENS = [
    (r'fn', TokenType.RESERVED),
]


def lex(string: str, token_defs: list) -> list:
    """Split *string* into a list of ``(text, token_type)`` tuples.

    Args:
        string: The source text to tokenize.
        token_defs: Ordered list of ``(pattern, TokenType)`` pairs; the
            first pattern that matches at the current position wins.

    Returns:
        List of ``(matched_text, token_type)`` tuples covering the input.

    Raises:
        LexerError: If no definition matches at some position.
    """
    # Compile every pattern once up front instead of per position; pairing
    # the compiled regex directly with its type also removes the per-token
    # dict lookup the previous version did.
    compiled = [(re.compile(pattern), tok_type)
                for pattern, tok_type in token_defs]

    tokens = []
    pos = 0
    end = len(string)
    while pos < end:
        # NOTE(review): a pattern that can match the empty string would
        # never advance `pos` and loop forever — token defs are assumed to
        # always consume at least one character.
        for regex, tok_type in compiled:
            match = regex.match(string, pos)
            if match:
                tokens.append((match.group(0), tok_type))
                pos = match.end(0)
                break
        else:
            # No definition matched at this position.
            raise LexerError(f'Invalid character: {string[pos]}')
    return tokens


def lex_jorts(string: str) -> list:
    """Lex with the jorts token definitions"""
    return lex(string, TOKENS)