jorts/jortsc/parser/lexer.py

import re
import enum


class TokenType(enum.Enum):
    """Defines the type of a token."""
    RESERVED = enum.auto()
    IDENTIFIER = enum.auto()


class LexerError(Exception):
    """Raised when no token definition matches the current position."""
    pass


# token definitions: (regex pattern, token type) pairs, tried in order.
# IDENTIFIER is declared above but has no pattern here yet.
TOKENS = [
    (r'fn', TokenType.RESERVED),
]
def lex(string: str, token_defs: list) -> list:
    """Generate tokens out of the given string."""
    pos = 0
    strlen = len(string)
    tokens = []

    # compile every pattern once up front instead of recompiling
    # each token definition for every token we scan
    compiled = {pattern: re.compile(pattern)
                for pattern, _ in token_defs}

    while pos < strlen:
        valid = False

        for pattern, tok_type in token_defs:
            regex = compiled[pattern]
            match = regex.match(string, pos)
            if not match:
                continue

            text = match.group(0)
            pos = match.end(0)
            valid = True
            tokens.append((text, tok_type))

            # this definition matched; scan the next token
            break

        if not valid:
            raise LexerError(f'Invalid character {string[pos]!r} at position {pos}')

    return tokens

def lex_jorts(string: str) -> list:
    """Lex with the jorts token definitions."""
    return lex(string, TOKENS)
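

# A minimal usage sketch, assuming the module is run directly: lex the
# single reserved word currently defined in TOKENS. With no whitespace
# or IDENTIFIER patterns defined yet, any other input raises LexerError.
if __name__ == '__main__':
    print(lex_jorts('fn'))  # [('fn', <TokenType.RESERVED: 1>)]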