forked from luna/jorts
62 lines
1.3 KiB
Python
62 lines
1.3 KiB
Python
|
import re
|
||
|
import enum
|
||
|
|
||
|
|
||
|
class TokenType(enum.Enum):
    """Defines the type of a token"""

    # Reserved word / keyword (e.g. 'fn' in TOKENS).
    RESERVED = enum.auto()
    # General identifier; no pattern in TOKENS produces this yet —
    # NOTE(review): presumably intended for a future name pattern, confirm.
    IDENTIFIER = enum.auto()
|
||
|
|
||
|
|
||
|
class LexerError(Exception):
    """Raised when the lexer cannot process its input."""
|
||
|
|
||
|
|
||
|
# Token definitions: (regex pattern, TokenType) pairs, tried in order.
# NOTE(review): only the 'fn' keyword is defined — there is no pattern for
# TokenType.IDENTIFIER or for whitespace, so any other input makes lex()
# raise LexerError. Confirm whether more definitions were intended.
TOKENS = [
    (r'fn', TokenType.RESERVED),
]
|
||
|
|
||
|
|
||
|
def lex(string: str, token_defs: list) -> list:
    """Generate tokens out of the given string.

    Args:
        string: the text to tokenize.
        token_defs: list of (regex pattern, token type) pairs. At each
            position the patterns are tried in order and the first one
            that matches wins.

    Returns:
        A list of (matched text, token type) tuples.

    Raises:
        LexerError: if no pattern matches at the current position.
    """
    pos = 0
    strlen = len(string)
    tokens = []

    # Compile each pattern once up front, keeping definition order,
    # instead of compiling (or looking up a dict) on every position.
    compiled = [(re.compile(pattern), tok_type)
                for pattern, tok_type in token_defs]

    while pos < strlen:
        valid = False

        for regex, tok_type in compiled:
            match = regex.match(string, pos)

            # Treat a zero-width match as no match: accepting it would
            # leave pos unchanged and loop forever.
            if not match or match.end(0) == pos:
                continue

            text = match.group(0)
            pos = match.end(0)

            valid = True
            tokens.append((text, tok_type))

            # go to next token instead of trying the remaining definitions
            break

        if not valid:
            raise LexerError(f'Invalid character: {string[pos]}')

    return tokens
|
||
|
|
||
|
|
||
|
def lex_jorts(string: str) -> list:
    """Tokenize *string* with the jorts token definitions (TOKENS)."""
    return lex(string, TOKENS)
|