forked from luna/jorts
add basic lexer
also a draft grammar that I probably won't use
This commit is contained in:
parent
3c983b004c
commit
588b63fabe
6 changed files with 183 additions and 50 deletions
0
jortsc/parser/__init__.py
Normal file
0
jortsc/parser/__init__.py
Normal file
61
jortsc/parser/lexer.py
Normal file
61
jortsc/parser/lexer.py
Normal file
|
@ -0,0 +1,61 @@
|
|||
import re
|
||||
import enum
|
||||
|
||||
|
||||
class TokenType(enum.Enum):
    """Kind of a lexed token.

    Member order is significant: ``enum.auto()`` assigns values from
    declaration order, so new kinds should be appended, not inserted.
    """

    # language keywords (e.g. 'fn')
    RESERVED = enum.auto()
    # user-defined names
    IDENTIFIER = enum.auto()
|
||||
|
||||
|
||||
class LexerError(Exception):
    """Raised when the input contains a character no token pattern matches."""
    # NOTE: the redundant `pass` after the docstring was removed; the
    # docstring alone is a complete class body.
|
||||
|
||||
|
||||
# Token definitions for the jorts language, as (regex, token type) pairs.
# Order matters: `lex` tries patterns first-to-last, so keywords must come
# before the identifier pattern. The `\b` anchor stops `fn` from matching
# the prefix of an identifier such as `fnord`.
# TODO: no whitespace handling yet — `lex` has no "ignore" mechanism.
TOKENS = [
    (r'fn\b', TokenType.RESERVED),
    (r'[A-Za-z_][A-Za-z0-9_]*', TokenType.IDENTIFIER),
]
|
||||
|
||||
|
||||
def lex(string: str, token_defs: list) -> list:
    """Generate tokens out of the given string.

    *token_defs* is a list of ``(pattern, token_type)`` pairs; patterns
    are tried first-to-last at each position, and the first match wins.
    Returns a list of ``(text, token_type)`` tuples.

    Raises LexerError when no pattern matches at the current position.
    """
    # compile every pattern once up front instead of per position
    compiled = {pattern: re.compile(pattern) for pattern, _ in token_defs}

    tokens = []
    cursor = 0
    end = len(string)

    while cursor < end:
        for pattern, tok_type in token_defs:
            match = compiled[pattern].match(string, cursor)
            if match is None:
                continue

            tokens.append((match.group(0), tok_type))
            cursor = match.end(0)
            break
        else:
            # no definition matched the character at the cursor
            raise LexerError(f'Invalid character: {string[cursor]}')

    return tokens
|
||||
|
||||
|
||||
def lex_jorts(string: str) -> list:
    """Tokenize *string* using the jorts language token definitions."""
    return lex(string, token_defs=TOKENS)
|
Loading…
Add table
Add a link
Reference in a new issue