forked from luna/jorts
Luna
7ce0565de7
hell yeah i'm going down that path lark made confusing stuff, i'll probably get more confused with a handwritten one, but oh well, such is life
111 lines
2.7 KiB
Python
111 lines
2.7 KiB
Python
import re
|
|
|
|
from dataclasses import dataclass
|
|
from enum import Enum, auto
|
|
|
|
|
|
class TokenType(Enum):
    """Category attached to every token the lexer emits."""

    # Keywords, punctuation, operators and string literals in the jorts table.
    reserved = auto()

    # Names: a letter followed by letters, digits or underscores.
    identifier = auto()

    # A single-line ``//`` comment.
    comment = auto()

    # The ``/*`` marker that opens a multi-line comment.
    comment_start = auto()

    # The ``*/`` marker that closes a multi-line comment.
    comment_end = auto()

    # A run of spaces, newlines and tabs.
    whitespace = auto()

    # A run of decimal digits.
    number = auto()
|
|
|
|
|
|
@dataclass
class Token:
    """A single lexeme paired with the category it was matched as."""

    # The exact source text that the token pattern matched.
    value: str

    # The category of the matching token definition.
    type_: TokenType
|
|
|
|
|
|
class LexerError(Exception):
    """Raised when the input cannot be split into known tokens."""
|
|
|
|
|
|
# Token definitions for jorts as (regex, TokenType) pairs.
#
# Order matters: the lexer takes the FIRST pattern that matches at the
# current position, so more specific patterns must come before more
# general ones (e.g. ``//`` and ``/*`` before the ``/`` operator, ``->``
# before the ``-`` operator, keywords before identifiers).
TOKENS = [
    (r'[ \n\t]+', TokenType.whitespace),

    # single line comments and multiline comments
    (r'//[^\n]*', TokenType.comment),

    # TODO: shouldn't this be /* <anything> */ instead of
    # only tokenizing on the start and end?
    (r'/\*', TokenType.comment_start),
    (r'\*/', TokenType.comment_end),

    # Keywords. The trailing \b keeps them from swallowing the prefix of
    # a longer identifier (``iffy`` must not lex as ``if`` + ``fy``).
    (r'fn\b', TokenType.reserved),
    (r'if\b', TokenType.reserved),
    (r'import\b', TokenType.reserved),

    (r'\(', TokenType.reserved),
    (r'\)', TokenType.reserved),

    (r'\{', TokenType.reserved),
    (r'\}', TokenType.reserved),

    (r'\-\>', TokenType.reserved),
    (r'\.', TokenType.reserved),

    # String literal on a single line. The body excludes the quote
    # character so that two strings on the same line are not merged into
    # one token. Escape sequences are not handled.
    (r'\"[^\"\n]*\"', TokenType.reserved),

    # basic math ops
    (r'[\+\-\/\*]', TokenType.reserved),

    (r'[0-9]+', TokenType.number),
    (r'[A-Za-z][A-Za-z0-9_]*', TokenType.identifier),
]
|
|
|
|
|
|
def lex(string: str, token_defs: list) -> list:
    """Generate tokens out of the given string.

    At each position the definitions are tried in order and the first
    one that matches a non-empty piece of text wins, so earlier entries
    in ``token_defs`` take priority over later ones.

    Args:
        string: The source text to tokenize.
        token_defs: A list of ``(pattern, token_type)`` pairs, where
            ``pattern`` is a regular expression source string.

    Returns:
        A list of ``Token`` objects in source order, covering the whole
        input. An empty input gives an empty list.

    Raises:
        LexerError: If no definition matches at some position. Some
            context around the offending position is printed first.
    """
    # Compile every pattern once up front instead of once per token.
    matchers = [(re.compile(pattern), tok_type)
                for pattern, tok_type in token_defs]

    tokens = []
    pos = 0
    strlen = len(string)

    # A while loop is used because pos advances by a whole token length
    # at a time, which a plain range() loop cannot express.
    while pos < strlen:
        for regex, tok_type in matchers:
            match = regex.match(string, pos)

            # A zero-width match would leave pos unchanged and spin
            # forever, so it is treated the same as no match at all.
            if match is None or match.end(0) == pos:
                continue

            tokens.append(Token(match.group(0), tok_type))

            # Jump to the end of the token and stop checking the other
            # definitions, e.g. a reserved word must not also be tried
            # as an identifier.
            pos = match.end(0)
            break
        else:
            # No definition matched here. Clamp the context start so
            # that pos == 0 does not wrap around to a negative index.
            context_start = max(pos - 1, 0)
            print(f'context: {pos} {strlen} '
                  f'{string[context_start:pos+20]!r}')
            raise LexerError(f'Invalid character: {string[pos]!r}')

    return tokens
|
|
|
|
|
|
def lex_jorts(string: str) -> list:
    """Tokenize ``string`` using the module-level jorts ``TOKENS`` table."""
    jorts_tokens = lex(string, TOKENS)
    return jorts_tokens
|