import re

from dataclasses import dataclass
from enum import Enum, auto


class TokenType(Enum):
    """Defines the type of a token."""
    reserved = auto()
    identifier = auto()
    comment = auto()
    comment_start = auto()
    comment_end = auto()
    whitespace = auto()
    number = auto()
    string = auto()


@dataclass
class Token:
    value: str
    type_: TokenType


class LexerError(Exception):
    """Lexer error."""
    pass


TOKENS = [
    (r'[ \n\t]+', TokenType.whitespace),

    # single-line comments and multiline comments
    (r'//[^\n]*', TokenType.comment),

    # TODO: shouldn't this be a single /* ... */ pattern instead of
    # only tokenizing the start and end markers?
    (r'/\*', TokenType.comment_start),
    (r'\*/', TokenType.comment_end),

    # keywords; \b keeps them from swallowing the start of identifiers
    # such as "iffy" or "important".
    (r'fn\b', TokenType.reserved),
    (r'if\b', TokenType.reserved),
    (r'import\b', TokenType.reserved),

    (r'\(', TokenType.reserved),
    (r'\)', TokenType.reserved),
    (r'\{', TokenType.reserved),
    (r'\}', TokenType.reserved),
    (r'\-\>', TokenType.reserved),
    (r'\.', TokenType.reserved),

    # string literals; disallowing inner quotes keeps two strings on the
    # same line from being merged into one token by a greedy match.
    (r'\"[^"\n]*\"', TokenType.string),

    # basic math ops
    (r'[\+\-\/\*]', TokenType.reserved),

    (r'[0-9]+', TokenType.number),
    (r'[A-Za-z][A-Za-z0-9_]*', TokenType.identifier)
]


def lex(string: str, token_defs: list) -> list:
    """Generate tokens out of the given string."""
    pos = 0
    strlen = len(string)
    tokens = []

    # compile each pattern once up front instead of compiling it again
    # for every token we try to match.
    compiled = {pattern: re.compile(pattern) for pattern, _ in token_defs}

    # we use a while loop instead of `for pos in range(len(string))` because
    # we need to advance pos by a whole token's length, and that wouldn't
    # be easy with a for .. in range(..)
    while pos < strlen:
        valid = False

        for pattern, tok_type in token_defs:
            regex = compiled[pattern]

            match = regex.match(string, pos)
            if not match:
                continue

            text = match.group(0)

            # update pos to the end of the token
            pos = match.end(0)
            valid = True
            tokens.append(Token(text, tok_type))

            # go to the next token instead of checking the remaining
            # definitions, e.g. if it's a reserved token we shouldn't
            # also try to lex it as an identifier.
            break

        if not valid:
            print(f'context: {pos} {len(string)} {string[pos-1:pos+20]!r}')
            raise LexerError(f'Invalid character: {string[pos]!r}')

    return tokens


def lex_jorts(string: str) -> list:
    """Lex with the jorts token definitions."""
    return lex(string, TOKENS)
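

if __name__ == '__main__':
    # Minimal usage sketch (illustrative only): the snippet below is a made-up
    # jorts fragment, not taken from the language's real sources. It just shows
    # the shape of the tokens that lex_jorts produces.
    source = 'fn main() -> A { a.b("hi") + 2 }'
    for token in lex_jorts(source):
        if token.type_ is not TokenType.whitespace:
            print(token)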