# jorts/jortsc/parser/lexer.py

import re
import enum


class TokenType(enum.Enum):
    """Categories that a lexed token can be tagged with."""

    # Type paired with reserved-word patterns such as r'fn'.
    RESERVED = enum.auto()

    # Type for identifiers.
    IDENTIFIER = enum.auto()


class LexerError(Exception):
    """Raised by the lexer when the input cannot be tokenized."""


# Token definitions used by lex_jorts(): (regex pattern, token type) pairs.
# lex() tries them in list order and takes the first pattern that matches
# at the current position.
# NOTE(review): r'fn' has no word boundary, so it also matches the first
# two characters of longer words such as 'fnord' -- confirm this is intended.
TOKENS = [
    (r'fn', TokenType.RESERVED),
]


def lex(string: str, token_defs: list) -> list:
    """Generate tokens out of the given string.

    Args:
        string: The source text to tokenize.
        token_defs: Ordered ``(pattern, token_type)`` pairs. At every
            position the definitions are tried in order and the first
            one producing a non-empty match wins.

    Returns:
        A list of ``(text, token_type)`` tuples in source order. An empty
        input string yields an empty list.

    Raises:
        LexerError: If no definition matches at the current position.
    """
    pos = 0
    strlen = len(string)
    tokens = []

    # Compile every pattern once up front instead of once per token.
    compiled = [(re.compile(pattern), tok_type)
                for pattern, tok_type in token_defs]

    while pos < strlen:
        for regex, tok_type in compiled:
            match = regex.match(string, pos)

            # An empty match consumes no input; accepting it would leave
            # pos unchanged and loop forever, so treat it as "no match".
            if not match or match.end(0) == pos:
                continue

            tokens.append((match.group(0), tok_type))
            pos = match.end(0)

            # go to next token instead
            break
        else:
            # No definition consumed any input at this position.
            raise LexerError(f'Invalid character: {string[pos]}')

    return tokens


def lex_jorts(string: str) -> list:
    """Tokenize ``string`` using the module-level TOKENS definitions."""
    jorts_tokens = lex(string, token_defs=TOKENS)
    return jorts_tokens