forked from luna/jorts
62 lines
1.3 KiB
Python
62 lines
1.3 KiB
Python
|
import re
|
||
|
import enum
|
||
|
|
||
|
|
||
|
class TokenType(enum.Enum):
    """Defines the type of a token"""

    # Reserved word / keyword (e.g. 'fn' in TOKENS).
    RESERVED = enum.auto()
    # General identifier; no pattern in TOKENS produces this yet —
    # NOTE(review): presumably intended for a future name pattern, confirm.
    IDENTIFIER = enum.auto()
|
||
|
|
||
|
|
||
|
class LexerError(Exception):
    """Raised when the lexer cannot process its input."""
|
||
|
|
||
|
|
||
|
# Token definitions: (regex pattern, TokenType) pairs, tried in order.
# NOTE(review): only the 'fn' keyword is defined — there is no pattern for
# TokenType.IDENTIFIER or for whitespace, so any other input makes lex()
# raise LexerError. Confirm whether more definitions were intended.
TOKENS = [
    (r'fn', TokenType.RESERVED),
]
|
||
|
|
||
|
|
||
|
def lex(string: str, token_defs: list) -> list:
    """Generate tokens out of the given string.

    Args:
        string: the text to tokenize.
        token_defs: list of (regex pattern, token type) pairs. At each
            position the patterns are tried in order and the first one
            that matches wins.

    Returns:
        A list of (matched text, token type) tuples.

    Raises:
        LexerError: if no pattern matches at the current position.
    """
    pos = 0
    strlen = len(string)
    tokens = []

    # Compile each pattern once up front, keeping definition order,
    # instead of compiling (or looking up a dict) on every position.
    compiled = [(re.compile(pattern), tok_type)
                for pattern, tok_type in token_defs]

    while pos < strlen:
        valid = False

        for regex, tok_type in compiled:
            match = regex.match(string, pos)

            # Treat a zero-width match as no match: accepting it would
            # leave pos unchanged and loop forever.
            if not match or match.end(0) == pos:
                continue

            text = match.group(0)
            pos = match.end(0)

            valid = True
            tokens.append((text, tok_type))

            # go to next token instead of trying the remaining definitions
            break

        if not valid:
            raise LexerError(f'Invalid character: {string[pos]}')

    return tokens
|
||
|
|
||
|
|
||
|
def lex_jorts(string: str) -> list:
    """Tokenize *string* with the jorts token definitions (TOKENS)."""
    return lex(string, TOKENS)
|