jorts/jortsc/parser/lexer.py

import re
from dataclasses import dataclass
from enum import Enum, auto


class TokenType(Enum):
    """Defines the type of a token."""
    reserved = auto()
    identifier = auto()
    comment = auto()
    comment_start = auto()
    comment_end = auto()
    whitespace = auto()
    number = auto()


@dataclass
class Token:
    value: str
    type_: TokenType
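
# e.g. Token('fn', TokenType.reserved) or Token('3', TokenType.number)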


class LexerError(Exception):
    """Lexer error."""


TOKENS = [
    (r'[ \n\t]+', TokenType.whitespace),

    # single-line comments and multiline comments
    (r'//[^\n]*', TokenType.comment),

    # TODO: shouldn't this be /* <anything> */ instead of
    # only tokenizing on the start and end?
    (r'/\*', TokenType.comment_start),
    (r'\*/', TokenType.comment_end),

    # keywords; \b keeps e.g. 'if' from matching the start of an
    # identifier such as 'iffy'
    (r'fn\b', TokenType.reserved),
    (r'if\b', TokenType.reserved),
    (r'import\b', TokenType.reserved),

    (r'\(', TokenType.reserved),
    (r'\)', TokenType.reserved),
    (r'\{', TokenType.reserved),
    (r'\}', TokenType.reserved),
    (r'->', TokenType.reserved),
    (r'\.', TokenType.reserved),

    # string literals; [^"\n] keeps one match from swallowing two
    # strings that sit on the same line
    (r'"[^"\n]*"', TokenType.reserved),

    # basic math ops
    (r'[+\-/*]', TokenType.reserved),

    (r'[0-9]+', TokenType.number),
    (r'[A-Za-z][A-Za-z0-9_]*', TokenType.identifier),
]
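
# Note: lex() tries these patterns in order, so the keyword patterns
# must come before the identifier pattern -- otherwise 'fn' would be
# tokenized as an identifier.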


def lex(string: str, token_defs: list) -> list:
    """Generate tokens out of the given string."""
    pos = 0
    strlen = len(string)
    tokens = []

    # compile each pattern once up front instead of recompiling
    # every token definition for each token we extract
    compiled = {pattern: re.compile(pattern)
                for pattern, _ in token_defs}

    # we use a while loop instead of `for pos in range(strlen)`
    # because we need to jump pos forward by a whole token's length,
    # which a for .. in range(..) wouldn't let us do
    while pos < strlen:
        valid = False

        for pattern, tok_type in token_defs:
            regex = compiled[pattern]
            match = regex.match(string, pos)
            if not match:
                continue

            text = match.group(0)

            # move pos to the end of the token
            pos = match.end(0)
            valid = True
            tokens.append(Token(text, tok_type))

            # go to the next token instead of checking the remaining
            # definitions, e.g. if it's a reserved token we shouldn't
            # go down the path of an identifier
            break

        if not valid:
            print(f'context: {pos} {strlen} {string[max(pos - 1, 0):pos + 20]!r}')
            raise LexerError(f'Invalid character: {string[pos]!r}')

    return tokens


def lex_jorts(string: str) -> list:
    """Lex with the jorts token definitions."""
    return lex(string, TOKENS)
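

# Minimal usage sketch: lex a small snippet and print the non-whitespace
# tokens. The snippet is hypothetical jorts code, written here only to
# exercise the token definitions above.
if __name__ == '__main__':
    sample = 'fn main() -> io {\n    // say hi\n    "hello"\n}\n'
    for token in lex_jorts(sample):
        if token.type_ is not TokenType.whitespace:
            print(token)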