From a11d21873aa9439786bef2da40334212b994a2bd Mon Sep 17 00:00:00 2001
From: Luna
Date: Sat, 9 Mar 2019 00:02:13 -0300
Subject: [PATCH] finish basic token list

---
 jortsc/main.py         |  6 ++++-
 jortsc/parser/lexer.py | 56 ++++++++++++++++++++++++++++++++++++------
 2 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/jortsc/main.py b/jortsc/main.py
index 4e4b64f..45c6aa8 100644
--- a/jortsc/main.py
+++ b/jortsc/main.py
@@ -1,6 +1,8 @@
 #!/usr/bin/python3
 import sys
+import pprint
+
 from parser.lexer import lex_jorts
 
 
 def main():
@@ -10,7 +12,9 @@ def main():
     except EOFError:
         pass
 
-    print(lex_jorts(in_data))
+    tokens = lex_jorts(in_data)
+    pprint.pprint(tokens)
+    print([t[0] for t in tokens])
 
 if __name__ == '__main__':
     main()
diff --git a/jortsc/parser/lexer.py b/jortsc/parser/lexer.py
index bcd4fc7..3a35a59 100644
--- a/jortsc/parser/lexer.py
+++ b/jortsc/parser/lexer.py
@@ -1,11 +1,16 @@
 import re
-import enum
+from enum import Enum, auto
 
 
-class TokenType(enum.Enum):
+class TokenType(Enum):
     """Defines the type of a token"""
-    RESERVED = enum.auto()
-    IDENTIFIER = enum.auto()
+    reserved = auto()
+    identifier = auto()
+    comment = auto()
+    comment_start = auto()
+    comment_end = auto()
+    whitespace = auto()
+    number = auto()
 
 
 class LexerError(Exception):
@@ -14,7 +19,36 @@ class LexerError(Exception):
 
 
 TOKENS = [
-    (r'fn', TokenType.RESERVED),
+    (r'[ \n\t]+', TokenType.whitespace),
+
+    # single-line comments, plus the multi-line comment delimiters
+    (r'//[^\n]*', TokenType.comment),
+
+    # TODO: shouldn't this be a single /* ... */ pattern instead
+    # of only tokenizing the start and end markers?
+    (r'/\*', TokenType.comment_start),
+    (r'\*/', TokenType.comment_end),
+
+    (r'fn\b', TokenType.reserved),
+    (r'if\b', TokenType.reserved),
+    (r'import\b', TokenType.reserved),
+
+    (r'\(', TokenType.reserved),
+    (r'\)', TokenType.reserved),
+
+    (r'\{', TokenType.reserved),
+    (r'\}', TokenType.reserved),
+
+    (r'->', TokenType.reserved),
+    (r'\.', TokenType.reserved),
+
+    (r'"[^"\n]*"', TokenType.reserved),
+
+    # basic math ops
+    (r'[+\-*/]', TokenType.reserved),
+
+    (r'[0-9]+', TokenType.number),
+    (r'[A-Za-z][A-Za-z0-9_]*', TokenType.identifier)
 ]
@@ -29,6 +63,9 @@ def lex(string: str, token_defs: list) -> list:
     compiled = {pattern: re.compile(pattern)
                 for pattern, _ in token_defs}
 
+    # a while loop instead of `for pos in range(len(string))`: we need
+    # to advance pos by a whole token's length each iteration, and that
+    # wouldn't be easy with a for ... in range(...)
     while pos < strlen:
         valid = False
@@ -42,16 +79,21 @@
                 continue
 
             text = match.group(0)
+
+            # update pos to the end of the token
             pos = match.end(0)
             valid = True
             tokens.append((text, tok_type))
 
-            # go to next token instead
+            # go to the next token instead of checking the remaining
+            # definitions, e.g. if it's a reserved token we shouldn't
+            # go down the path of an identifier.
             break
 
         if not valid:
-            raise LexerError(f'Invalid character: {string[pos]}')
+            print(f'context: {pos} {len(string)} {string[max(pos - 1, 0):pos + 20]!r}')
+            raise LexerError(f'Invalid character: {string[pos]!r}')
 
     return tokens
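
A possible answer to the TODO in the token list: a single non-greedy
pattern can capture a whole /* ... */ comment, which would make the
separate comment_start/comment_end types unnecessary. A minimal sketch,
not part of this patch and untested against the rest of the pipeline;
the inline (?s) flag is used because lex() compiles patterns without
extra flags:

    import re

    # (?s) turns on DOTALL so '.' also matches newlines; the non-greedy
    # .*? stops at the first '*/' instead of swallowing up to the last one
    MULTILINE_COMMENT = r'(?s)/\*.*?\*/'

    match = re.compile(MULTILINE_COMMENT).match('/* a\nb */ fn')
    assert match.group(0) == '/* a\nb */'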
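
The token definitions are order-sensitive: lex() tries them top to
bottom and breaks on the first match, so the keywords have to appear
before the identifier pattern (and the \b in the keyword patterns keeps
them from eating the start of identifiers like 'fnord'). A rough usage
sketch, assuming it runs from the jortsc/ directory like main.py does;
the exact reprs of the TokenType members will differ:

    from parser.lexer import TOKENS, TokenType, lex

    tokens = lex('fn main() -> io {}', TOKENS)
    print([t for t in tokens if t[1] is not TokenType.whitespace])
    # expected, roughly:
    # [('fn', reserved), ('main', identifier), ('(', reserved),
    #  (')', reserved), ('->', reserved), ('io', identifier),
    #  ('{', reserved), ('}', reserved)]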