add basic lexer

Also a draft grammar that I probably won't use.
This commit is contained in:
Luna 2019-03-08 23:43:17 -03:00
parent 3c983b004c
commit 588b63fabe
6 changed files with 183 additions and 50 deletions

0
jortsc/__init__.py Normal file
View file

10
jortsc/grammar Normal file
View file

@ -0,0 +1,10 @@
(* Draft EBNF grammar for jorts (numbers only so far). *)
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ;
(* Fixed: the rule previously required a literal leading "0" before the digits. *)
integer = ['-' | '+'] digit {digit} ;
(* Fixed: terminating ";" was missing on this rule. *)
hex_letters = "a" | "b" | "c" | "d" | "e" | "f" ;
hex_integer = "0x", {hex_letters | digit} ;
oct_digits = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" ;
octal_integer = "0o", {oct_digits} ;
(* Incomplete in the draft; left as-is. *)
program =

View file

@ -1,6 +1,7 @@
#!/usr/bin/python3
import sys
from parser.lexer import lex_jorts
def main():
"""main entry point"""
@ -9,8 +10,7 @@ def main():
except EOFError:
pass
# TODO: lol
print(in_data)
print(lex_jorts(in_data))
if __name__ == '__main__':
main()

View file

61
jortsc/parser/lexer.py Normal file
View file

@ -0,0 +1,61 @@
import re
import enum
class TokenType(enum.Enum):
    """Kinds of token the lexer can emit."""

    # auto() assigns 1, 2, ... in declaration order; keep the order stable.
    RESERVED = enum.auto()
    IDENTIFIER = enum.auto()
class LexerError(Exception):
    """Lexer error: raised when no token pattern matches the input."""
# Token definitions for jorts: (regex pattern, token type) pairs,
# tried in order at each position by lex().
# NOTE(review): r'fn' has no word boundary, so it also matches the
# leading 'fn' of a longer word such as 'fnord' -- confirm intended.
TOKENS = [
    (r'fn', TokenType.RESERVED),
]
def lex(string: str, token_defs: list) -> list:
    """Generate tokens out of the given string.

    Args:
        string: the source text to tokenize.
        token_defs: list of (regex pattern, token type) pairs, tried in
            order at each position; the first pattern that matches wins.

    Returns:
        A list of (matched_text, token_type) tuples.

    Raises:
        LexerError: when no pattern matches at the current position.
    """
    # Compile every pattern once, up front, instead of looking a compiled
    # regex up per token definition per scan step.
    compiled = [(re.compile(pattern), tok_type)
                for pattern, tok_type in token_defs]

    tokens = []
    pos = 0
    end = len(string)

    while pos < end:
        for regex, tok_type in compiled:
            match = regex.match(string, pos)
            # A zero-width match (e.g. pattern r'x*') would leave `pos`
            # unchanged and spin forever, so treat it as no match.
            if not match or match.end(0) == pos:
                continue
            tokens.append((match.group(0), tok_type))
            pos = match.end(0)
            break
        else:
            # No pattern consumed anything at this position.
            raise LexerError(f'Invalid character: {string[pos]}')

    return tokens
def lex_jorts(string: str) -> list:
    """Tokenize `string` using the jorts token definitions (TOKENS)."""
    jorts_tokens = lex(string, TOKENS)
    return jorts_tokens