From a11d21873aa9439786bef2da40334212b994a2bd Mon Sep 17 00:00:00 2001
From: Luna
Date: Sat, 9 Mar 2019 00:02:13 -0300
Subject: [PATCH] finish basic token list

---
 jortsc/main.py         |  6 ++++-
 jortsc/parser/lexer.py | 56 ++++++++++++++++++++++++++++++++++++------
 2 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/jortsc/main.py b/jortsc/main.py
index 4e4b64f..45c6aa8 100644
--- a/jortsc/main.py
+++ b/jortsc/main.py
@@ -1,6 +1,8 @@
 #!/usr/bin/python3
 import sys
+import pprint
+
 from parser.lexer import lex_jorts
 
 
 def main():
@@ -10,7 +12,9 @@ def main():
     except EOFError:
         pass
 
-    print(lex_jorts(in_data))
+    tokens = lex_jorts(in_data)
+    pprint.pprint(tokens)
+    print([t[0] for t in tokens])
 
 if __name__ == '__main__':
     main()
diff --git a/jortsc/parser/lexer.py b/jortsc/parser/lexer.py
index bcd4fc7..3a35a59 100644
--- a/jortsc/parser/lexer.py
+++ b/jortsc/parser/lexer.py
@@ -1,11 +1,16 @@
 import re
-import enum
+from enum import Enum, auto
 
 
-class TokenType(enum.Enum):
+class TokenType(Enum):
     """Defines the type of a token"""
-    RESERVED = enum.auto()
-    IDENTIFIER = enum.auto()
+    reserved = auto()
+    identifier = auto()
+    comment = auto()
+    comment_start = auto()
+    comment_end = auto()
+    whitespace = auto()
+    number = auto()
 
 
 class LexerError(Exception):
@@ -14,7 +19,36 @@ class LexerError(Exception):
 
 
 TOKENS = [
-    (r'fn', TokenType.RESERVED),
+    (r'[ \n\t]+', TokenType.whitespace),
+
+    # single-line comments, plus the multi-line comment delimiters
+    (r'//[^\n]*', TokenType.comment),
+
+    # TODO: shouldn't this be a single /* ... */ pattern instead
+    # of only tokenizing the start and end markers?
+    (r'/\*', TokenType.comment_start),
+    (r'\*/', TokenType.comment_end),
+
+    (r'fn\b', TokenType.reserved),
+    (r'if\b', TokenType.reserved),
+    (r'import\b', TokenType.reserved),
+
+    (r'\(', TokenType.reserved),
+    (r'\)', TokenType.reserved),
+
+    (r'\{', TokenType.reserved),
+    (r'\}', TokenType.reserved),
+
+    (r'->', TokenType.reserved),
+    (r'\.', TokenType.reserved),
+
+    (r'"[^"\n]*"', TokenType.reserved),
+
+    # basic math ops
+    (r'[+\-*/]', TokenType.reserved),
+
+    (r'[0-9]+', TokenType.number),
+    (r'[A-Za-z][A-Za-z0-9_]*', TokenType.identifier)
 ]
@@ -29,6 +63,9 @@ def lex(string: str, token_defs: list) -> list:
     compiled = {pattern: re.compile(pattern)
                 for pattern, _ in token_defs}
 
+    # a while loop instead of `for pos in range(len(string))`: we need
+    # to advance pos by a whole token's length each iteration, and that
+    # wouldn't be easy with a for ... in range(...)
     while pos < strlen:
         valid = False
@@ -42,16 +79,21 @@
                 continue
 
             text = match.group(0)
+
+            # update pos to the end of the token
             pos = match.end(0)
             valid = True
             tokens.append((text, tok_type))
 
-            # go to next token instead
+            # go to the next token instead of checking the remaining
+            # definitions, e.g. if it's a reserved token we shouldn't
+            # go down the path of an identifier.
             break
 
         if not valid:
-            raise LexerError(f'Invalid character: {string[pos]}')
+            print(f'context: {pos} {len(string)} {string[max(pos - 1, 0):pos + 20]!r}')
+            raise LexerError(f'Invalid character: {string[pos]!r}')
 
     return tokens
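
A possible answer to the TODO in the token list: a single non-greedy
pattern can capture a whole /* ... */ comment, which would make the
separate comment_start/comment_end types unnecessary. A minimal sketch,
not part of this patch and untested against the rest of the pipeline;
the inline (?s) flag is used because lex() compiles patterns without
extra flags:

    import re

    # (?s) turns on DOTALL so '.' also matches newlines; the non-greedy
    # .*? stops at the first '*/' instead of swallowing up to the last one
    MULTILINE_COMMENT = r'(?s)/\*.*?\*/'

    match = re.compile(MULTILINE_COMMENT).match('/* a\nb */ fn')
    assert match.group(0) == '/* a\nb */'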
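
The token definitions are order-sensitive: lex() tries them top to
bottom and breaks on the first match, so the keywords have to appear
before the identifier pattern (and the \b in the keyword patterns keeps
them from eating the start of identifiers like 'fnord'). A rough usage
sketch, assuming it runs from the jortsc/ directory like main.py does;
the exact reprs of the TokenType members will differ:

    from parser.lexer import TOKENS, TokenType, lex

    tokens = lex('fn main() -> io {}', TOKENS)
    print([t for t in tokens if t[1] is not TokenType.whitespace])
    # expected, roughly:
    # [('fn', reserved), ('main', identifier), ('(', reserved),
    #  (')', reserved), ('->', reserved), ('io', identifier),
    #  ('{', reserved), ('}', reserved)]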