From 588b63fabeea9f4f8b05fac8d97ce02474252ea0 Mon Sep 17 00:00:00 2001
From: Luna <git@l4.pm>
Date: Fri, 8 Mar 2019 23:43:17 -0300
Subject: [PATCH] add basic lexer

Also add a draft grammar that I probably won't use.
---
 .gitignore                | 158 ++++++++++++++++++++++++++------------
 jortsc/__init__.py        |   0
 jortsc/grammar            |  10 +++
 jortsc/main.py            |   4 +-
 jortsc/parser/__init__.py |   0
 jortsc/parser/lexer.py    |  61 +++++++++++++++
 6 files changed, 183 insertions(+), 50 deletions(-)
 create mode 100644 jortsc/__init__.py
 create mode 100644 jortsc/grammar
 create mode 100644 jortsc/parser/__init__.py
 create mode 100644 jortsc/parser/lexer.py

diff --git a/.gitignore b/.gitignore
index cd531cf..0447b8b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,54 +1,116 @@
-# ---> C
-# Prerequisites
-*.d
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
 
-# Object files
-*.o
-*.ko
-*.obj
-*.elf
-
-# Linker output
-*.ilk
-*.map
-*.exp
-
-# Precompiled Headers
-*.gch
-*.pch
-
-# Libraries
-*.lib
-*.a
-*.la
-*.lo
-
-# Shared objects (inc. Windows DLLs)
-*.dll
+# C extensions
 *.so
-*.so.*
-*.dylib
 
-# Executables
-*.exe
-*.out
-*.app
-*.i*86
-*.x86_64
-*.hex
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
 
-# Debug files
-*.dSYM/
-*.su
-*.idb
-*.pdb
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
 
-# Kernel Module Compile Results
-*.mod*
-*.cmd
-.tmp_versions/
-modules.order
-Module.symvers
-Mkfile.old
-dkms.conf
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
 
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/jortsc/__init__.py b/jortsc/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/jortsc/grammar b/jortsc/grammar
new file mode 100644
index 0000000..43938ba
--- /dev/null
+++ b/jortsc/grammar
@@ -0,0 +1,10 @@
+digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ;
+integer = ['-' | '+'] "0" digit {digit} ;
+
+hex_letters = "a" | "b" | "c" | "d" | "e" | "f" ;
+hex_integer = "0x", {hex_letters | digit} ;
+
+oct_digits = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" ;
+octal_integer = "0o", {oct_digits} ;
+
+program = 
diff --git a/jortsc/main.py b/jortsc/main.py
index e0c4e74..4e4b64f 100644
--- a/jortsc/main.py
+++ b/jortsc/main.py
@@ -1,6 +1,7 @@
 #!/usr/bin/python3
 
 import sys
+from parser.lexer import lex_jorts
 
 def main():
     """main entry point"""
@@ -9,8 +10,7 @@ def main():
     except EOFError:
         pass
 
-    # TODO: lol
-    print(in_data)
+    print(lex_jorts(in_data))
 
 if __name__ == '__main__':
     main()
diff --git a/jortsc/parser/__init__.py b/jortsc/parser/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/jortsc/parser/lexer.py b/jortsc/parser/lexer.py
new file mode 100644
index 0000000..bcd4fc7
--- /dev/null
+++ b/jortsc/parser/lexer.py
@@ -0,0 +1,61 @@
+import re
+import enum
+
+
+class TokenType(enum.Enum):
+    """Kind of a token; TOKENS pairs each regex pattern with one of these."""
+    RESERVED = enum.auto()  # reserved words; TOKENS maps r'fn' to this
+    IDENTIFIER = enum.auto()  # declared, but no TOKENS entry yields it yet
+
+
+class LexerError(Exception):
+    """Signals that lexing failed; lex() raises it when no token
+    definition matches at the current position."""
+
+
+TOKENS = [  # (regex pattern, TokenType) pairs; lex() tries them in order
+    (r'fn', TokenType.RESERVED),  # NOTE(review): also matches the start of longer words, e.g. "fnord" - confirm
+]
+
+
+def lex(string: str, token_defs: list) -> list:
+    """Split ``string`` into a list of ``(text, token_type)`` tuples.
+
+    ``token_defs`` is a list of ``(regex pattern, token type)`` pairs. At
+    each position the pairs are tried in order and the first pattern that
+    matches wins, so earlier entries take priority over later ones.
+
+    Raises LexerError when no definition matches at the current position.
+    """
+    # compile every pattern once up front instead of once per position.
+    compiled = [(re.compile(pattern), tok_type)
+                for pattern, tok_type in token_defs]
+
+    pos = 0
+    strlen = len(string)
+    tokens = []
+
+    while pos < strlen:
+        for regex, tok_type in compiled:
+            match = regex.match(string, pos)
+
+            # a zero-width match consumes no input; accepting it would leave
+            # pos unchanged and loop forever, so it counts as "no match".
+            if match is None or match.end(0) == pos:
+                continue
+
+            tokens.append((match.group(0), tok_type))
+            pos = match.end(0)
+
+            # first matching definition wins; restart from the new position.
+            break
+        else:
+            # the for loop ended without a break: nothing matched at pos.
+            raise LexerError(f'Invalid character: {string[pos]}')
+
+    return tokens
+
+
+def lex_jorts(string: str) -> list:
+    """Tokenize ``string`` against the module-level TOKENS table."""
+    return lex(string=string, token_defs=TOKENS)