add basic lexer

also a draft grammar that i won't use, probably
This commit is contained in:
Luna 2019-03-08 23:43:17 -03:00
parent 3c983b004c
commit 588b63fabe
6 changed files with 183 additions and 50 deletions

.gitignore

@@ -1,54 +1,116 @@
# ---> C
# Prerequisites
*.d
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# Object files
*.o
*.ko
*.obj
*.elf
# Linker output
*.ilk
*.map
*.exp
# Precompiled Headers
*.gch
*.pch
# Libraries
*.lib
*.a
*.la
*.lo
# Shared objects (inc. Windows DLLs)
*.dll
# C extensions
*.so
*.so.*
*.dylib
# Executables
*.exe
*.out
*.app
*.i*86
*.x86_64
*.hex
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# Debug files
*.dSYM/
*.su
*.idb
*.pdb
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Kernel Module Compile Results
*.mod*
*.cmd
.tmp_versions/
modules.order
Module.symvers
Mkfile.old
dkms.conf
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/

jortsc/__init__.py Normal file

jortsc/grammar Normal file

@@ -0,0 +1,10 @@
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ;
integer = ["-" | "+"], digit, {digit} ;

hex_letters = "a" | "b" | "c" | "d" | "e" | "f" ;
hex_integer = "0x", {hex_letters | digit} ;

oct_digits = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" ;
octal_integer = "0o", {oct_digits} ;

program =
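As a quick sanity check, the three literal rules above map directly onto regular expressions. A minimal Python sketch (pattern names and test strings are illustrative, not part of the commit; note that {} in EBNF means zero-or-more):

import re

# illustrative regexes mirroring the draft EBNF rules above
INTEGER = re.compile(r'[-+]?[0-9]+')      # integer
HEX_INTEGER = re.compile(r'0x[0-9a-f]*')  # hex_integer
OCTAL_INTEGER = re.compile(r'0o[0-7]*')   # octal_integer

assert INTEGER.fullmatch('-42')
assert HEX_INTEGER.fullmatch('0xff')
assert OCTAL_INTEGER.fullmatch('0o755')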

@@ -1,6 +1,7 @@
 #!/usr/bin/python3
 import sys
+from parser.lexer import lex_jorts


 def main():
     """main entry point"""
@@ -9,8 +10,7 @@ def main():
     except EOFError:
         pass

-    # TODO: lol
-    print(in_data)
+    print(lex_jorts(in_data))

 if __name__ == '__main__':
     main()
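With the import wired in, the script now prints the token stream instead of echoing stdin. An illustrative interpreter session, assuming the single-entry TOKENS table from lexer.py below and running from the package directory:

>>> from parser.lexer import lex_jorts
>>> lex_jorts('fn')   # what main() now prints for the input 'fn'
[('fn', <TokenType.RESERVED: 1>)]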


jortsc/parser/lexer.py Normal file

@@ -0,0 +1,61 @@
import re
import enum


class TokenType(enum.Enum):
    """Defines the type of a token"""
    RESERVED = enum.auto()
    IDENTIFIER = enum.auto()


class LexerError(Exception):
    """Lexer error."""
    pass


TOKENS = [
    (r'fn', TokenType.RESERVED),
]


def lex(string: str, token_defs: list) -> list:
    """Generate tokens out of the given string."""
    pos = 0
    strlen = len(string)
    tokens = []

    # compile every pattern once up front instead of
    # recompiling it for each token we try to match
    compiled = {pattern: re.compile(pattern)
                for pattern, _ in token_defs}

    while pos < strlen:
        valid = False

        for definition in token_defs:
            pattern, tok_type = definition
            regex = compiled[pattern]

            match = regex.match(string, pos)
            if not match:
                continue

            text = match.group(0)
            pos = match.end(0)
            valid = True
            tokens.append((text, tok_type))

            # a definition matched; restart the scan
            # from the new position
            break

        if not valid:
            raise LexerError(f'Invalid character: {string[pos]}')

    return tokens


def lex_jorts(string: str) -> list:
    """Lex with the jorts token definitions"""
    return lex(string, TOKENS)
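TokenType.IDENTIFIER is declared but nothing emits it yet. A hedged sketch of how the table might grow (these patterns are assumptions, not part of the commit): a bare r'fn' also matches the start of an identifier like fnord, so the keyword rule would want a word boundary once identifiers exist.

from parser.lexer import TokenType

# hypothetical future TOKENS table; not part of this commit
TOKENS = [
    (r'fn\b', TokenType.RESERVED),  # \b keeps 'fn' from eating 'fnord'
    (r'[A-Za-z_][A-Za-z0-9_]*', TokenType.IDENTIFIER),
]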