add basic lexer

Also a draft grammar that I probably won't use.
This commit is contained in:
Luna 2019-03-08 23:43:17 -03:00
parent 3c983b004c
commit 588b63fabe
6 changed files with 183 additions and 50 deletions

0
jortsc/__init__.py Normal file
View file

10
jortsc/grammar Normal file
View file

@ -0,0 +1,10 @@
(* Draft EBNF grammar for jorts (numbers only so far). *)
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ;
(* Fixed: the rule previously required a literal leading "0" before the digits. *)
integer = ['-' | '+'] digit {digit} ;
(* Fixed: terminating ";" was missing on this rule. *)
hex_letters = "a" | "b" | "c" | "d" | "e" | "f" ;
hex_integer = "0x", {hex_letters | digit} ;
oct_digits = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" ;
octal_integer = "0o", {oct_digits} ;
(* Incomplete in the draft; left as-is. *)
program =

View file

@ -1,6 +1,7 @@
#!/usr/bin/python3
import sys
from parser.lexer import lex_jorts
def main():
"""main entry point"""
@ -9,8 +10,7 @@ def main():
except EOFError:
pass
# TODO: lol
print(in_data)
print(lex_jorts(in_data))
if __name__ == '__main__':
main()

View file

61
jortsc/parser/lexer.py Normal file
View file

@ -0,0 +1,61 @@
import re
import enum
class TokenType(enum.Enum):
    """Kinds of token the lexer can emit."""

    # auto() assigns 1, 2, ... in declaration order; keep the order stable.
    RESERVED = enum.auto()
    IDENTIFIER = enum.auto()
class LexerError(Exception):
    """Lexer error: raised when no token pattern matches the input."""
# Token definitions for jorts: (regex pattern, token type) pairs,
# tried in order at each position by lex().
# NOTE(review): r'fn' has no word boundary, so it also matches the
# leading 'fn' of a longer word such as 'fnord' -- confirm intended.
TOKENS = [
    (r'fn', TokenType.RESERVED),
]
def lex(string: str, token_defs: list) -> list:
    """Generate tokens out of the given string.

    Args:
        string: the source text to tokenize.
        token_defs: list of (regex pattern, token type) pairs, tried in
            order at each position; the first pattern that matches wins.

    Returns:
        A list of (matched_text, token_type) tuples.

    Raises:
        LexerError: when no pattern matches at the current position.
    """
    # Compile every pattern once, up front, instead of looking a compiled
    # regex up per token definition per scan step.
    compiled = [(re.compile(pattern), tok_type)
                for pattern, tok_type in token_defs]

    tokens = []
    pos = 0
    end = len(string)

    while pos < end:
        for regex, tok_type in compiled:
            match = regex.match(string, pos)
            # A zero-width match (e.g. pattern r'x*') would leave `pos`
            # unchanged and spin forever, so treat it as no match.
            if not match or match.end(0) == pos:
                continue
            tokens.append((match.group(0), tok_type))
            pos = match.end(0)
            break
        else:
            # No pattern consumed anything at this position.
            raise LexerError(f'Invalid character: {string[pos]}')

    return tokens
def lex_jorts(string: str) -> list:
    """Tokenize `string` using the jorts token definitions (TOKENS)."""
    jorts_tokens = lex(string, TOKENS)
    return jorts_tokens