add basics of a handwritten parser

hell yeah i'm going down that path

lark made confusing stuff, i'll probably get more confused with a
handwritten one, but oh well, such is life
This commit is contained in:
Luna 2019-03-10 01:55:12 -03:00
parent 9fda0b31c3
commit 7ce0565de7
4 changed files with 105 additions and 6 deletions

View file

@ -5,7 +5,7 @@ import pprint
import logging
from jortsc.parser.lexer import lex_jorts
from jortsc.parser.parser import parse
from jortsc.parser.syntatic import syntatic
logging.basicConfig(level=logging.DEBUG)
@ -18,10 +18,10 @@ def main():
tokens = lex_jorts(in_data)
pprint.pprint(tokens)
print([t[0] for t in tokens])
tree = parse(in_data)
tree = syntatic(tokens)
print(tree)
if __name__ == '__main__':
main()

View file

@ -1,4 +1,6 @@
import re
from dataclasses import dataclass
from enum import Enum, auto
@ -13,6 +15,12 @@ class TokenType(Enum):
number = auto()
@dataclass
class Token:
    """A lexed token: its text and its token type."""
    value: str  # the token's text
    type_: TokenType  # the token's category, a TokenType member
class LexerError(Exception):
    """Lexer error."""
@ -84,7 +92,7 @@ def lex(string: str, token_defs: list) -> list:
pos = match.end(0)
valid = True
tokens.append((text, tok_type))
tokens.append(Token(text, tok_type))
# go to next token instead of checking other
# definitions for tokens, e.g if its a reserved token

View file

@ -7,7 +7,7 @@ IMPORT: "import"
COMMA: ","
DOT: "."
SINGLE_COMMENT: "//"
NEWLINE: /[ \\n\\t]+/
NEWLINE: /(\\r?\\n)+\\s*/
ANY: /.+/
WHITESPACE: " "
INTEGER: /[0-9]+/
@ -39,6 +39,6 @@ start: (NEWLINE | stmt)*
"""
def parse(string: str):
    """Parse ``string`` using Lark.

    Builds a Lark parser from the module-level ``GRAMMAR`` with
    ``parser='lalr'`` and ``debug=True``, then returns the result of
    ``parser.parse(string)``.
    """
    parser = Lark(GRAMMAR, parser='lalr', debug=True)
    return parser.parse(string)

91
jortsc/parser/syntatic.py Normal file
View file

@ -0,0 +1,91 @@
from typing import Optional, Any, List
from jortsc.parser.lexer import Token, TokenType
class ParseError(Exception):
    """Represents a parse error."""
class Reader:
    """Sequential cursor over a list of tokens."""

    def __init__(self, tokens: List[Token]):
        self.cur = 0  # index of the next token to hand out
        self.tokens = tokens

    def next(self) -> Optional[Token]:
        """Return the token under the cursor and advance past it.

        Returns ``None``, leaving the cursor where it is, when indexing
        the token list at the cursor raises ``IndexError``.
        """
        try:
            current = self.tokens[self.cur]
        except IndexError:
            return None
        else:
            self.cur += 1
            return current
def _fn_read_args(reader: Reader, cur: List = None) -> List:
    """Read the arguments of a function up to the closing ``)``.

    Each round consumes one token. If its value is ``)`` the collected
    arguments are returned. Otherwise that token is discarded and the
    next two tokens are stored as an ``(argtype, argname)`` pair.

    Args:
        reader: Token reader to consume from.
        cur: Optional list the pairs are appended to; a new list is
            created when omitted.

    Returns:
        The list of ``(argtype, argname)`` token pairs.

    Raises:
        ParseError: If the tokens run out before ``)`` is found.
    """
    args = [] if cur is None else cur

    # Loop instead of recursing so the number of arguments is not bounded
    # by the interpreter's recursion limit.
    while True:
        token = reader.next()
        if token is None:
            # reader.next() returns None once the tokens are exhausted.
            raise ParseError(
                'Unexpected end of input while reading function arguments')
        if token.value == ')':
            return args

        argtype, argname = reader.next(), reader.next()
        if argtype is None or argname is None:
            raise ParseError(
                'Unexpected end of input while reading a function argument')
        args.append((argtype, argname))
def read_function(reader: Reader):
    """Read a function block.

    Expects a whitespace token, followed by either an identifier (used as
    the function name) or ``(`` (the function is then named
    ``'_anonymous'``). The arguments are read with ``_fn_read_args`` and
    the body with ``read_start``.

    Args:
        reader: Token reader to consume from.

    Returns:
        A ``(fn_name, fn_args, block)`` tuple.

    Raises:
        ParseError: If the first token is not whitespace, if the tokens
            run out, or if the token after the whitespace is neither an
            identifier nor ``(``.
    """
    token = reader.next()
    if token is None or token.type_ != TokenType.whitespace:
        raise ParseError('Expected whitespace')

    token = reader.next()
    if token is None:
        raise ParseError('Unexpected end of input while reading a function')

    if token.type_ == TokenType.identifier:
        fn_name = token.value
    elif token.value == '(':
        # NOTE(review): the '(' is already consumed here, yet
        # _fn_read_args still reads one token before the first
        # (type, name) pair -- confirm this is intended.
        fn_name = '_anonymous'
    else:
        # Without this branch `block` would be unbound at the return.
        raise ParseError(
            f'Expected function name or "(", got {token.value!r}')

    fn_args = _fn_read_args(reader)
    block = read_start(reader)
    return (fn_name, fn_args, block)
def read_reserved(token: Token, reader: Reader):
    """Read reserved statements.

    Only ``fn`` is handled: it is delegated to ``read_function``.

    Args:
        token: The reserved token that was just read.
        reader: Token reader positioned after ``token``.

    Returns:
        Whatever ``read_function`` returns for an ``fn`` statement.

    Raises:
        ParseError: If ``token`` is any reserved word other than ``fn``.
    """
    if token.value == 'fn':
        return read_function(reader)

    # Raise rather than fall through and return None, which callers
    # cannot use as a statement.
    raise ParseError(f'Unsupported reserved token {token.value!r}')
def read_start(reader: Reader):
    """Read the start of a program.

    Consumes one token. If it is a reserved token, the statement is read
    with ``read_reserved`` and its items are added to the result list.

    Args:
        reader: Token reader to consume from.

    Returns:
        A list with the items produced for the statement. The list is
        empty when there are no tokens left or the token is not reserved.

    Raises:
        ParseError: If ``read_reserved`` yields no result for the token.
    """
    ast = []
    token = reader.next()
    print('cur', token)

    if token is None:
        # No tokens left: return an empty AST instead of crashing on
        # `token.type_`.
        return ast

    if token.type_ == TokenType.reserved:
        res = read_reserved(token, reader)
        if res is None:
            # list.extend(None) would raise a TypeError.
            raise ParseError(f'Unsupported reserved token {token.value!r}')
        ast.extend(res)

    return ast
def syntatic(tokens: List[Token]):
    """Build an AST from the tokens by running ``read_start`` on a new ``Reader``."""
    reader = Reader(tokens)
    return read_start(reader)