finish basic token list

Luna 2019-03-09 00:02:13 -03:00
parent ece290e064
commit a11d21873a
2 changed files with 54 additions and 8 deletions

View File

@@ -1,6 +1,8 @@
 #!/usr/bin/python3
 import sys
+import pprint
 from parser.lexer import lex_jorts

 def main():
@@ -10,7 +12,9 @@ def main():
     except EOFError:
         pass
-    print(lex_jorts(in_data))
+    tokens = lex_jorts(in_data)
+    pprint.pprint(tokens)
+    print([t[0] for t in tokens])

 if __name__ == '__main__':
     main()
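For a quick sanity check of the new output, here is a minimal sketch of what this main() now prints, assuming lex_jorts simply runs the lex() loop over the input with the TOKENS list from the lexer module below; the sample input and the shown result are illustrative, not taken from the commit:

import pprint
from parser.lexer import lex_jorts

# hypothetical sample input; any jorts snippet works the same way
tokens = lex_jorts('fn main() { 1 + 2 }')

# full (text, TokenType) pairs, one per matched token
pprint.pprint(tokens)

# just the matched texts, as printed by the new main()
print([t[0] for t in tokens])
# -> ['fn', ' ', 'main', '(', ')', ' ', '{', ' ', '1', ' ', '+', ' ', '2', ' ', '}']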

View File

@@ -1,11 +1,16 @@
 import re
-import enum
+from enum import Enum, auto


-class TokenType(enum.Enum):
+class TokenType(Enum):
     """Defines the type of a token"""
-    RESERVED = enum.auto()
-    IDENTIFIER = enum.auto()
+    reserved = auto()
+    identifier = auto()
+    comment = auto()
+    comment_start = auto()
+    comment_end = auto()
+    whitespace = auto()
+    number = auto()


 class LexerError(Exception):
@@ -14,7 +19,36 @@ class LexerError(Exception):
 TOKENS = [
-    (r'fn', TokenType.RESERVED),
+    (r'[ \n\t]+', TokenType.whitespace),
+
+    # single line comments and multiline comments
+    (r'//[^\n]*', TokenType.comment),
+    # TODO: shouldn't this be /* <anything> */ instead of
+    # only tokenizing on the start and end?
+    (r'/\*', TokenType.comment_start),
+    (r'\*/', TokenType.comment_end),
+
+    (r'fn', TokenType.reserved),
+    (r'if', TokenType.reserved),
+    (r'import', TokenType.reserved),
+    (r'\(', TokenType.reserved),
+    (r'\)', TokenType.reserved),
+    (r'\{', TokenType.reserved),
+    (r'\}', TokenType.reserved),
+    (r'\-\>', TokenType.reserved),
+    (r'\.', TokenType.reserved),
+    (r'\"[^\n]*\"', TokenType.reserved),
+
+    # basic math ops
+    (r'[\+\-\/\*]', TokenType.reserved),
+
+    (r'[0-9]+', TokenType.number),
+    (r'[A-Za-z][A-Za-z0-9_]*', TokenType.identifier)
 ]
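The order of this list matters: the matching loop below tries the definitions top to bottom and stops at the first one that matches, so the reserved-word patterns have to sit above the catch-all identifier pattern (and the `//` comment above the `/` math operator). A standalone illustration with plain re, assuming that first-match-wins behaviour:

import re

# the identifier pattern happily matches a reserved word too...
print(re.match(r'[A-Za-z][A-Za-z0-9_]*', 'fn main').group(0))  # 'fn'

# ...so 'fn' only comes out as TokenType.reserved because its
# pattern is tried before the identifier one
print(re.match(r'fn', 'fn main').group(0))                     # 'fn'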
@@ -29,6 +63,9 @@ def lex(string: str, token_defs: list) -> list:
     compiled = {pattern: re.compile(pattern)
                 for pattern, _ in token_defs}

+    # we use a while loop instead of `for pos in range(len(string))` because
+    # we need to jump pos forward by a whole token's length, and that isn't
+    # easy to do with a for .. in range(..)
     while pos < strlen:
         valid = False
@ -42,16 +79,21 @@ def lex(string: str, token_defs: list) -> list:
continue continue
text = match.group(0) text = match.group(0)
# update pos to the end of the token
pos = match.end(0) pos = match.end(0)
valid = True valid = True
tokens.append((text, tok_type)) tokens.append((text, tok_type))
# go to next token instead # go to next token instead of checking other
# definitions for tokens, e.g if its a reserved token
# we shouldn't go down the path of an identifier.
break break
if not valid: if not valid:
raise LexerError(f'Invalid character: {string[pos]}') print(f'context: {pos} {len(string)} {string[pos-1:pos+20]!r}')
raise LexerError(f'Invalid character: {string[pos]!r}')
return tokens return tokens
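Putting the hunks together, here is a self-contained sketch of the lexer as it plausibly looks after this commit. The scaffolding the diff doesn't show (the pos/strlen/tokens initialisation and the inner for/match lines) is assumed, the token list is abridged, and the sample call at the end is purely illustrative:

import re
from enum import Enum, auto

class TokenType(Enum):
    reserved = auto()
    identifier = auto()
    whitespace = auto()
    number = auto()

class LexerError(Exception):
    pass

# abridged token list; the full one is in the diff above
TOKENS = [
    (r'[ \n\t]+', TokenType.whitespace),
    (r'fn', TokenType.reserved),
    (r'\(', TokenType.reserved),
    (r'\)', TokenType.reserved),
    (r'\{', TokenType.reserved),
    (r'\}', TokenType.reserved),
    (r'\-\>', TokenType.reserved),
    (r'[\+\-\/\*]', TokenType.reserved),
    (r'[0-9]+', TokenType.number),
    (r'[A-Za-z][A-Za-z0-9_]*', TokenType.identifier),
]

def lex(string: str, token_defs: list) -> list:
    pos = 0                      # assumed initialisation, not shown in the diff
    strlen = len(string)
    tokens = []
    compiled = {pattern: re.compile(pattern)
                for pattern, _ in token_defs}

    # advance pos by whole tokens, hence a while loop rather than range()
    while pos < strlen:
        valid = False
        for pattern, tok_type in token_defs:
            match = compiled[pattern].match(string, pos)
            if not match:
                continue
            text = match.group(0)
            pos = match.end(0)   # jump past the whole token
            valid = True
            tokens.append((text, tok_type))
            break                # first matching definition wins
        if not valid:
            raise LexerError(f'Invalid character: {string[pos]!r}')
    return tokens

print([t[0] for t in lex('fn f() -> 1 + 2', TOKENS)])
# -> ['fn', ' ', 'f', '(', ')', ' ', '->', ' ', '1', ' ', '+', ' ', '2']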