finish basic token list
This commit is contained in:
parent
ece290e064
commit
a11d21873a
2 changed files with 54 additions and 8 deletions
|
@ -1,6 +1,8 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
import sys
|
||||
import pprint
|
||||
|
||||
from parser.lexer import lex_jorts
|
||||
|
||||
def main():
|
||||
|
@ -10,7 +12,9 @@ def main():
|
|||
except EOFError:
|
||||
pass
|
||||
|
||||
print(lex_jorts(in_data))
|
||||
tokens = lex_jorts(in_data)
|
||||
pprint.pprint(tokens)
|
||||
print([t[0] for t in tokens])
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
|
@ -1,11 +1,16 @@
|
|||
import re
|
||||
import enum
|
||||
from enum import Enum, auto
|
||||
|
||||
|
||||
class TokenType(enum.Enum):
|
||||
class TokenType(Enum):
|
||||
"""Defines the type of a token"""
|
||||
RESERVED = enum.auto()
|
||||
IDENTIFIER = enum.auto()
|
||||
reserved = auto()
|
||||
identifier = auto()
|
||||
comment = auto()
|
||||
comment_start = auto()
|
||||
comment_end = auto()
|
||||
whitespace = auto()
|
||||
number = auto()
|
||||
|
||||
|
||||
class LexerError(Exception):
|
||||
|
@ -14,7 +19,36 @@ class LexerError(Exception):
|
|||
|
||||
|
||||
TOKENS = [
|
||||
(r'fn', TokenType.RESERVED),
|
||||
(r'[ \n\t]+', TokenType.whitespace),
|
||||
|
||||
# single line comments and multiline comments
|
||||
(r'//[^\n]*', TokenType.comment),
|
||||
|
||||
# TODO: shouldn't this be /* <anything> */ instead of
|
||||
# only tokenizing on the start and end?
|
||||
(r'/\*', TokenType.comment_start),
|
||||
(r'\*/', TokenType.comment_end),
|
||||
|
||||
(r'fn', TokenType.reserved),
|
||||
(r'if', TokenType.reserved),
|
||||
(r'import', TokenType.reserved),
|
||||
|
||||
(r'\(', TokenType.reserved),
|
||||
(r'\)', TokenType.reserved),
|
||||
|
||||
(r'\{', TokenType.reserved),
|
||||
(r'\}', TokenType.reserved),
|
||||
|
||||
(r'\-\>', TokenType.reserved),
|
||||
(r'\.', TokenType.reserved),
|
||||
|
||||
(r'\"[^\n]*\"', TokenType.reserved),
|
||||
|
||||
# basic math ops
|
||||
(r'[\+\-\/\*]', TokenType.reserved),
|
||||
|
||||
(r'[0-9]+', TokenType.number),
|
||||
(r'[A-Za-z][A-Za-z0-9_]*', TokenType.identifier)
|
||||
]
|
||||
|
||||
|
||||
|
@ -29,6 +63,9 @@ def lex(string: str, token_defs: list) -> list:
|
|||
compiled = {pattern: re.compile(pattern)
|
||||
for pattern, _ in token_defs}
|
||||
|
||||
# we use this instead of for pos in range(len(string)) because we
|
||||
# need to advance pos by a whole token's length, and that wouldn't
|
||||
# be easy on a for .. in range(..)
|
||||
while pos < strlen:
|
||||
valid = False
|
||||
|
||||
|
@ -42,16 +79,21 @@ def lex(string: str, token_defs: list) -> list:
|
|||
continue
|
||||
|
||||
text = match.group(0)
|
||||
|
||||
# update pos to the end of the token
|
||||
pos = match.end(0)
|
||||
|
||||
valid = True
|
||||
tokens.append((text, tok_type))
|
||||
|
||||
# go to next token instead
|
||||
# go to next token instead of checking other
|
||||
# definitions for tokens, e.g. if it's a reserved token
|
||||
# we shouldn't go down the path of an identifier.
|
||||
break
|
||||
|
||||
if not valid:
|
||||
raise LexerError(f'Invalid character: {string[pos]}')
|
||||
print(f'context: {pos} {len(string)} {string[pos-1:pos+20]!r}')
|
||||
raise LexerError(f'Invalid character: {string[pos]!r}')
|
||||
|
||||
return tokens
|
||||
|
||||
|
|
Loading…
Reference in a new issue