From 93e7d9db7ba4d611a0fac410fa586867a0aba7ee Mon Sep 17 00:00:00 2001
From: Luna
Date: Wed, 18 Sep 2019 14:30:23 -0300
Subject: [PATCH] copy-paste vig's scanner into rayoko

---
 src/main.zig     |  32 ++++-
 src/scanners.zig | 361 +++++++++++++++++++++++++++++++++++++++++++++++
 src/tokens.zig   |  89 ++++++++++++
 3 files changed, 481 insertions(+), 1 deletion(-)
 create mode 100644 src/scanners.zig
 create mode 100644 src/tokens.zig

diff --git a/src/main.zig b/src/main.zig
index 05b5fbb..655520e 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -1,5 +1,7 @@
 const std = @import("std");
 
+const scanners = @import("scanners.zig");
+
 pub const Result = enum {
     Ok,
     TokenizeError,
@@ -8,6 +10,34 @@
 };
 
 pub fn run(allocator: *std.mem.Allocator, slice: []const u8) Result {
+    var scan = scanners.Scanner.init(allocator, slice);
+    //defer scan.deinit();
+
+    // do a full scan pass, then reset, then do it again (with parser)
+    while (true) {
+        var tok_opt = scan.nextToken() catch |err| {
+            std.debug.warn(
+                "error at '{}': {}\n",
+                scan.currentLexeme(),
+                err,
+            );
+
+            return Result.TokenizeError;
+        };
+
+        if (tok_opt) |tok| {
+            if (tok.typ == .EOF) break;
+
+            // TODO remove
+            std.debug.warn("{x}\n", tok);
+        }
+    }
+
+    // scan.reset();
+
+    //var parser = parsers.Parser.init(allocator, scan);
+    //defer parser.deinit();
+
     return Result.Ok;
 }
 
@@ -28,8 +58,8 @@ pub fn main() anyerror!void {
 
     _ = try file.read(slice);
 
-    //switch (try run(allocator, slice)) {
     const result = run(allocator, slice);
+    //const result = try run(allocator, slice);
 
     switch (result) {
         .Ok => std.os.exit(0),
diff --git a/src/scanners.zig b/src/scanners.zig
new file mode 100644
index 0000000..47537da
--- /dev/null
+++ b/src/scanners.zig
@@ -0,0 +1,361 @@
+const std = @import("std");
+const tokens = @import("tokens.zig");
+
+const Allocator = std.mem.Allocator;
+const Token = tokens.Token;
+const TokenType = tokens.TokenType;
+
+pub const ScannerError = error{
+    Unexpected,
+    Unterminated,
+};
+
+fn isDigit(char: u8) bool {
+    return char >= '0' and char <= '9';
+}
+
+fn isAlpha(c: u8) bool {
+    return (c >= 'a' and c <= 'z') or
+        (c >= 'A' and c <= 'Z') or
+        c == '_';
+}
+
+fn isAlphaNumeric(char: u8) bool {
+    return isAlpha(char) or isDigit(char);
+}
+
+const keywords = [_][]const u8{
+    "break",
+    "const",
+    "continue",
+    "defer",
+    "else",
+    "enum",
+    "fn",
+    "for",
+    "go",
+    "goto",
+    "if",
+    "import",
+    "in",
+    "interface",
+    "match",
+    "module",
+    "mut",
+    "or",
+    "return",
+    "struct",
+    "type",
+    "true",
+    "false",
+    "None",
+    "println",
+    "loop",
+    "pub",
+};
+
+const keyword_ttypes = [_]TokenType{
+    .Break,
+    .Const,
+    .Continue,
+    .Defer,
+    .Else,
+    .Enum,
+    .Fn,
+    .For,
+    .Go,
+    .Goto,
+    .If,
+    .Import,
+    .In,
+    .Interface,
+    .Match,
+    .Module,
+    .Mut,
+    .Or,
+    .Return,
+    .Struct,
+    .Type,
+    .True,
+    .False,
+    .None,
+    .Println,
+    .Loop,
+    .Pub,
+};
+
+fn getKeyword(keyword: []const u8) ?TokenType {
+    for (keywords) |kw, idx| {
+        if (std.mem.eql(u8, keyword, kw)) {
+            return keyword_ttypes[idx];
+        }
+    }
+
+    return null;
+}
+
+/// Scanner for vlang tokens.
+pub const Scanner = struct {
+    allocator: *Allocator,
+    source: []const u8,
+
+    start: usize = 0,
+    current: usize = 0,
+    line: usize = 1,
+
+    pub fn init(allocator: *Allocator, source: []const u8) Scanner {
+        return Scanner{ .allocator = allocator, .source = source };
+    }
+
+    fn isAtEnd(self: *Scanner) bool {
+        return self.current >= self.source.len;
+    }
+
+    fn advance(self: *Scanner) u8 {
+        self.current += 1;
+        return self.source[self.current - 1];
+    }
+
+    fn rollback(self: *Scanner) void {
+        self.current -= 1;
+    }
+
+    pub fn currentLexeme(self: *Scanner) []const u8 {
+        return self.source[self.start..self.current];
+    }
+
+    fn makeToken(self: *Scanner, ttype: TokenType) Token {
+        return Token{
+            .typ = ttype,
+            .lexeme = self.currentLexeme(),
+            .line = self.line,
+        };
+    }
+
+    fn makeTokenLexeme(
+        self: *Scanner,
+        ttype: TokenType,
+        lexeme: []const u8,
+    ) Token {
+        return Token{
+            .typ = ttype,
+            .lexeme = lexeme,
+            .line = self.line,
+        };
+    }
+
+    /// Check if the next character matches `expected`, consuming it
+    /// when it does.
+    fn match(self: *Scanner, expected: u8) bool {
+        if (self.isAtEnd()) return false;
+        if (self.source[self.current] != expected) return false;
+
+        self.current += 1;
+        return true;
+    }
+
+    /// Make a token of type_match if the next character is
+    /// `expected`. Makes a token of type_nomatch when it is not.
+    fn makeMatchToken(
+        self: *Scanner,
+        expected: u8,
+        type_match: TokenType,
+        type_nomatch: TokenType,
+    ) Token {
+        if (self.match(expected)) {
+            return self.makeToken(type_match);
+        } else {
+            return self.makeToken(type_nomatch);
+        }
+    }
+
+    /// "triple" version of makeMatchToken.
+    /// Required for vlang's tokens.
+    fn makeTripleMatchToken(
+        self: *Scanner,
+        char1: u8,
+        ttype1: TokenType,
+        char2: u8,
+        ttype2: TokenType,
+        fallback: TokenType,
+    ) Token {
+        if (self.match(char1)) {
+            return self.makeToken(ttype1);
+        } else if (self.match(char2)) {
+            return self.makeToken(ttype2);
+        } else {
+            return self.makeToken(fallback);
+        }
+    }
+
+    /// Peek at the character that was just consumed.
+    fn peek(self: *Scanner) u8 {
+        if (self.isAtEnd()) return 0;
+        if (self.current == 0) return 0;
+        return self.source[self.current - 1];
+    }
+
+    /// Peek at the next character to be consumed.
+    fn peekNext(self: *Scanner) u8 {
+        if (self.current + 1 > self.source.len) return 0;
+        return self.source[self.current];
+    }
+
+    /// Consume a number.
+    /// Returns either an Integer or a Float token. Proper typing
+    /// of the number (i32 i64 u32 u64 f32 f64) is left to the parser.
+    fn doNumber(self: *Scanner) Token {
+        var ttype = TokenType.Integer;
+
+        while (isDigit(self.peekNext())) {
+            _ = self.advance();
+        }
+
+        // check if it's a number like 12.34: the next character must be
+        // a '.' and the one after it a digit.
+        if (self.peekNext() == '.' and
+            self.current + 1 < self.source.len and
+            isDigit(self.source[self.current + 1]))
+        {
+            ttype = TokenType.Float;
+
+            // consume the '.' and the fractional digits
+            _ = self.advance();
+            while (isDigit(self.peekNext())) {
+                _ = self.advance();
+            }
+        }
+
+        return self.makeToken(ttype);
+    }
+
+    /// Consume a string. stop_char is used to determine
+    /// if the string is a single quote or double quote string.
+    fn doString(self: *Scanner, stop_char: u8) !Token {
+        // consume the entire string
+        while (self.peekNext() != stop_char and !self.isAtEnd()) {
+            if (self.peek() == '\n') self.line += 1;
+            _ = self.advance();
+        }
+
+        // unterminated string.
+        if (self.isAtEnd()) {
+            return ScannerError.Unterminated;
+        }
+
+        // the closing character of the string
+        _ = self.advance();
+
+        // remove the starting and ending chars of the string
+        const lexeme = self.currentLexeme();
+        return self.makeTokenLexeme(
+            .String,
+            lexeme[1 .. lexeme.len - 1],
+        );
+    }
+
+    /// Either a keyword or an identifier comes out of this.
+    fn doIdentifier(self: *Scanner) Token {
+        while (isAlphaNumeric(self.peek())) {
+            _ = self.advance();
+        }
+
+        // ugly hack.
+        self.rollback();
+
+        // after reading the identifier, we check if it is any of our
+        // keywords. if it is, we use the specified keyword type.
+        // if not, just .Identifier
+        var toktype: TokenType = undefined;
+        var ttype_opt = getKeyword(self.currentLexeme());
+
+        if (ttype_opt) |ttype| {
+            toktype = ttype;
+        } else {
+            toktype = TokenType.Identifier;
+        }
+
+        return self.makeToken(toktype);
+    }
+
+    pub fn nextToken(self: *Scanner) !?Token {
+        self.start = self.current;
+
+        if (self.isAtEnd()) return self.makeToken(TokenType.EOF);
+
+        var c = self.advance();
+        if (isDigit(c)) return self.doNumber();
+        if (isAlpha(c)) return self.doIdentifier();
+
+        var token: ?Token = switch (c) {
+            '(' => self.makeToken(.LeftParen),
+            ')' => self.makeToken(.RightParen),
+            '{' => self.makeToken(.LeftBrace),
+            '}' => self.makeToken(.RightBrace),
+            '[' => self.makeToken(.LeftSquare),
+            ']' => self.makeToken(.RightSquare),
+            '.' => self.makeToken(.Dot),
+            ';' => self.makeToken(.Semicolon),
+            ',' => self.makeToken(.Comma),
+            '?' => self.makeToken(.QuestionMark),
+            '$' => self.makeToken(.DollarSign),
+
+            '%' => self.makeToken(.Modulo),
+
+            ':' => self.makeMatchToken('=', .ColonEqual, .Colon),
+            '*' => self.makeMatchToken('=', .StarEqual, .Star),
+            '-' => self.makeMatchToken('=', .MinusEqual, .Minus),
+
+            // we reuse the existing .And and .Or tokens (which
+            // represent the `and` and `or` keywords) to also
+            // cover && and ||
+            '&' => self.makeMatchToken('&', .And, .Address),
+            '|' => self.makeMatchToken('|', .Or, .Pipe),
+
+            '!' => self.makeMatchToken('=', .BangEqual, .Bang),
+            '=' => self.makeMatchToken('=', .EqualEqual, .Equal),
+            '>' => self.makeMatchToken('=', .GreaterEqual, .Greater),
+            '+' => self.makeTripleMatchToken('+', .PlusPlus, '=', .PlusEqual, .Plus),
+            '<' => self.makeTripleMatchToken('=', .LessEqual, '<', .LeftDoubleChevron, .Less),
+
+            '/' => blk: {
+                var next = self.peekNext();
+
+                switch (next) {
+                    '=' => {
+                        self.current += 1;
+                        return self.makeToken(.SlashEqual);
+                    },
+
+                    // line comment: skip everything up to the newline,
+                    // leaving the '\n' itself for the next scan so the
+                    // line counter stays correct.
+                    '/' => {
+                        while (self.peekNext() != '\n' and !self.isAtEnd()) {
+                            _ = self.advance();
+                        }
+
+                        return null;
+                    },
+
+                    // block comment: skip until the closing "*/".
+                    '*' => {
+                        while ((self.peek() != '*' or self.peekNext() != '/') and !self.isAtEnd()) {
+                            _ = self.advance();
+                        }
+
+                        // unterminated block comment
+                        if (self.isAtEnd()) {
+                            return ScannerError.Unterminated;
+                        }
+
+                        // consume the ending slash
+                        _ = self.advance();
+                        return null;
+                    },
+
+                    else => break :blk self.makeToken(.Slash),
+                }
+            },
+
+            '\'' => try self.doString('\''),
+            '"' => try self.doString('"'),
+
+            ' ', '\r', '\t' => null,
+            '\n' => blk: {
+                self.line += 1;
+                break :blk null;
+            },
+
+            else => return ScannerError.Unexpected,
+        };
+
+        return token;
+    }
+};
diff --git a/src/tokens.zig b/src/tokens.zig
new file mode 100644
index 0000000..a3811cc
--- /dev/null
+++ b/src/tokens.zig
@@ -0,0 +1,89 @@
+pub const TokenType = enum {
+    // basic tokens
+    LeftParen,
+    RightParen,
+    LeftBrace,
+    RightBrace,
+    LeftSquare,
+    RightSquare,
+    Dot,
+    Equal,
+    Semicolon,
+    Comma,
+    Colon,
+    Address,
+    Pipe,
+    QuestionMark,
+    DollarSign,
+
+    // math operators
+    Plus,
+    Minus,
+    Star,
+    Slash,
+    Modulo,
+
+    // one- and two-char tokens
+    DotEqual,
+    LeftDoubleChevron, // AKA "<<"
+    PlusPlus,
+    PlusEqual,
+    MinusEqual,
+    ColonEqual,
+    StarEqual,
+    SlashEqual,
+
+    // comparison ones
+    EqualEqual,
+    Less,
+    LessEqual,
+    Greater,
+    GreaterEqual,
+    Bang,
+    BangEqual,
+
+    // complex types
+    Integer,
+    Float,
+    String,
+    Identifier,
+
+    // keywords
+    Break,
+    Const,
+    Continue,
+    Defer,
+    Else,
+    Enum,
+    Fn,
+    For,
+    Loop,
+    Go,
+    Goto,
+    If,
+    Import,
+    In,
+    Interface,
+    Match,
+    Module,
+    Mut,
+    Or,
+    And,
+    Return,
+    Struct,
+    Type,
+    True,
+    False,
+    None,
+
+    Println,
+    Pub,
+
+    EOF,
+};
+
+pub const Token = struct {
+    typ: TokenType,
+    lexeme: []const u8,
+    line: usize,
+};
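
A minimal usage sketch of the new Scanner, mirroring the loop the patch adds to
run() in src/main.zig. The dumpTokens helper is hypothetical (it is not part of
the patch), and the std.debug.warn format arguments assume the 2019-era varargs
std.fmt used elsewhere in this diff.

const std = @import("std");
const scanners = @import("scanners.zig");

/// Hypothetical helper: scan a source buffer and print every token.
/// nextToken() returns null for skipped whitespace and comments,
/// a Token otherwise, and an error for invalid input.
fn dumpTokens(allocator: *std.mem.Allocator, source: []const u8) !void {
    var scan = scanners.Scanner.init(allocator, source);

    while (true) {
        var tok_opt = try scan.nextToken();

        if (tok_opt) |tok| {
            // a token with type .EOF marks the end of the source
            if (tok.typ == .EOF) break;
            std.debug.warn("{} '{}' (line {})\n", tok.typ, tok.lexeme, tok.line);
        }
    }
}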