From 93e7d9db7ba4d611a0fac410fa586867a0aba7ee Mon Sep 17 00:00:00 2001
From: Luna
Date: Wed, 18 Sep 2019 14:30:23 -0300
Subject: [PATCH] copy-paste vig's scanner into rayoko

---
 src/main.zig     |  32 ++++-
 src/scanners.zig | 361 +++++++++++++++++++++++++++++++++++++++++++++++
 src/tokens.zig   |  89 ++++++++++++
 3 files changed, 481 insertions(+), 1 deletion(-)
 create mode 100644 src/scanners.zig
 create mode 100644 src/tokens.zig

diff --git a/src/main.zig b/src/main.zig
index 05b5fbb..655520e 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -1,5 +1,7 @@
 const std = @import("std");
 
+const scanners = @import("scanners.zig");
+
 pub const Result = enum {
     Ok,
     TokenizeError,
@@ -8,6 +10,34 @@
 };
 
 pub fn run(allocator: *std.mem.Allocator, slice: []const u8) Result {
+    var scan = scanners.Scanner.init(allocator, slice);
+    //defer scan.deinit();
+
+    // do a full scan pass, then reset, then do it again (with parser)
+    while (true) {
+        var tok_opt = scan.nextToken() catch |err| {
+            std.debug.warn(
+                "error at '{}': {}\n",
+                scan.currentLexeme(),
+                err,
+            );
+
+            return Result.TokenizeError;
+        };
+
+        if (tok_opt) |tok| {
+            if (tok.typ == .EOF) break;
+
+            // TODO remove
+            std.debug.warn("{x}\n", tok);
+        }
+    }
+
+    // scan.reset();
+
+    //var parser = parsers.Parser.init(allocator, scan);
+    //defer parser.deinit();
+
     return Result.Ok;
 }
 
@@ -28,8 +58,8 @@ pub fn main() anyerror!void {
 
     _ = try file.read(slice);
 
-    //switch (try run(allocator, slice)) {
     const result = run(allocator, slice);
+    //const result = try run(allocator, slice);
 
     switch (result) {
         .Ok => std.os.exit(0),
diff --git a/src/scanners.zig b/src/scanners.zig
new file mode 100644
index 0000000..47537da
--- /dev/null
+++ b/src/scanners.zig
@@ -0,0 +1,361 @@
+const std = @import("std");
+const tokens = @import("tokens.zig");
+
+const Allocator = std.mem.Allocator;
+const Token = tokens.Token;
+const TokenType = tokens.TokenType;
+
+pub const ScannerError = error{
+    Unexpected,
+    Unterminated,
+};
+
+fn isDigit(char: u8) bool {
+    return char >= '0' and char <= '9';
+}
+
+fn isAlpha(c: u8) bool {
+    return (c >= 'a' and c <= 'z') or
+        (c >= 'A' and c <= 'Z') or
+        c == '_';
+}
+
+fn isAlphaNumeric(char: u8) bool {
+    return isAlpha(char) or isDigit(char);
+}
+
+const keywords = [_][]const u8{
+    "break",
+    "const",
+    "continue",
+    "defer",
+    "else",
+    "enum",
+    "fn",
+    "for",
+    "go",
+    "goto",
+    "if",
+    "import",
+    "in",
+    "interface",
+    "match",
+    "module",
+    "mut",
+    "or",
+    "return",
+    "struct",
+    "type",
+    "true",
+    "false",
+    "None",
+    "println",
+    "loop",
+    "pub",
+};
+
+const keyword_ttypes = [_]TokenType{
+    .Break,
+    .Const,
+    .Continue,
+    .Defer,
+    .Else,
+    .Enum,
+    .Fn,
+    .For,
+    .Go,
+    .Goto,
+    .If,
+    .Import,
+    .In,
+    .Interface,
+    .Match,
+    .Module,
+    .Mut,
+    .Or,
+    .Return,
+    .Struct,
+    .Type,
+    .True,
+    .False,
+    .None,
+    .Println,
+    .Loop,
+    .Pub,
+};
+
+fn getKeyword(keyword: []const u8) ?TokenType {
+    for (keywords) |kw, idx| {
+        if (std.mem.eql(u8, keyword, kw)) {
+            return keyword_ttypes[idx];
+        }
+    }
+
+    return null;
+}
+
+/// Scanner for vlang tokens.
+pub const Scanner = struct {
+    allocator: *Allocator,
+    source: []const u8,
+
+    start: usize = 0,
+    current: usize = 0,
+    line: usize = 1,
+
+    pub fn init(allocator: *Allocator, source: []const u8) Scanner {
+        return Scanner{ .allocator = allocator, .source = source };
+    }
+
+    fn isAtEnd(self: *Scanner) bool {
+        return self.current >= self.source.len;
+    }
+
+    fn advance(self: *Scanner) u8 {
+        self.current += 1;
+        return self.source[self.current - 1];
+    }
+
+    fn rollback(self: *Scanner) void {
+        self.current -= 1;
+    }
+
+    pub fn currentLexeme(self: *Scanner) []const u8 {
+        return self.source[self.start..self.current];
+    }
+
+    fn makeToken(self: *Scanner, ttype: TokenType) Token {
+        return Token{
+            .typ = ttype,
+            .lexeme = self.currentLexeme(),
+            .line = self.line,
+        };
+    }
+
+    fn makeTokenLexeme(
+        self: *Scanner,
+        ttype: TokenType,
+        lexeme: []const u8,
+    ) Token {
+        return Token{
+            .typ = ttype,
+            .lexeme = lexeme,
+            .line = self.line,
+        };
+    }
+
+    /// Check if the next character matches `expected`, consuming it
+    /// when it does.
+    fn match(self: *Scanner, expected: u8) bool {
+        if (self.isAtEnd()) return false;
+        if (self.source[self.current] != expected) return false;
+
+        self.current += 1;
+        return true;
+    }
+
+    /// Make a token of type_match if the next character is
+    /// `expected`. Makes a token of type_nomatch when it is not.
+    fn makeMatchToken(
+        self: *Scanner,
+        expected: u8,
+        type_match: TokenType,
+        type_nomatch: TokenType,
+    ) Token {
+        if (self.match(expected)) {
+            return self.makeToken(type_match);
+        } else {
+            return self.makeToken(type_nomatch);
+        }
+    }
+
+    /// "triple" version of makeMatchToken.
+    /// Required for vlang's tokens.
+    fn makeTripleMatchToken(
+        self: *Scanner,
+        char1: u8,
+        ttype1: TokenType,
+        char2: u8,
+        ttype2: TokenType,
+        fallback: TokenType,
+    ) Token {
+        if (self.match(char1)) {
+            return self.makeToken(ttype1);
+        } else if (self.match(char2)) {
+            return self.makeToken(ttype2);
+        } else {
+            return self.makeToken(fallback);
+        }
+    }
+
+    /// Peek at the character that was just consumed.
+    fn peek(self: *Scanner) u8 {
+        if (self.isAtEnd()) return 0;
+        if (self.current == 0) return 0;
+        return self.source[self.current - 1];
+    }
+
+    /// Peek at the next character to be consumed.
+    fn peekNext(self: *Scanner) u8 {
+        if (self.current + 1 > self.source.len) return 0;
+        return self.source[self.current];
+    }
+
+    /// Consume a number.
+    /// Returns either an Integer or a Float token. Proper typing
+    /// of the number (i32 i64 u32 u64 f32 f64) is left to the parser.
+    fn doNumber(self: *Scanner) Token {
+        var ttype = TokenType.Integer;
+
+        while (isDigit(self.peekNext())) {
+            _ = self.advance();
+        }
+
+        // check if it's a number like 12.34: the next character must be
+        // a '.' and the one after it a digit.
+        if (self.peekNext() == '.' and
+            self.current + 1 < self.source.len and
+            isDigit(self.source[self.current + 1]))
+        {
+            ttype = TokenType.Float;
+
+            // consume the '.' and the fractional digits
+            _ = self.advance();
+            while (isDigit(self.peekNext())) {
+                _ = self.advance();
+            }
+        }
+
+        return self.makeToken(ttype);
+    }
+
+    /// Consume a string. stop_char is used to determine
+    /// if the string is a single quote or double quote string.
+    fn doString(self: *Scanner, stop_char: u8) !Token {
+        // consume the entire string
+        while (self.peekNext() != stop_char and !self.isAtEnd()) {
+            if (self.peek() == '\n') self.line += 1;
+            _ = self.advance();
+        }
+
+        // unterminated string.
+        if (self.isAtEnd()) {
+            return ScannerError.Unterminated;
+        }
+
+        // the closing character of the string
+        _ = self.advance();
+
+        // remove the starting and ending chars of the string
+        const lexeme = self.currentLexeme();
+        return self.makeTokenLexeme(
+            .String,
+            lexeme[1 .. lexeme.len - 1],
+        );
+    }
+
+    /// Either a keyword or an identifier comes out of this.
+    fn doIdentifier(self: *Scanner) Token {
+        while (isAlphaNumeric(self.peek())) {
+            _ = self.advance();
+        }
+
+        // ugly hack.
+        self.rollback();
+
+        // after reading the identifier, we check if it is any of our
+        // keywords. if it is, we use the specified keyword type.
+        // if not, just .Identifier
+        var toktype: TokenType = undefined;
+        var ttype_opt = getKeyword(self.currentLexeme());
+
+        if (ttype_opt) |ttype| {
+            toktype = ttype;
+        } else {
+            toktype = TokenType.Identifier;
+        }
+
+        return self.makeToken(toktype);
+    }
+
+    pub fn nextToken(self: *Scanner) !?Token {
+        self.start = self.current;
+
+        if (self.isAtEnd()) return self.makeToken(TokenType.EOF);
+
+        var c = self.advance();
+        if (isDigit(c)) return self.doNumber();
+        if (isAlpha(c)) return self.doIdentifier();
+
+        var token: ?Token = switch (c) {
+            '(' => self.makeToken(.LeftParen),
+            ')' => self.makeToken(.RightParen),
+            '{' => self.makeToken(.LeftBrace),
+            '}' => self.makeToken(.RightBrace),
+            '[' => self.makeToken(.LeftSquare),
+            ']' => self.makeToken(.RightSquare),
+            '.' => self.makeToken(.Dot),
+            ';' => self.makeToken(.Semicolon),
+            ',' => self.makeToken(.Comma),
+            '?' => self.makeToken(.QuestionMark),
+            '$' => self.makeToken(.DollarSign),
+
+            '%' => self.makeToken(.Modulo),
+
+            ':' => self.makeMatchToken('=', .ColonEqual, .Colon),
+            '*' => self.makeMatchToken('=', .StarEqual, .Star),
+            '-' => self.makeMatchToken('=', .MinusEqual, .Minus),
+
+            // we reuse the existing .And and .Or tokens (which
+            // represent the `and` and `or` keywords) to also
+            // cover && and ||
+            '&' => self.makeMatchToken('&', .And, .Address),
+            '|' => self.makeMatchToken('|', .Or, .Pipe),
+
+            '!' => self.makeMatchToken('=', .BangEqual, .Bang),
+            '=' => self.makeMatchToken('=', .EqualEqual, .Equal),
+            '>' => self.makeMatchToken('=', .GreaterEqual, .Greater),
+            '+' => self.makeTripleMatchToken('+', .PlusPlus, '=', .PlusEqual, .Plus),
+            '<' => self.makeTripleMatchToken('=', .LessEqual, '<', .LeftDoubleChevron, .Less),
+
+            '/' => blk: {
+                var next = self.peekNext();
+
+                switch (next) {
+                    '=' => {
+                        self.current += 1;
+                        return self.makeToken(.SlashEqual);
+                    },
+
+                    // line comment: skip everything up to the newline,
+                    // leaving the '\n' itself for the next scan so the
+                    // line counter stays correct.
+                    '/' => {
+                        while (self.peekNext() != '\n' and !self.isAtEnd()) {
+                            _ = self.advance();
+                        }
+
+                        return null;
+                    },
+
+                    // block comment: skip until the closing "*/".
+                    '*' => {
+                        while ((self.peek() != '*' or self.peekNext() != '/') and !self.isAtEnd()) {
+                            _ = self.advance();
+                        }
+
+                        // unterminated block comment
+                        if (self.isAtEnd()) {
+                            return ScannerError.Unterminated;
+                        }
+
+                        // consume the ending slash
+                        _ = self.advance();
+                        return null;
+                    },
+
+                    else => break :blk self.makeToken(.Slash),
+                }
+            },
+
+            '\'' => try self.doString('\''),
+            '"' => try self.doString('"'),
+
+            ' ', '\r', '\t' => null,
+            '\n' => blk: {
+                self.line += 1;
+                break :blk null;
+            },
+
+            else => return ScannerError.Unexpected,
+        };
+
+        return token;
+    }
+};
diff --git a/src/tokens.zig b/src/tokens.zig
new file mode 100644
index 0000000..a3811cc
--- /dev/null
+++ b/src/tokens.zig
@@ -0,0 +1,89 @@
+pub const TokenType = enum {
+    // basic tokens
+    LeftParen,
+    RightParen,
+    LeftBrace,
+    RightBrace,
+    LeftSquare,
+    RightSquare,
+    Dot,
+    Equal,
+    Semicolon,
+    Comma,
+    Colon,
+    Address,
+    Pipe,
+    QuestionMark,
+    DollarSign,
+
+    // math operators
+    Plus,
+    Minus,
+    Star,
+    Slash,
+    Modulo,
+
+    // one- and two-char tokens
+    DotEqual,
+    LeftDoubleChevron, // AKA "<<"
+    PlusPlus,
+    PlusEqual,
+    MinusEqual,
+    ColonEqual,
+    StarEqual,
+    SlashEqual,
+
+    // comparison ones
+    EqualEqual,
+    Less,
+    LessEqual,
+    Greater,
+    GreaterEqual,
+    Bang,
+    BangEqual,
+
+    // complex types
+    Integer,
+    Float,
+    String,
+    Identifier,
+
+    // keywords
+    Break,
+    Const,
+    Continue,
+    Defer,
+    Else,
+    Enum,
+    Fn,
+    For,
+    Loop,
+    Go,
+    Goto,
+    If,
+    Import,
+    In,
+    Interface,
+    Match,
+    Module,
+    Mut,
+    Or,
+    And,
+    Return,
+    Struct,
+    Type,
+    True,
+    False,
+    None,
+
+    Println,
+    Pub,
+
+    EOF,
+};
+
+pub const Token = struct {
+    typ: TokenType,
+    lexeme: []const u8,
+    line: usize,
+};
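
A minimal usage sketch of the new Scanner, mirroring the loop the patch adds to
run() in src/main.zig. The dumpTokens helper is hypothetical (it is not part of
the patch), and the std.debug.warn format arguments assume the 2019-era varargs
std.fmt used elsewhere in this diff.

const std = @import("std");
const scanners = @import("scanners.zig");

/// Hypothetical helper: scan a source buffer and print every token.
/// nextToken() returns null for skipped whitespace and comments,
/// a Token otherwise, and an error for invalid input.
fn dumpTokens(allocator: *std.mem.Allocator, source: []const u8) !void {
    var scan = scanners.Scanner.init(allocator, source);

    while (true) {
        var tok_opt = try scan.nextToken();

        if (tok_opt) |tok| {
            // a token with type .EOF marks the end of the source
            if (tok.typ == .EOF) break;
            std.debug.warn("{} '{}' (line {})\n", tok.typ, tok.lexeme, tok.line);
        }
    }
}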