copy-paste vig's scanner into rayoko

2019-09-18 14:30:23 -03:00 · 2019-09-18 14:30:23 -03:00 · 93e7d9db7b
commit 93e7d9db7b
parent 05509d1181
3 changed files with 481 additions and 1 deletions
--- a/src/main.zig
+++ b/src/main.zig
@ -1,5 +1,7 @@
 const std = @import("std");
 const scanners = @import("scanners.zig");
 pub const Result = enum {
    Ok,
    TokenizeError,
@ -8,6 +10,34 @@ pub const Result = enum {
 };
 /// Tokenize `slice` and dump every token to stderr.
 ///
 /// Returns `Result.TokenizeError` as soon as the scanner reports an error
 /// (after printing the lexeme being scanned and the error itself), and
 /// `Result.Ok` once the EOF token has been reached.
 pub fn run(allocator: *std.mem.Allocator, slice: []const u8) Result {
    var scanner = scanners.Scanner.init(allocator, slice);
    // Planned: after this full scan pass, reset the scanner and run a
    // second pass that feeds a parser.
    while (true) {
        const maybe_token = scanner.nextToken() catch |err| {
            std.debug.warn(
                "error at '{}': {}\n",
                scanner.currentLexeme(),
                err,
            );
            return Result.TokenizeError;
        };
        // a null result means the scanner consumed input that yields no token
        const token = maybe_token orelse continue;
        if (token.typ == .EOF) break;
        // TODO remove this debug dump
        std.debug.warn("{x}\n", token);
    }
    return Result.Ok;
 }
@ -28,8 +58,8 @@ pub fn main() anyerror!void {
    _ = try file.read(slice);
    //switch (try run(allocator, slice)) {
    const result = run(allocator, slice);
    //const result = try run(allocator, slice);
    switch (result) {
        .Ok => std.os.exit(0),
--- a/src/scanners.zig
+++ b/src/scanners.zig
@ -0,0 +1,361 @@
 const std = @import("std");
 const tokens = @import("tokens.zig");
 const Allocator = std.mem.Allocator;
 const Token = tokens.Token;
 const TokenType = tokens.TokenType;
 /// Errors the scanner can report while tokenizing.
 pub const ScannerError = error{
    /// A character was read that does not start any known token.
    Unexpected,
    /// The input ended before a construct was closed
    /// (e.g. a string literal with no closing quote).
    Unterminated,
 };
 /// True when `char` is an ASCII decimal digit.
 fn isDigit(char: u8) bool {
    return switch (char) {
        '0'...'9' => true,
        else => false,
    };
 }
 /// True when `c` may start an identifier: an ASCII letter or underscore.
 fn isAlpha(c: u8) bool {
    return switch (c) {
        'a'...'z', 'A'...'Z', '_' => true,
        else => false,
    };
 }
 /// True when `char` may appear inside an identifier
 /// (a letter, an underscore or a decimal digit).
 fn isAlphaNumeric(char: u8) bool {
    if (isAlpha(char)) return true;
    return isDigit(char);
 }
 /// Reserved words recognized by the scanner.
 ///
 /// `getKeyword` compares an identifier against each entry and, on a match
 /// at index `i`, returns `keyword_ttypes[i]`; the two tables must therefore
 /// list their entries in the same order.
 const keywords = [_][]const u8{
    "break",
    "const",
    "continue",
    "defer",
    "else",
    "enum",
    "fn",
    "for",
    "go",
    "goto",
    "if",
    "import",
    "in",
    "interface",
    "match",
    "module",
    "mut",
    "or",
    "return",
    "struct",
    "type",
    "true",
    "false",
    "None",
    "println",
    "loop",
    "pub",
 };
 /// Token type for each entry of `keywords`, at the same index.
 ///
 /// `getKeyword` returns `keyword_ttypes[i]` when its argument equals
 /// `keywords[i]`, so any entry added here needs a matching entry at the
 /// same position in `keywords`.
 const keyword_ttypes = [_]TokenType{
    .Break,
    .Const,
    .Continue,
    .Defer,
    .Else,
    .Enum,
    .Fn,
    .For,
    .Go,
    .Goto,
    .If,
    .Import,
    .In,
    .Interface,
    .Match,
    .Module,
    .Mut,
    .Or,
    .Return,
    .Struct,
    .Type,
    .True,
    .False,
    .None,
    .Println,
    .Loop,
    .Pub,
 };
 /// Look `keyword` up in the keyword table.
 ///
 /// Returns the token type stored at the matching index of
 /// `keyword_ttypes`, or null when `keyword` is not a reserved word.
 fn getKeyword(keyword: []const u8) ?TokenType {
    var i: usize = 0;
    while (i < keywords.len) : (i += 1) {
        if (!std.mem.eql(u8, keyword, keywords[i])) continue;
        return keyword_ttypes[i];
    }
    return null;
 }
 /// Scanner for vlang tokens.
 ///
 /// Walks `source` one byte at a time. `start` is the index of the first
 /// byte of the lexeme being scanned, `current` is the index of the next
 /// unconsumed byte, and `line` is the 1-based line counter copied into
 /// every token that is produced.
 pub const Scanner = struct {
    allocator: *Allocator,
    source: []const u8,
    start: usize = 0,
    current: usize = 0,
    line: usize = 1,
    /// Create a scanner positioned at the beginning of `source`.
    /// `source` is borrowed: token lexemes are slices into it.
    pub fn init(allocator: *Allocator, source: []const u8) Scanner {
        return Scanner{ .allocator = allocator, .source = source };
    }
    /// True once every byte of `source` has been consumed.
    fn isAtEnd(self: *Scanner) bool {
        return self.current >= self.source.len;
    }
    /// Consume one byte and return it.
    /// Callers must make sure the scanner is not at the end of input.
    fn advance(self: *Scanner) u8 {
        self.current += 1;
        return self.source[self.current - 1];
    }
    /// The bytes consumed so far for the token being scanned.
    pub fn currentLexeme(self: *Scanner) []const u8 {
        return self.source[self.start..self.current];
    }
    /// Build a token of type `ttype` from the current lexeme and line.
    fn makeToken(self: *Scanner, ttype: TokenType) Token {
        return Token{
            .typ = ttype,
            .lexeme = self.currentLexeme(),
            .line = self.line,
        };
    }
    /// Build a token of type `ttype` with an explicit `lexeme`
    /// instead of the current one.
    fn makeTokenLexeme(
        self: *Scanner,
        ttype: TokenType,
        lexeme: []const u8,
    ) Token {
        return Token{
            .typ = ttype,
            .lexeme = lexeme,
            .line = self.line,
        };
    }
    /// Consume the next byte only if it equals `expected`.
    /// Returns whether the byte was consumed.
    fn match(self: *Scanner, expected: u8) bool {
        if (self.isAtEnd()) return false;
        if (self.source[self.current] != expected) return false;
        self.current += 1;
        return true;
    }
    /// Make a token of `type_match` if the next byte is `expected`
    /// (consuming it), or a token of `type_nomatch` otherwise.
    fn makeMatchToken(
        self: *Scanner,
        expected: u8,
        type_match: TokenType,
        type_nomatch: TokenType,
    ) Token {
        const ttype = if (self.match(expected)) type_match else type_nomatch;
        return self.makeToken(ttype);
    }
    /// "triple" version of makeMatchToken: tries `char1`, then `char2`,
    /// and falls back to `fallback` when neither follows.
    /// Required per vlang's tokens.
    fn makeTripleMatchToken(
        self: *Scanner,
        char1: u8,
        ttype1: TokenType,
        char2: u8,
        ttype2: TokenType,
        fallback: TokenType,
    ) Token {
        const ttype = if (self.match(char1))
            ttype1
        else if (self.match(char2))
            ttype2
        else
            fallback;
        return self.makeToken(ttype);
    }
    /// Look at the next unconsumed byte without consuming it.
    /// Returns 0 at the end of input.
    fn peek(self: *Scanner) u8 {
        if (self.isAtEnd()) return 0;
        return self.source[self.current];
    }
    /// Look at the byte after the next unconsumed one without consuming
    /// anything. Returns 0 when there is no such byte.
    fn peekNext(self: *Scanner) u8 {
        if (self.current + 1 >= self.source.len) return 0;
        return self.source[self.current + 1];
    }
    /// Consume a number; its first digit has already been consumed.
    /// Returns either an Integer or a Float token. Proper typing
    /// of the number (i32 i64 u32 u64 f32 f64) are for the parser.
    fn doNumber(self: *Scanner) Token {
        var ttype = TokenType.Integer;
        while (isDigit(self.peek())) {
            _ = self.advance();
        }
        // a number like 12.34: the '.' only belongs to the number when
        // a digit follows it, otherwise it is left for a Dot token.
        if (self.peek() == '.' and isDigit(self.peekNext())) {
            ttype = TokenType.Float;
            // consume the '.'
            _ = self.advance();
            while (isDigit(self.peek())) {
                _ = self.advance();
            }
        }
        return self.makeToken(ttype);
    }
    /// Consume a string; its opening quote has already been consumed.
    /// `stop_char` is the quote character that closes the string
    /// (single or double quote). Newlines inside the string bump `line`.
    /// Returns ScannerError.Unterminated if the input ends first.
    fn doString(self: *Scanner, stop_char: u8) !Token {
        // consume everything up to (not including) the closing quote
        while (!self.isAtEnd() and self.peek() != stop_char) {
            if (self.advance() == '\n') self.line += 1;
        }
        // unterminated string.
        if (self.isAtEnd()) {
            return ScannerError.Unterminated;
        }
        // the closing character of the string
        _ = self.advance();
        // remove the starting and ending chars of the string
        const lexeme = self.currentLexeme();
        return self.makeTokenLexeme(
            .String,
            lexeme[1 .. lexeme.len - 1],
        );
    }
    /// Consume an identifier; its first character has already been
    /// consumed. Either a keyword or an identifier come out of this.
    fn doIdentifier(self: *Scanner) Token {
        // only consume while the upcoming byte belongs to the identifier,
        // so nothing has to be un-consumed afterwards and an identifier
        // that ends exactly at the end of input keeps its last byte.
        while (isAlphaNumeric(self.peek())) {
            _ = self.advance();
        }
        // keywords get their dedicated type, everything else
        // is a plain identifier.
        const toktype = getKeyword(self.currentLexeme()) orelse TokenType.Identifier;
        return self.makeToken(toktype);
    }
    /// Handle everything that can start with '/'; the slash itself has
    /// already been consumed. Returns a SlashEqual or Slash token, or
    /// null after skipping a comment. An unclosed block comment yields
    /// ScannerError.Unterminated.
    fn doSlash(self: *Scanner) !?Token {
        if (self.match('=')) return self.makeToken(.SlashEqual);
        if (self.match('/')) {
            // line comment: stop right before the newline so that
            // nextToken sees it and bumps the line counter.
            while (!self.isAtEnd() and self.peek() != '\n') {
                _ = self.advance();
            }
            return null;
        }
        if (self.match('*')) {
            // block comment: skip until the closing "*/",
            // counting the lines it spans.
            while (!self.isAtEnd()) {
                const ch = self.advance();
                if (ch == '\n') {
                    self.line += 1;
                } else if (ch == '*' and self.match('/')) {
                    return null;
                }
            }
            return ScannerError.Unterminated;
        }
        return self.makeToken(.Slash);
    }
    /// Scan the next token.
    ///
    /// Returns an EOF token once the input is exhausted, and null when
    /// the consumed input produces no token (whitespace, a newline or a
    /// comment); callers are expected to simply call again in that case.
    /// Fails with ScannerError.Unexpected on a character that starts no
    /// token and ScannerError.Unterminated on an unclosed string or
    /// block comment.
    pub fn nextToken(self: *Scanner) !?Token {
        self.start = self.current;
        if (self.isAtEnd()) return self.makeToken(TokenType.EOF);
        const c = self.advance();
        if (isDigit(c)) return self.doNumber();
        if (isAlpha(c)) return self.doIdentifier();
        const token: ?Token = switch (c) {
            '(' => self.makeToken(.LeftParen),
            ')' => self.makeToken(.RightParen),
            '{' => self.makeToken(.LeftBrace),
            '}' => self.makeToken(.RightBrace),
            '[' => self.makeToken(.LeftSquare),
            ']' => self.makeToken(.RightSquare),
            '.' => self.makeToken(.Dot),
            ';' => self.makeToken(.Semicolon),
            ',' => self.makeToken(.Comma),
            '?' => self.makeToken(.QuestionMark),
            '$' => self.makeToken(.DollarSign),
            '%' => self.makeToken(.Modulo),
            ':' => self.makeMatchToken('=', .ColonEqual, .Colon),
            '*' => self.makeMatchToken('=', .StarEqual, .Star),
            '-' => self.makeMatchToken('=', .MinusEqual, .Minus),
            // "&&" and "||" reuse the existing .And and .Or
            // token types instead of getting their own.
            '&' => self.makeMatchToken('&', .And, .Address),
            '|' => self.makeMatchToken('|', .Or, .Pipe),
            '!' => self.makeMatchToken('=', .BangEqual, .Bang),
            '=' => self.makeMatchToken('=', .EqualEqual, .Equal),
            '>' => self.makeMatchToken('=', .GreaterEqual, .Greater),
            '+' => self.makeTripleMatchToken('+', .PlusPlus, '=', .PlusEqual, .Plus),
            '<' => self.makeTripleMatchToken('=', .LessEqual, '<', .LeftDoubleChevron, .Less),
            '/' => try self.doSlash(),
            '\'' => try self.doString('\''),
            '"' => try self.doString('"'),
            ' ', '\r', '\t' => null,
            '\n' => blk: {
                self.line += 1;
                break :blk null;
            },
            else => return ScannerError.Unexpected,
        };
        return token;
    }
 };
--- a/src/tokens.zig
+++ b/src/tokens.zig
@ -0,0 +1,89 @@
 /// Every kind of token the scanner can hand out.
 pub const TokenType = enum {
    // basic tokens
    LeftParen,
    RightParen,
    LeftBrace,
    RightBrace,
    LeftSquare,
    RightSquare,
    Dot,
    Equal,
    Semicolon,
    Comma,
    Colon,
    Address,
    Pipe,
    QuestionMark,
    DollarSign,
    // math operators
    Plus,
    Minus,
    Star,
    Slash,
    Modulo,
    // one-two char tokens
    // NOTE(review): DotEqual is declared but the scanner in scanners.zig
    // has no rule that produces it.
    DotEqual,
    LeftDoubleChevron, // AKA "<<"
    PlusPlus,
    PlusEqual,
    MinusEqual,
    ColonEqual,
    StarEqual,
    SlashEqual,
    // comparison ones
    EqualEqual,
    Less,
    LessEqual,
    Greater,
    GreaterEqual,
    Bang,
    BangEqual,
    // complex types
    Integer,
    Float,
    String,
    Identifier,
    // keywords
    Break,
    Const,
    Continue,
    Defer,
    Else,
    Enum,
    Fn,
    For,
    Loop,
    Go,
    Goto,
    If,
    Import,
    In,
    Interface,
    Match,
    Module,
    Mut,
    // Or and And are also what the scanner emits for "||" and "&&".
    Or,
    And,
    Return,
    Struct,
    Type,
    True,
    False,
    None,
    Println,
    Pub,
    // emitted by the scanner once the whole input has been consumed
    EOF,
 };
 /// A single token produced by the scanner.
 pub const Token = struct {
    /// What kind of token this is.
    typ: TokenType,
    /// The text of the token, as a slice of the scanned source.
    /// For String tokens the scanner passes the text without the
    /// surrounding quote characters.
    lexeme: []const u8,
    /// Value of the scanner's line counter when the token was created.
    line: usize,
 };