// c-repl/src/tokenize.cpp

#include <cctype>
#include <iostream>
#include <regex>
#include <string>
#include <vector>
#include "include/errors.h"
#include "include/tokenize.h"
using namespace std;

// Patterns for the token classes recognised by regular expression (std::regex
// default ECMAScript grammar). tokenize() applies them with match_continuous,
// so each one can only match at the current scan position.

// One or more decimal digits.
regex INT_REGEX ("\\d+");
// Either digits, a dot and optional digits ("1.", "1.5"),
// or optional digits, a dot and at least one digit (".5").
regex DOUBLE_REGEX ("\\d+\\.\\d*|\\d*\\.\\d+");
// A letter or underscore followed by any number of word characters.
regex IDENTIFIER_REGEX ("[A-Za-z_]\\w*");

// Fixed-spelling tokens (keywords, operators, punctuation) paired with their
// token type.
//
// tokenize() scans this table from top to bottom and takes the first entry
// whose text is a prefix of the remaining input. ORDER THEREFORE MATTERS:
// a multi-character operator must appear before any entry that is a prefix of
// it ("++" before "+", "==" before "=", "<=" before "<", "!=" before "!"),
// otherwise the shorter entry would always win.
vector<tuple<string, TokenType>> simpleTokens = {
    // Keywords.
    { "if", TokenType::If },
    { "else", TokenType::Else },
    { "while", TokenType::While },
    { "for", TokenType::For },
    { "break", TokenType::Break },
    { "continue", TokenType::Continue },
    { "return", TokenType::Return },
    // Two-character operators (must stay above the single-character ones).
    { "++", TokenType::DoublePlus },
    { "--", TokenType::DoubleMinus },
    { "==", TokenType::DoubleEqual },
    { "&&", TokenType::Land },
    { "||", TokenType::Lor },
    { "<=", TokenType::Leq },
    { ">=", TokenType::Geq },
    { "!=", TokenType::NotEqual },
    // Single-character operators.
    { "<", TokenType::Lt },
    { ">", TokenType::Gt },
    { "!", TokenType::Not },
    { "+", TokenType::Plus },
    { "-", TokenType::Minus },
    { "*", TokenType::Star },
    { "/", TokenType::Slash },
    { "%", TokenType::Percent },
    { "=", TokenType::Equal },
    // Punctuation.
    { ";", TokenType::Semicolon },
    { "(", TokenType::LParenthese },
    { ")", TokenType::RParenthese },
    { "{", TokenType::LCurlyBracket },
    { "}", TokenType::RCurlyBracket },
    { ",", TokenType::Comma }
};

/**
 * Returns the enumerator name of a token type as text, for debugging output.
 *
 * @param type token type to name
 * @return the enumerator's spelling, or "Unknown" when the type has no entry
 *         in the table below
 */
string _debug_get_token_type_name(TokenType type) {
    struct NamedType {
        TokenType type;
        const char* name;
    };

    static const NamedType kNames[] = {
        { TokenType::Identifier, "Identifier" },
        { TokenType::Litteral, "Litteral" },
        { TokenType::Plus, "Plus" },
        { TokenType::Minus, "Minus" },
        { TokenType::DoublePlus, "DoublePlus" },
        { TokenType::DoubleMinus, "DoubleMinus" },
        { TokenType::DoubleEqual, "DoubleEqual" },
        { TokenType::Land, "Land" },
        { TokenType::Lor, "Lor" },
        { TokenType::Lt, "Lt" },
        { TokenType::Gt, "Gt" },
        { TokenType::Leq, "Leq" },
        { TokenType::Geq, "Geq" },
        { TokenType::NotEqual, "NotEqual" },
        { TokenType::Not, "Not" },
        { TokenType::Star, "Star" },
        { TokenType::Slash, "Slash" },
        { TokenType::Percent, "Percent" },
        { TokenType::Equal, "Equal" },
        { TokenType::Semicolon, "Semicolon" },
        { TokenType::LParenthese, "LParenthese" },
        { TokenType::RParenthese, "RParenthese" },
        { TokenType::LCurlyBracket, "LCurlyBracket" },
        { TokenType::RCurlyBracket, "RCurlyBracket" },
        { TokenType::If, "If" },
        { TokenType::Else, "Else" },
        { TokenType::While, "While" },
        { TokenType::For, "For" },
        { TokenType::Break, "Break" },
        { TokenType::Continue, "Continue" },
        { TokenType::Return, "Return" },
        { TokenType::Comma, "Comma" },
    };

    for (const NamedType& entry : kNames) {
        if (entry.type == type) {
            return entry.name;
        }
    }
    return "Unknown";
}

/**
 * Writes a short human-readable form of one token to standard output.
 *
 * Literals print as "Litteral(<value>)", identifiers as "Identifier(<name>)",
 * operators and brackets as their source spelling, and keywords and the comma
 * as their type name. Nothing is printed for a literal that holds neither an
 * int nor a double, nor for a token type that is not handled here.
 *
 * @param token token to print
 */
void _debug_print_token(Token token) {
    // Tokens that carry a payload are handled first.
    if (token.type == TokenType::Litteral) {
        if (const int* as_int = get_if<int>(&token.data)) {
            cout << "Litteral(" << *as_int << ")";
        } else if (const double* as_double = get_if<double>(&token.data)) {
            cout << "Litteral(" << *as_double << ")";
        }
        return;
    }
    if (token.type == TokenType::Identifier) {
        cout << "Identifier(" << get<string>(token.data) << ")";
        return;
    }

    // Every remaining token type prints as a fixed piece of text.
    const char* text = nullptr;
    switch (token.type) {
        case TokenType::Plus:          text = "+";        break;
        case TokenType::Minus:         text = "-";        break;
        case TokenType::DoublePlus:    text = "++";       break;
        case TokenType::DoubleMinus:   text = "--";       break;
        case TokenType::DoubleEqual:   text = "==";       break;
        case TokenType::Land:          text = "&&";       break;
        case TokenType::Lor:           text = "||";       break;
        case TokenType::Lt:            text = "<";        break;
        case TokenType::Gt:            text = ">";        break;
        case TokenType::Leq:           text = "<=";       break;
        case TokenType::Geq:           text = ">=";       break;
        case TokenType::NotEqual:      text = "!=";       break;
        case TokenType::Not:           text = "!";        break;
        case TokenType::Star:          text = "*";        break;
        case TokenType::Slash:         text = "/";        break;
        case TokenType::Percent:       text = "%";        break;
        case TokenType::Equal:         text = "=";        break;
        case TokenType::Semicolon:     text = ";";        break;
        case TokenType::LParenthese:   text = "(";        break;
        case TokenType::RParenthese:   text = ")";        break;
        case TokenType::LCurlyBracket: text = "{";        break;
        case TokenType::RCurlyBracket: text = "}";        break;
        case TokenType::If:            text = "If";       break;
        case TokenType::Else:          text = "Else";     break;
        case TokenType::While:         text = "While";    break;
        case TokenType::For:           text = "For";      break;
        case TokenType::Break:         text = "Break";    break;
        case TokenType::Continue:      text = "Continue"; break;
        case TokenType::Return:        text = "Return";   break;
        case TokenType::Comma:         text = "Comma";    break;
        default:                                          break;
    }

    if (text != nullptr) {
        cout << text;
    }
}

void _debug_print_tokens(vector<Token> tokens) {
    for (Token token : tokens) {
        _debug_print_token(token);
        cout << " ";
    }
    cout << endl;
}

vector<Token> tokenize(vector<string> input, int initial_line) {
    vector<Token> tokens;

    for (int i = initial_line; i < int(input.size()); i++) {
        string line = input[i];
        int j = 0;

        while (j < int(line.length())) {
            CodePosition pos = { .line = i, .column = j };
            string str = line.substr(j, string::npos);
            smatch m;

            if (regex_search(str, m, DOUBLE_REGEX, regex_constants::match_continuous)) {
                Token token = {
                    .type = TokenType::Litteral,
                    .data = stod(m.str()),
                    .pos = pos
                };
                tokens.emplace_back(token);
                j += m.str().length();
                continue;
            }

            if (regex_search(str, m, INT_REGEX, regex_constants::match_continuous)) {
                int val;
                try { val = stoi(m.str()); }
                catch (const out_of_range& e) {
                    throw SyntaxError(ErrorType::IntegerTooLarge, pos);
                }

                Token token = {
                    .type = TokenType::Litteral,
                    .data = val,
                    .pos = pos
                };
                tokens.emplace_back(token);
                j += m.str().length();
                continue;
            }

            bool matched = false;
            for (auto simpleToken: simpleTokens) {
                if (str.starts_with(get<0>(simpleToken))) {
                    Token token = { .type = get<1>(simpleToken), .pos = pos };
                    tokens.emplace_back(token);
                    j += get<0>(simpleToken).length();
                    matched = true;
                    break;
                }
            }
            if (matched) continue;

            if (regex_search(str, m, IDENTIFIER_REGEX, regex_constants::match_continuous)) {
                Token token = {
                    .type = TokenType::Identifier,
                    .data = m.str(),
                    .pos = pos
                };
                tokens.emplace_back(token);
                j += m.str().length();
                continue;
            }

            if (isspace(str[0])) {
                j += 1;
                continue;
            }

            throw SyntaxError(ErrorType::UnknownToken, pos);
        }
    }

    return tokens;
}