#include #include #include #include #include "include/tokenize.h" using namespace std; regex NUMBER_REGEX ("\\d+(\\.\\d+)?"); regex TYPE_INT_REGEX ("int\\s"); regex IDENTIFIER_REGEX ("[A-Za-z_]\\w*"); void _debug_print_token(Token token) { switch (token.type) { case TokenType::Type: cout << "Type(INT)"; break; case TokenType::Int: cout << "Number(" << get(token.data) << ")"; break; case TokenType::Identifier: cout << "Identifier(" << get(token.data) << ")"; break; case TokenType::Plus: cout << "+"; break; case TokenType::Minus: cout << "-"; break; case TokenType::DoublePlus: cout << "++"; break; case TokenType::DoubleMinus: cout << "--"; break; case TokenType::Star: cout << "*"; break; case TokenType::Slash: cout << "/"; break; case TokenType::Percent: cout << "%"; break; case TokenType::Equal: cout << "="; break; case TokenType::Semicolon: cout << ";"; break; case TokenType::LParenthese: cout << "("; break; case TokenType::RParenthese: cout << ")"; break; } } void _debug_print_tokens(vector tokens) { for (Token token : tokens) { _debug_print_token(token); cout << " "; } cout << endl; } vector tokenize(vector input) { vector tokens; for (int i = 0; i < int(input.size()); i++) { string line = input[i]; int j = 0; while (j < int(line.length())) { CodePosition pos = { .line = i, .column = j }; string str = line.substr(j, string::npos); smatch m; if (regex_search(str, m, NUMBER_REGEX, regex_constants::match_continuous)) { Token token = { .type = TokenType::Int, .data = stoi(m.str()), .pos = pos }; tokens.emplace_back(token); j += m.str().length(); } else if (regex_search(str, m, TYPE_INT_REGEX, regex_constants::match_continuous)) { Token token = { .type = TokenType::Type, .data = Type::Int, .pos = pos }; tokens.emplace_back(token); j += m.str().length(); } else if (regex_search(str, m, IDENTIFIER_REGEX, regex_constants::match_continuous)) { Token token = { .type = TokenType::Identifier, .data = m.str(), .pos = pos }; tokens.emplace_back(token); j += m.str().length(); } else if (str.length() >= 2 && str[0] == '+' && str[1] == '+') { Token token = { .type = TokenType::DoublePlus, .pos = pos }; tokens.emplace_back(token); j += 2; } else if (str.length() >= 2 && str[0] == '-' && str[1] == '-') { Token token = { .type = TokenType::DoubleMinus, .pos = pos }; tokens.emplace_back(token); j += 2; } else if (str[0] == '+') { Token token = { .type = TokenType::Plus, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (str[0] == '-') { Token token = { .type = TokenType::Minus, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (str[0] == '*') { Token token = { .type = TokenType::Star, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (str[0] == '/') { Token token = { .type = TokenType::Slash, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (str[0] == '%') { Token token = { .type = TokenType::Percent, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (str[0] == '=') { Token token = { .type = TokenType::Equal, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (str[0] == ';') { Token token = { .type = TokenType::Semicolon, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (str[0] == '(') { Token token = { .type = TokenType::LParenthese, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (str[0] == ')') { Token token = { .type = TokenType::RParenthese, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (isspace(str[0]) || str[0] == '\0') { j += 1; } else { throw TokenError("Unknown token {}", pos); } } } return tokens; }