#include #include #include #include #include "include/tokenize.h" using namespace std; regex NUMBER_REGEX ("\\d+(\\.\\d+)?"); regex TYPE_INT_REGEX ("int(\\s|$)"); regex IDENTIFIER_REGEX ("[A-Za-z_]\\w*"); void _debug_print_token(Token token) { switch (token.type) { case TokenType::Type: cout << "Type(INT)"; break; case TokenType::Int: cout << "Number(" << get(token.data) << ")"; break; case TokenType::Identifier: cout << "Identifier(" << get(token.data) << ")"; break; case TokenType::Plus: cout << "+"; break; case TokenType::Minus: cout << "-"; break; case TokenType::DoublePlus: cout << "++"; break; case TokenType::DoubleMinus: cout << "--"; break; case TokenType::DoubleEqual: cout << "=="; break; case TokenType::Land: cout << "&&"; break; case TokenType::Lor: cout << "||"; break; case TokenType::Lt: cout << "<"; break; case TokenType::Gt: cout << ">"; break; case TokenType::Leq: cout << "<="; break; case TokenType::Geq: cout << ">="; break; case TokenType::NotEqual: cout << "!="; break; case TokenType::Not: cout << "!"; break; case TokenType::Star: cout << "*"; break; case TokenType::Slash: cout << "/"; break; case TokenType::Percent: cout << "%"; break; case TokenType::Equal: cout << "="; break; case TokenType::Semicolon: cout << ";"; break; case TokenType::LParenthese: cout << "("; break; case TokenType::RParenthese: cout << ")"; break; case TokenType::LCurlyBracket: cout << "{"; break; case TokenType::RCurlyBracket: cout << "}"; break; case TokenType::If: cout << "If"; break; case TokenType::Else: cout << "Else"; break; case TokenType::While: cout << "While"; break; case TokenType::For: cout << "For"; break; } } string _debug_print_token_type(TokenType type) { switch (type) { case TokenType::Type: return "Type"; case TokenType::Identifier: return "Identifier"; case TokenType::Int: return "Int"; case TokenType::Plus: return "Plus"; case TokenType::Minus: return "Minus"; case TokenType::DoublePlus: return "DoublePlus"; case TokenType::DoubleMinus: return "DoubleMinus"; case TokenType::DoubleEqual: return "DoubleEqual"; case TokenType::Land: return "Land"; case TokenType::Lor: return "Lor"; case TokenType::Lt: return "Lt"; case TokenType::Gt: return "Gt"; case TokenType::Leq: return "Leq"; case TokenType::Geq: return "Geq"; case TokenType::NotEqual: return "NotEqual"; case TokenType::Not: return "Not"; case TokenType::Star: return "Star"; case TokenType::Slash: return "Slash"; case TokenType::Percent: return "Percent"; case TokenType::Equal: return "Equal"; case TokenType::Semicolon: return "Semicolon"; case TokenType::LParenthese: return "LParenthese"; case TokenType::RParenthese: return "RParenthese"; case TokenType::LCurlyBracket: return "LCurlyBracket"; case TokenType::RCurlyBracket: return "RCurlyBracket"; case TokenType::If: return "If"; case TokenType::Else: return "Else"; case TokenType::While: return "While"; case TokenType::For: return "For"; default: return "Unknown"; } } void _debug_print_tokens(vector tokens) { for (Token token : tokens) { _debug_print_token(token); cout << " "; } cout << endl; } vector tokenize(vector input, int initial_line) { vector tokens; for (int i = initial_line; i < int(input.size()); i++) { string line = input[i]; int j = 0; while (j < int(line.length())) { CodePosition pos = { .line = i, .column = j }; string str = line.substr(j, string::npos); smatch m; if (regex_search(str, m, NUMBER_REGEX, regex_constants::match_continuous)) { Token token = { .type = TokenType::Int, .data = stoi(m.str()), .pos = pos }; tokens.emplace_back(token); j += m.str().length(); } else if (regex_search(str, m, TYPE_INT_REGEX, regex_constants::match_continuous)) { Token token = { .type = TokenType::Type, .data = Type::Int, .pos = pos }; tokens.emplace_back(token); j += m.str().length(); } else if (str.starts_with("if")) { Token token = { .type = TokenType::If, .pos = pos }; tokens.emplace_back(token); j += 2; } else if (str.starts_with("else")) { Token token = { .type = TokenType::Else, .pos = pos }; tokens.emplace_back(token); j += 4; } else if (str.starts_with("while")) { Token token = { .type = TokenType::While, .pos = pos }; tokens.emplace_back(token); j += 5; } else if (str.starts_with("for")) { Token token = { .type = TokenType::For, .pos = pos }; tokens.emplace_back(token); j += 3; } else if (str.starts_with("++")) { Token token = { .type = TokenType::DoublePlus, .pos = pos }; tokens.emplace_back(token); j += 2; } else if (str.starts_with("--")) { Token token = { .type = TokenType::DoubleMinus, .pos = pos }; tokens.emplace_back(token); j += 2; } else if (str.starts_with("==")) { Token token = { .type = TokenType::DoubleEqual, .pos = pos }; tokens.emplace_back(token); j += 2; } else if (str.starts_with("&&")) { Token token = { .type = TokenType::Land, .pos = pos }; tokens.emplace_back(token); j += 2; } else if (str.starts_with("||")) { Token token = { .type = TokenType::Lor, .pos = pos }; tokens.emplace_back(token); j += 2; } else if (str.starts_with("<=")) { Token token = { .type = TokenType::Leq, .pos = pos }; tokens.emplace_back(token); j += 2; } else if (str.starts_with(">=")) { Token token = { .type = TokenType::Geq, .pos = pos }; tokens.emplace_back(token); j += 2; } else if (str.starts_with("!=")) { Token token = { .type = TokenType::NotEqual, .pos = pos }; tokens.emplace_back(token); j += 2; } else if (str.starts_with("<")) { Token token = { .type = TokenType::Lt, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (str.starts_with(">")) { Token token = { .type = TokenType::Gt, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (str.starts_with("!")) { Token token = { .type = TokenType::Not, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (str.starts_with("+")) { Token token = { .type = TokenType::Plus, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (str.starts_with("-")) { Token token = { .type = TokenType::Minus, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (str.starts_with("*")) { Token token = { .type = TokenType::Star, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (str.starts_with("/")) { Token token = { .type = TokenType::Slash, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (str.starts_with("%")) { Token token = { .type = TokenType::Percent, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (str.starts_with("=")) { Token token = { .type = TokenType::Equal, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (str.starts_with(";")) { Token token = { .type = TokenType::Semicolon, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (str.starts_with("(")) { Token token = { .type = TokenType::LParenthese, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (str.starts_with(")")) { Token token = { .type = TokenType::RParenthese, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (str.starts_with("{")) { Token token = { .type = TokenType::LCurlyBracket, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (str.starts_with("}")) { Token token = { .type = TokenType::RCurlyBracket, .pos = pos }; tokens.emplace_back(token); j += 1; } else if (regex_search(str, m, IDENTIFIER_REGEX, regex_constants::match_continuous)) { Token token = { .type = TokenType::Identifier, .data = m.str(), .pos = pos }; tokens.emplace_back(token); j += m.str().length(); } else if (isspace(str[0])) { j += 1; } else { throw TokenError("Unknown token", pos); } } } return tokens; }