#include #include #include #include #include "include/errors.h" #include "include/tokenize.h" using namespace std; regex INT_REGEX ("\\d+"); regex DOUBLE_REGEX ("\\d+\\.\\d*|\\d*\\.\\d+"); regex IDENTIFIER_REGEX ("[A-Za-z_]\\w*"); vector> simpleTokens = { { "if", TokenType::If }, { "else", TokenType::Else }, { "while", TokenType::While }, { "for", TokenType::For }, { "break", TokenType::Break }, { "continue", TokenType::Continue }, { "return", TokenType::Return }, { "++", TokenType::DoublePlus }, { "--", TokenType::DoubleMinus }, { "==", TokenType::DoubleEqual }, { "&&", TokenType::Land }, { "||", TokenType::Lor }, { "<=", TokenType::Leq }, { ">=", TokenType::Geq }, { "!=", TokenType::NotEqual }, { "<", TokenType::Lt }, { ">", TokenType::Gt }, { "!", TokenType::Not }, { "+", TokenType::Plus }, { "-", TokenType::Minus }, { "*", TokenType::Star }, { "/", TokenType::Slash }, { "%", TokenType::Percent }, { "=", TokenType::Equal }, { ";", TokenType::Semicolon }, { "(", TokenType::LParenthese }, { ")", TokenType::RParenthese }, { "{", TokenType::LCurlyBracket }, { "}", TokenType::RCurlyBracket }, { ",", TokenType::Comma } }; string _debug_get_token_type_name(TokenType type) { switch (type) { case TokenType::Identifier: return "Identifier"; case TokenType::Litteral: return "Litteral"; case TokenType::Plus: return "Plus"; case TokenType::Minus: return "Minus"; case TokenType::DoublePlus: return "DoublePlus"; case TokenType::DoubleMinus: return "DoubleMinus"; case TokenType::DoubleEqual: return "DoubleEqual"; case TokenType::Land: return "Land"; case TokenType::Lor: return "Lor"; case TokenType::Lt: return "Lt"; case TokenType::Gt: return "Gt"; case TokenType::Leq: return "Leq"; case TokenType::Geq: return "Geq"; case TokenType::NotEqual: return "NotEqual"; case TokenType::Not: return "Not"; case TokenType::Star: return "Star"; case TokenType::Slash: return "Slash"; case TokenType::Percent: return "Percent"; case TokenType::Equal: return "Equal"; case TokenType::Semicolon: return "Semicolon"; case TokenType::LParenthese: return "LParenthese"; case TokenType::RParenthese: return "RParenthese"; case TokenType::LCurlyBracket: return "LCurlyBracket"; case TokenType::RCurlyBracket: return "RCurlyBracket"; case TokenType::If: return "If"; case TokenType::Else: return "Else"; case TokenType::While: return "While"; case TokenType::For: return "For"; case TokenType::Break: return "Break"; case TokenType::Continue: return "Continue"; case TokenType::Return: return "Return"; case TokenType::Comma: return "Comma"; default: return "Unknown"; } } void _debug_print_token(Token token) { switch (token.type) { case TokenType::Litteral: if (holds_alternative(token.data)) { cout << "Litteral(" << get(token.data) << ")"; } else if (holds_alternative(token.data)) { cout << "Litteral(" << get(token.data) << ")"; } break; case TokenType::Identifier: cout << "Identifier(" << get(token.data) << ")"; break; case TokenType::Plus: cout << "+"; break; case TokenType::Minus: cout << "-"; break; case TokenType::DoublePlus: cout << "++"; break; case TokenType::DoubleMinus: cout << "--"; break; case TokenType::DoubleEqual: cout << "=="; break; case TokenType::Land: cout << "&&"; break; case TokenType::Lor: cout << "||"; break; case TokenType::Lt: cout << "<"; break; case TokenType::Gt: cout << ">"; break; case TokenType::Leq: cout << "<="; break; case TokenType::Geq: cout << ">="; break; case TokenType::NotEqual: cout << "!="; break; case TokenType::Not: cout << "!"; break; case TokenType::Star: cout << "*"; break; case TokenType::Slash: cout << "/"; break; case TokenType::Percent: cout << "%"; break; case TokenType::Equal: cout << "="; break; case TokenType::Semicolon: cout << ";"; break; case TokenType::LParenthese: cout << "("; break; case TokenType::RParenthese: cout << ")"; break; case TokenType::LCurlyBracket: cout << "{"; break; case TokenType::RCurlyBracket: cout << "}"; break; case TokenType::If: cout << "If"; break; case TokenType::Else: cout << "Else"; break; case TokenType::While: cout << "While"; break; case TokenType::For: cout << "For"; break; case TokenType::Break: cout << "Break"; break; case TokenType::Continue: cout << "Continue"; break; case TokenType::Return: cout << "Return"; break; case TokenType::Comma: cout << "Comma"; break; } } void _debug_print_tokens(vector tokens) { for (Token token : tokens) { _debug_print_token(token); cout << " "; } cout << endl; } vector tokenize(vector input, int initial_line) { vector tokens; for (int i = initial_line; i < int(input.size()); i++) { string line = input[i]; int j = 0; while (j < int(line.length())) { CodePosition pos = { .line = i, .column = j }; string str = line.substr(j, string::npos); smatch m; if (regex_search(str, m, DOUBLE_REGEX, regex_constants::match_continuous)) { Token token = { .type = TokenType::Litteral, .data = stod(m.str()), .pos = pos }; tokens.emplace_back(token); j += m.str().length(); continue; } if (regex_search(str, m, INT_REGEX, regex_constants::match_continuous)) { int val; try { val = stoi(m.str()); } catch (const out_of_range& e) { throw SyntaxError(ErrorType::IntegerTooLarge, pos); } Token token = { .type = TokenType::Litteral, .data = val, .pos = pos }; tokens.emplace_back(token); j += m.str().length(); continue; } bool matched = false; for (auto simpleToken: simpleTokens) { if (str.starts_with(get<0>(simpleToken))) { Token token = { .type = get<1>(simpleToken), .pos = pos }; tokens.emplace_back(token); j += get<0>(simpleToken).length(); matched = true; break; } } if (matched) continue; if (regex_search(str, m, IDENTIFIER_REGEX, regex_constants::match_continuous)) { Token token = { .type = TokenType::Identifier, .data = m.str(), .pos = pos }; tokens.emplace_back(token); j += m.str().length(); continue; } if (isspace(str[0])) { j += 1; continue; } throw SyntaxError(ErrorType::UnknownToken, pos); } } return tokens; }