2023-10-27 16:56:54 +02:00
|
|
|
#include <cctype>
#include <iostream>
#include <regex>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>

#include "include/tokenize.h"
|
|
|
|
using namespace std;
|
|
|
|
|
|
|
|
// Numeric literal: one or more digits, optionally followed by a '.' and more digits.
regex NUMBER_REGEX(R"re(\d+(\.\d+)?)re");

// The `int` type keyword. The single whitespace character that follows it is
// part of the match.
regex TYPE_INT_REGEX(R"re(int\s)re");

// Identifier: a letter or underscore followed by any number of word characters.
regex IDENTIFIER_REGEX(R"re([A-Za-z_]\w*)re");
|
|
|
|
|
2023-11-10 19:04:24 +01:00
|
|
|
/**
 * Writes a short human-readable rendering of one token to standard output.
 *
 * Integer and identifier tokens print their payload; every other handled
 * token type prints a fixed spelling. A token type that is not listed below
 * prints nothing. No separator or newline is written.
 *
 * @param token The token to print.
 */
void _debug_print_token(Token token) {
    // Tokens that carry a payload are printed directly and return early.
    if (token.type == TokenType::Int) {
        cout << "Number(" << get<int>(token.data) << ")";
        return;
    }
    if (token.type == TokenType::Identifier) {
        cout << "Identifier(" << get<string>(token.data) << ")";
        return;
    }

    // Everything else has a fixed spelling.
    const char* label = nullptr;
    switch (token.type) {
        case TokenType::Type:          label = "Type(INT)"; break;
        case TokenType::Plus:          label = "+";         break;
        case TokenType::Minus:         label = "-";         break;
        case TokenType::DoublePlus:    label = "++";        break;
        case TokenType::DoubleMinus:   label = "--";        break;
        case TokenType::DoubleEqual:   label = "==";        break;
        case TokenType::Star:          label = "*";         break;
        case TokenType::Slash:         label = "/";         break;
        case TokenType::Percent:       label = "%";         break;
        case TokenType::Equal:         label = "=";         break;
        case TokenType::Semicolon:     label = ";";         break;
        case TokenType::LParenthese:   label = "(";         break;
        case TokenType::RParenthese:   label = ")";         break;
        case TokenType::LCurlyBracket: label = "{";         break;
        case TokenType::RCurlyBracket: label = "}";         break;
        case TokenType::If:            label = "If";        break;
        case TokenType::Else:          label = "Else";      break;
    }

    if (label != nullptr) {
        cout << label;
    }
}
|
|
|
|
|
2023-11-10 17:35:33 +01:00
|
|
|
void _debug_print_tokens(vector<Token> tokens) {
|
2023-10-27 16:56:54 +02:00
|
|
|
for (Token token : tokens) {
|
2023-11-10 19:04:24 +01:00
|
|
|
_debug_print_token(token);
|
2023-10-27 16:56:54 +02:00
|
|
|
cout << " ";
|
|
|
|
}
|
|
|
|
cout << endl;
|
|
|
|
}
|
|
|
|
|
2023-11-15 16:07:50 +01:00
|
|
|
vector<Token> tokenize(vector<string> input, int initial_line) {
|
2023-10-27 16:56:54 +02:00
|
|
|
vector<Token> tokens;
|
|
|
|
|
2023-11-15 16:07:50 +01:00
|
|
|
for (int i = initial_line; i < int(input.size()); i++) {
|
2023-11-15 14:31:11 +01:00
|
|
|
string line = input[i];
|
|
|
|
int j = 0;
|
|
|
|
|
|
|
|
while (j < int(line.length())) {
|
2023-11-15 15:23:33 +01:00
|
|
|
CodePosition pos = { .line = i, .column = j };
|
2023-11-15 14:31:11 +01:00
|
|
|
string str = line.substr(j, string::npos);
|
|
|
|
smatch m;
|
|
|
|
|
|
|
|
if (regex_search(str, m, NUMBER_REGEX, regex_constants::match_continuous)) {
|
|
|
|
Token token = {
|
|
|
|
.type = TokenType::Int,
|
|
|
|
.data = stoi(m.str()),
|
|
|
|
.pos = pos
|
|
|
|
};
|
|
|
|
tokens.emplace_back(token);
|
|
|
|
j += m.str().length();
|
|
|
|
}
|
|
|
|
else if (regex_search(str, m, TYPE_INT_REGEX, regex_constants::match_continuous)) {
|
|
|
|
Token token = {
|
|
|
|
.type = TokenType::Type,
|
|
|
|
.data = Type::Int,
|
|
|
|
.pos = pos
|
|
|
|
};
|
|
|
|
tokens.emplace_back(token);
|
|
|
|
j += m.str().length();
|
|
|
|
}
|
|
|
|
else if (regex_search(str, m, IDENTIFIER_REGEX, regex_constants::match_continuous)) {
|
|
|
|
Token token = {
|
|
|
|
.type = TokenType::Identifier,
|
|
|
|
.data = m.str(),
|
|
|
|
.pos = pos
|
|
|
|
};
|
|
|
|
tokens.emplace_back(token);
|
|
|
|
j += m.str().length();
|
|
|
|
}
|
2023-11-15 17:15:18 +01:00
|
|
|
else if (str.starts_with("++")) {
|
2023-11-15 14:31:11 +01:00
|
|
|
Token token = { .type = TokenType::DoublePlus, .pos = pos };
|
|
|
|
tokens.emplace_back(token);
|
|
|
|
j += 2;
|
|
|
|
}
|
2023-11-15 17:15:18 +01:00
|
|
|
else if (str.starts_with("--")) {
|
2023-11-15 14:31:11 +01:00
|
|
|
Token token = { .type = TokenType::DoubleMinus, .pos = pos };
|
|
|
|
tokens.emplace_back(token);
|
|
|
|
j += 2;
|
|
|
|
}
|
2023-11-15 17:15:18 +01:00
|
|
|
else if (str.starts_with("==")) {
|
|
|
|
Token token = { .type = TokenType::DoubleEqual, .pos = pos };
|
|
|
|
tokens.emplace_back(token);
|
|
|
|
j += 2;
|
|
|
|
}
|
|
|
|
else if (str.starts_with("+")) {
|
2023-11-15 14:31:11 +01:00
|
|
|
Token token = { .type = TokenType::Plus, .pos = pos };
|
|
|
|
tokens.emplace_back(token);
|
|
|
|
j += 1;
|
|
|
|
}
|
2023-11-15 17:15:18 +01:00
|
|
|
else if (str.starts_with("-")) {
|
2023-11-15 14:31:11 +01:00
|
|
|
Token token = { .type = TokenType::Minus, .pos = pos };
|
|
|
|
tokens.emplace_back(token);
|
|
|
|
j += 1;
|
|
|
|
}
|
2023-11-15 17:15:18 +01:00
|
|
|
else if (str.starts_with("*")) {
|
2023-11-15 14:31:11 +01:00
|
|
|
Token token = { .type = TokenType::Star, .pos = pos };
|
|
|
|
tokens.emplace_back(token);
|
|
|
|
j += 1;
|
|
|
|
}
|
2023-11-15 17:15:18 +01:00
|
|
|
else if (str.starts_with("/")) {
|
2023-11-15 14:31:11 +01:00
|
|
|
Token token = { .type = TokenType::Slash, .pos = pos };
|
|
|
|
tokens.emplace_back(token);
|
|
|
|
j += 1;
|
|
|
|
}
|
2023-11-15 17:15:18 +01:00
|
|
|
else if (str.starts_with("%")) {
|
2023-11-15 14:31:11 +01:00
|
|
|
Token token = { .type = TokenType::Percent, .pos = pos };
|
|
|
|
tokens.emplace_back(token);
|
|
|
|
j += 1;
|
|
|
|
}
|
2023-11-15 17:15:18 +01:00
|
|
|
else if (str.starts_with("=")) {
|
2023-11-15 14:31:11 +01:00
|
|
|
Token token = { .type = TokenType::Equal, .pos = pos };
|
|
|
|
tokens.emplace_back(token);
|
|
|
|
j += 1;
|
|
|
|
}
|
2023-11-15 17:15:18 +01:00
|
|
|
else if (str.starts_with(";")) {
|
2023-11-15 14:31:11 +01:00
|
|
|
Token token = { .type = TokenType::Semicolon, .pos = pos };
|
|
|
|
tokens.emplace_back(token);
|
|
|
|
j += 1;
|
|
|
|
}
|
2023-11-15 17:15:18 +01:00
|
|
|
else if (str.starts_with("(")) {
|
2023-11-15 14:31:11 +01:00
|
|
|
Token token = { .type = TokenType::LParenthese, .pos = pos };
|
|
|
|
tokens.emplace_back(token);
|
|
|
|
j += 1;
|
|
|
|
}
|
2023-11-15 17:15:18 +01:00
|
|
|
else if (str.starts_with(")")) {
|
2023-11-15 14:31:11 +01:00
|
|
|
Token token = { .type = TokenType::RParenthese, .pos = pos };
|
|
|
|
tokens.emplace_back(token);
|
|
|
|
j += 1;
|
|
|
|
}
|
2023-11-15 17:15:18 +01:00
|
|
|
else if (str.starts_with("{")) {
|
|
|
|
Token token = { .type = TokenType::LCurlyBracket, .pos = pos };
|
|
|
|
tokens.emplace_back(token);
|
|
|
|
j += 1;
|
|
|
|
}
|
|
|
|
else if (str.starts_with("}")) {
|
|
|
|
Token token = { .type = TokenType::RCurlyBracket, .pos = pos };
|
|
|
|
tokens.emplace_back(token);
|
|
|
|
j += 1;
|
|
|
|
}
|
|
|
|
else if (str.starts_with("if")) {
|
|
|
|
Token token = { .type = TokenType::If, .pos = pos };
|
|
|
|
tokens.emplace_back(token);
|
|
|
|
j += 2;
|
|
|
|
}
|
|
|
|
else if (str.starts_with("else")) {
|
|
|
|
Token token = { .type = TokenType::Else, .pos = pos };
|
|
|
|
tokens.emplace_back(token);
|
|
|
|
j += 4;
|
|
|
|
}
|
|
|
|
else if (isspace(str[0])) {
|
2023-11-15 14:31:11 +01:00
|
|
|
j += 1;
|
|
|
|
}
|
|
|
|
else {
|
2023-11-15 17:15:18 +01:00
|
|
|
throw TokenError("Unknown token", pos);
|
2023-11-15 14:31:11 +01:00
|
|
|
}
|
2023-10-27 16:56:54 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return tokens;
|
|
|
|
}
|