c-repl/src/tokenize.cpp

171 lines
5.3 KiB
C++
Raw Normal View History

2023-10-27 16:56:54 +02:00
#include <regex>
#include <vector>
#include <iostream>
2023-11-15 14:31:11 +01:00
#include <string>
2023-10-27 16:56:54 +02:00
#include "include/tokenize.h"
using namespace std;
// Token patterns. tokenize() tries them in this order (number, type keyword,
// identifier) with regex_constants::match_continuous, so each pattern only
// ever matches at the current scan position.

// Numeric literal: one or more digits with an optional fractional part.
// NOTE(review): tokenize() converts the match with stoi(), which stops at the
// '.', so a fractional part is matched but discarded - confirm this is intended.
std::regex NUMBER_REGEX ("\\d+(\\.\\d+)?");

// The `int` type keyword. The trailing word boundary accepts the keyword at
// end of line or directly before punctuation (e.g. the cast `(int)x`) while
// still rejecting identifiers that merely start with "int" ("integer",
// "int_x"). The previous pattern required a whitespace character after the
// keyword, so those inputs were tokenized as plain identifiers.
std::regex TYPE_INT_REGEX ("int\\b");

// Identifier: a letter or underscore followed by any number of word
// characters (letters, digits, underscore).
std::regex IDENTIFIER_REGEX ("[A-Za-z_]\\w*");
2023-11-10 19:04:24 +01:00
// Writes a human-readable rendering of a single token to standard output.
// No separator or newline is appended; the caller decides how tokens are
// laid out. Token types that are not handled below produce no output.
void _debug_print_token(Token token) {
    // Fixed text for tokens that carry no payload worth showing; stays null
    // for token types this function does not know about.
    const char* text = nullptr;

    switch (token.type) {
        // Tokens with a payload are printed immediately.
        case TokenType::Int:
            cout << "Number(" << get<int>(token.data) << ")";
            return;
        case TokenType::Identifier:
            cout << "Identifier(" << get<string>(token.data) << ")";
            return;

        // Type tokens always print as INT, whatever token.data holds.
        case TokenType::Type:        text = "Type(INT)"; break;

        // Operators and punctuation print as their source spelling.
        case TokenType::Plus:        text = "+";  break;
        case TokenType::Minus:       text = "-";  break;
        case TokenType::DoublePlus:  text = "++"; break;
        case TokenType::DoubleMinus: text = "--"; break;
        case TokenType::Star:        text = "*";  break;
        case TokenType::Slash:       text = "/";  break;
        case TokenType::Percent:     text = "%";  break;
        case TokenType::Equal:       text = "=";  break;
        case TokenType::Semicolon:   text = ";";  break;
        case TokenType::LParenthese: text = "(";  break;
        case TokenType::RParenthese: text = ")";  break;
    }

    if (text != nullptr) {
        cout << text;
    }
}
2023-11-10 17:35:33 +01:00
// Prints every token in `tokens` to standard output on one line, each token
// followed by a single space, then ends the line and flushes the stream.
void _debug_print_tokens(vector<Token> tokens) {
    // Bind by const reference: the loop only reads the tokens, so copying
    // each one into the loop variable (as the by-value form did) is wasted work.
    for (const Token& token : tokens) {
        _debug_print_token(token);
        cout << " ";
    }
    cout << endl;
}
2023-11-15 14:31:11 +01:00
vector<Token> tokenize(vector<string> input) {
2023-10-27 16:56:54 +02:00
vector<Token> tokens;
2023-11-15 14:31:11 +01:00
for (int i = 0; i < int(input.size()); i++) {
string line = input[i];
int j = 0;
while (j < int(line.length())) {
2023-11-15 15:23:33 +01:00
CodePosition pos = { .line = i, .column = j };
2023-11-15 14:31:11 +01:00
string str = line.substr(j, string::npos);
smatch m;
if (regex_search(str, m, NUMBER_REGEX, regex_constants::match_continuous)) {
Token token = {
.type = TokenType::Int,
.data = stoi(m.str()),
.pos = pos
};
tokens.emplace_back(token);
j += m.str().length();
}
else if (regex_search(str, m, TYPE_INT_REGEX, regex_constants::match_continuous)) {
Token token = {
.type = TokenType::Type,
.data = Type::Int,
.pos = pos
};
tokens.emplace_back(token);
j += m.str().length();
}
else if (regex_search(str, m, IDENTIFIER_REGEX, regex_constants::match_continuous)) {
Token token = {
.type = TokenType::Identifier,
.data = m.str(),
.pos = pos
};
tokens.emplace_back(token);
j += m.str().length();
}
else if (str.length() >= 2 && str[0] == '+' && str[1] == '+') {
Token token = { .type = TokenType::DoublePlus, .pos = pos };
tokens.emplace_back(token);
j += 2;
}
else if (str.length() >= 2 && str[0] == '-' && str[1] == '-') {
Token token = { .type = TokenType::DoubleMinus, .pos = pos };
tokens.emplace_back(token);
j += 2;
}
else if (str[0] == '+') {
Token token = { .type = TokenType::Plus, .pos = pos };
tokens.emplace_back(token);
j += 1;
}
else if (str[0] == '-') {
Token token = { .type = TokenType::Minus, .pos = pos };
tokens.emplace_back(token);
j += 1;
}
else if (str[0] == '*') {
Token token = { .type = TokenType::Star, .pos = pos };
tokens.emplace_back(token);
j += 1;
}
else if (str[0] == '/') {
Token token = { .type = TokenType::Slash, .pos = pos };
tokens.emplace_back(token);
j += 1;
}
else if (str[0] == '%') {
Token token = { .type = TokenType::Percent, .pos = pos };
tokens.emplace_back(token);
j += 1;
}
else if (str[0] == '=') {
Token token = { .type = TokenType::Equal, .pos = pos };
tokens.emplace_back(token);
j += 1;
}
else if (str[0] == ';') {
Token token = { .type = TokenType::Semicolon, .pos = pos };
tokens.emplace_back(token);
j += 1;
}
else if (str[0] == '(') {
Token token = { .type = TokenType::LParenthese, .pos = pos };
tokens.emplace_back(token);
j += 1;
}
else if (str[0] == ')') {
Token token = { .type = TokenType::RParenthese, .pos = pos };
tokens.emplace_back(token);
j += 1;
}
else if (isspace(str[0]) || str[0] == '\0') {
j += 1;
}
else {
2023-11-15 15:42:30 +01:00
throw TokenError("Unknown token {}", pos);
2023-11-15 14:31:11 +01:00
}
2023-10-27 16:56:54 +02:00
}
}
return tokens;
}