c-repl/src/tokenize.cpp

239 lines
7.1 KiB
C++
Raw Normal View History

2023-10-27 16:56:54 +02:00
#include <regex>
#include <vector>
#include <iostream>
2023-11-15 14:31:11 +01:00
#include <string>
2023-10-27 16:56:54 +02:00
#include "include/tokenize.h"
using namespace std;
2023-12-08 09:04:05 +01:00
// Integer literal: a run of one or more decimal digits.
regex INT_REGEX{"\\d+"};

// Floating-point literal: digits around a '.', with at least one digit on
// either side of it ("1.", "1.5" and ".5" match; "1" and "." do not).
regex DOUBLE_REGEX{"\\d+\\.\\d*|\\d*\\.\\d+"};
2023-10-27 16:56:54 +02:00
// Identifier: a letter or underscore followed by any number of word
// characters (letters, digits, underscore).
regex IDENTIFIER_REGEX{"[A-Za-z_]\\w*"};
2023-12-08 09:04:05 +01:00
// Fixed-spelling tokens: each entry pairs the exact source text with the
// TokenType it produces. These tokens carry no payload.
//
// ORDER MATTERS: tokenize() walks this table from top to bottom and uses the
// first entry that matches, so every multi-character operator must appear
// before any shorter operator that is a prefix of it ("++" before "+",
// "==" before "=", "<=" before "<", "!=" before "!", ...).
vector<tuple<string, TokenType>> simpleTokens = {
// Keywords
{ "if", TokenType::If },
{ "else", TokenType::Else },
{ "while", TokenType::While },
{ "for", TokenType::For },
// Two-character operators (must precede their one-character prefixes)
{ "++", TokenType::DoublePlus },
{ "--", TokenType::DoubleMinus },
{ "==", TokenType::DoubleEqual },
{ "&&", TokenType::Land },
{ "||", TokenType::Lor },
{ "<=", TokenType::Leq },
{ ">=", TokenType::Geq },
{ "!=", TokenType::NotEqual },
// One-character operators
{ "<", TokenType::Lt },
{ ">", TokenType::Gt },
{ "!", TokenType::Not },
{ "+", TokenType::Plus },
{ "-", TokenType::Minus },
{ "*", TokenType::Star },
{ "/", TokenType::Slash },
{ "%", TokenType::Percent },
{ "=", TokenType::Equal },
// Punctuation
{ ";", TokenType::Semicolon },
{ "(", TokenType::LParenthese },
{ ")", TokenType::RParenthese },
{ "{", TokenType::LCurlyBracket },
{ "}", TokenType::RCurlyBracket }
};
2023-11-10 19:04:24 +01:00
/**
 * Debug helper: writes a short rendering of one token to standard output.
 *
 * - Litteral tokens print as "Litteral(tmp)"; "tmp" is a fixed placeholder,
 *   the literal's actual value is not printed.
 * - Identifier tokens print as "Identifier(<name>)", where the name is the
 *   string stored in token.data.
 * - Every other handled token type prints its source spelling (operators and
 *   punctuation) or its capitalised keyword name.
 * - A token type that is not handled prints nothing.
 *
 * No separator or newline is written.
 *
 * @param token  the token to print
 */
void _debug_print_token(Token token) {
    const TokenType kind = token.type;

    // Tokens with a payload get their own formatting.
    if (kind == TokenType::Litteral) {
        cout << "Litteral(" << "tmp" << ")";
        return;
    }
    if (kind == TokenType::Identifier) {
        cout << "Identifier(" << get<string>(token.data) << ")";
        return;
    }

    // Everything else has a fixed spelling; stays null for unhandled types.
    const char* spelling = nullptr;
    switch (kind) {
        case TokenType::Plus:          spelling = "+";     break;
        case TokenType::Minus:         spelling = "-";     break;
        case TokenType::DoublePlus:    spelling = "++";    break;
        case TokenType::DoubleMinus:   spelling = "--";    break;
        case TokenType::DoubleEqual:   spelling = "==";    break;
        case TokenType::Land:          spelling = "&&";    break;
        case TokenType::Lor:           spelling = "||";    break;
        case TokenType::Lt:            spelling = "<";     break;
        case TokenType::Gt:            spelling = ">";     break;
        case TokenType::Leq:           spelling = "<=";    break;
        case TokenType::Geq:           spelling = ">=";    break;
        case TokenType::NotEqual:      spelling = "!=";    break;
        case TokenType::Not:           spelling = "!";     break;
        case TokenType::Star:          spelling = "*";     break;
        case TokenType::Slash:         spelling = "/";     break;
        case TokenType::Percent:       spelling = "%";     break;
        case TokenType::Equal:         spelling = "=";     break;
        case TokenType::Semicolon:     spelling = ";";     break;
        case TokenType::LParenthese:   spelling = "(";     break;
        case TokenType::RParenthese:   spelling = ")";     break;
        case TokenType::LCurlyBracket: spelling = "{";     break;
        case TokenType::RCurlyBracket: spelling = "}";     break;
        case TokenType::If:            spelling = "If";    break;
        case TokenType::Else:          spelling = "Else";  break;
        case TokenType::While:         spelling = "While"; break;
        case TokenType::For:           spelling = "For";   break;
        default:                                           break;
    }

    if (spelling != nullptr) {
        cout << spelling;
    }
}
2023-11-24 10:21:58 +01:00
/**
 * Debug helper: returns the enumerator name of a token type as text.
 *
 * @param type  the token type to name
 * @return the enumerator's name (e.g. "Plus", "LParenthese"), or "Unknown"
 *         when the value is not one of the types listed below
 */
string _debug_print_token_type(TokenType type) {
    struct NamedType {
        TokenType type;
        const char* name;
    };

    static const NamedType known_types[] = {
        { TokenType::Identifier,    "Identifier" },
        { TokenType::Litteral,      "Litteral" },
        { TokenType::Plus,          "Plus" },
        { TokenType::Minus,         "Minus" },
        { TokenType::DoublePlus,    "DoublePlus" },
        { TokenType::DoubleMinus,   "DoubleMinus" },
        { TokenType::DoubleEqual,   "DoubleEqual" },
        { TokenType::Land,          "Land" },
        { TokenType::Lor,           "Lor" },
        { TokenType::Lt,            "Lt" },
        { TokenType::Gt,            "Gt" },
        { TokenType::Leq,           "Leq" },
        { TokenType::Geq,           "Geq" },
        { TokenType::NotEqual,      "NotEqual" },
        { TokenType::Not,           "Not" },
        { TokenType::Star,          "Star" },
        { TokenType::Slash,         "Slash" },
        { TokenType::Percent,       "Percent" },
        { TokenType::Equal,         "Equal" },
        { TokenType::Semicolon,     "Semicolon" },
        { TokenType::LParenthese,   "LParenthese" },
        { TokenType::RParenthese,   "RParenthese" },
        { TokenType::LCurlyBracket, "LCurlyBracket" },
        { TokenType::RCurlyBracket, "RCurlyBracket" },
        { TokenType::If,            "If" },
        { TokenType::Else,          "Else" },
        { TokenType::While,         "While" },
        { TokenType::For,           "For" },
    };

    for (const NamedType& entry : known_types) {
        if (entry.type == type) {
            return entry.name;
        }
    }
    return "Unknown";
}
2023-11-10 17:35:33 +01:00
void _debug_print_tokens(vector<Token> tokens) {
2023-10-27 16:56:54 +02:00
for (Token token : tokens) {
2023-11-10 19:04:24 +01:00
_debug_print_token(token);
2023-10-27 16:56:54 +02:00
cout << " ";
}
cout << endl;
}
2023-11-15 16:07:50 +01:00
vector<Token> tokenize(vector<string> input, int initial_line) {
2023-10-27 16:56:54 +02:00
vector<Token> tokens;
2023-11-15 16:07:50 +01:00
for (int i = initial_line; i < int(input.size()); i++) {
2023-11-15 14:31:11 +01:00
string line = input[i];
int j = 0;
while (j < int(line.length())) {
2023-11-15 15:23:33 +01:00
CodePosition pos = { .line = i, .column = j };
2023-11-15 14:31:11 +01:00
string str = line.substr(j, string::npos);
smatch m;
2023-12-08 09:04:05 +01:00
if (regex_search(str, m, DOUBLE_REGEX, regex_constants::match_continuous)) {
2023-11-15 14:31:11 +01:00
Token token = {
2023-12-08 09:04:05 +01:00
.type = TokenType::Litteral,
.data = stod(m.str()),
2023-11-15 14:31:11 +01:00
.pos = pos
};
tokens.emplace_back(token);
j += m.str().length();
2023-12-08 09:04:05 +01:00
continue;
2023-11-15 14:31:11 +01:00
}
2023-12-08 09:04:05 +01:00
if (regex_search(str, m, INT_REGEX, regex_constants::match_continuous)) {
2023-11-15 14:31:11 +01:00
Token token = {
2023-12-08 09:04:05 +01:00
.type = TokenType::Litteral,
.data = stoi(m.str()),
2023-11-15 14:31:11 +01:00
.pos = pos
};
tokens.emplace_back(token);
j += m.str().length();
2023-12-08 09:04:05 +01:00
continue;
}
bool matched = false;
for (auto simpleToken: simpleTokens) {
if (str.starts_with(get<0>(simpleToken))) {
Token token = { .type = get<1>(simpleToken), .pos = pos };
tokens.emplace_back(token);
j += get<0>(simpleToken).length();
matched = true;
break;
}
}
if (matched) continue;
if (regex_search(str, m, IDENTIFIER_REGEX, regex_constants::match_continuous)) {
2023-11-22 15:31:30 +01:00
Token token = {
.type = TokenType::Identifier,
.data = m.str(),
.pos = pos
};
tokens.emplace_back(token);
j += m.str().length();
2023-12-08 09:04:05 +01:00
continue;
2023-11-22 15:31:30 +01:00
}
2023-12-08 09:04:05 +01:00
if (isspace(str[0])) {
2023-11-15 14:31:11 +01:00
j += 1;
2023-12-08 09:04:05 +01:00
continue;
2023-11-15 14:31:11 +01:00
}
2023-12-08 09:04:05 +01:00
throw TokenError("Unknown token", pos);
2023-10-27 16:56:54 +02:00
}
}
return tokens;
}