271 lines
8.2 KiB
C++
271 lines
8.2 KiB
C++
#include <cctype>
#include <iostream>
#include <regex>
#include <string>
#include <string_view>
#include <vector>

#include "include/errors.h"
#include "include/tokenize.h"
|
|
using namespace std;
|
|
|
|
// Patterns used by tokenize(); each one is applied anchored at the current
// scan position (regex_constants::match_continuous).

// One or more decimal digits.
std::regex INT_REGEX{"\\d+"};

// A decimal point with digits on at least one side: "1.", "1.5" or ".5".
std::regex DOUBLE_REGEX{"\\d+\\.\\d*|\\d*\\.\\d+"};

// A letter or underscore followed by any number of word characters.
std::regex IDENTIFIER_REGEX{"[A-Za-z_]\\w*"};
|
|
|
|
vector<tuple<string, TokenType>> simpleTokens = {
|
|
{ "if", TokenType::If },
|
|
{ "else", TokenType::Else },
|
|
{ "while", TokenType::While },
|
|
{ "for", TokenType::For },
|
|
{ "break", TokenType::Break },
|
|
{ "continue", TokenType::Continue },
|
|
{ "return", TokenType::Return },
|
|
{ "++", TokenType::DoublePlus },
|
|
{ "--", TokenType::DoubleMinus },
|
|
{ "==", TokenType::DoubleEqual },
|
|
{ "&&", TokenType::Land },
|
|
{ "||", TokenType::Lor },
|
|
{ "<=", TokenType::Leq },
|
|
{ ">=", TokenType::Geq },
|
|
{ "!=", TokenType::NotEqual },
|
|
{ "<", TokenType::Lt },
|
|
{ ">", TokenType::Gt },
|
|
{ "!", TokenType::Not },
|
|
{ "+", TokenType::Plus },
|
|
{ "-", TokenType::Minus },
|
|
{ "*", TokenType::Star },
|
|
{ "/", TokenType::Slash },
|
|
{ "%", TokenType::Percent },
|
|
{ "=", TokenType::Equal },
|
|
{ ";", TokenType::Semicolon },
|
|
{ "(", TokenType::LParenthese },
|
|
{ ")", TokenType::RParenthese },
|
|
{ "{", TokenType::LCurlyBracket },
|
|
{ "}", TokenType::RCurlyBracket },
|
|
{ ",", TokenType::Comma }
|
|
};
|
|
|
|
string _debug_get_token_type_name(TokenType type) {
|
|
switch (type) {
|
|
case TokenType::Identifier: return "Identifier";
|
|
case TokenType::Litteral: return "Litteral";
|
|
case TokenType::Plus: return "Plus";
|
|
case TokenType::Minus: return "Minus";
|
|
case TokenType::DoublePlus: return "DoublePlus";
|
|
case TokenType::DoubleMinus: return "DoubleMinus";
|
|
case TokenType::DoubleEqual: return "DoubleEqual";
|
|
case TokenType::Land: return "Land";
|
|
case TokenType::Lor: return "Lor";
|
|
case TokenType::Lt: return "Lt";
|
|
case TokenType::Gt: return "Gt";
|
|
case TokenType::Leq: return "Leq";
|
|
case TokenType::Geq: return "Geq";
|
|
case TokenType::NotEqual: return "NotEqual";
|
|
case TokenType::Not: return "Not";
|
|
case TokenType::Star: return "Star";
|
|
case TokenType::Slash: return "Slash";
|
|
case TokenType::Percent: return "Percent";
|
|
case TokenType::Equal: return "Equal";
|
|
case TokenType::Semicolon: return "Semicolon";
|
|
case TokenType::LParenthese: return "LParenthese";
|
|
case TokenType::RParenthese: return "RParenthese";
|
|
case TokenType::LCurlyBracket: return "LCurlyBracket";
|
|
case TokenType::RCurlyBracket: return "RCurlyBracket";
|
|
case TokenType::If: return "If";
|
|
case TokenType::Else: return "Else";
|
|
case TokenType::While: return "While";
|
|
case TokenType::For: return "For";
|
|
case TokenType::Break: return "Break";
|
|
case TokenType::Continue: return "Continue";
|
|
case TokenType::Return: return "Return";
|
|
case TokenType::Comma: return "Comma";
|
|
default: return "Unknown";
|
|
}
|
|
}
|
|
|
|
/**
 * Debug helper: writes a compact rendering of one token to std::cout.
 *
 * Literals print as "Litteral(<value>)" when the payload is an int or a
 * double (nothing is printed for any other payload), identifiers print as
 * "Identifier(<name>)", operators and punctuation print as their symbol, and
 * keywords and the comma print as their type name. No separator or newline
 * is appended. Token types not handled below produce no output.
 *
 * @param token Token to print.
 */
void _debug_print_token(Token token) {
    // Tokens carrying a payload are handled first.
    if (token.type == TokenType::Litteral) {
        if (const int* as_int = std::get_if<int>(&token.data)) {
            std::cout << "Litteral(" << *as_int << ")";
        } else if (const double* as_double = std::get_if<double>(&token.data)) {
            std::cout << "Litteral(" << *as_double << ")";
        }
        return;
    }
    if (token.type == TokenType::Identifier) {
        std::cout << "Identifier(" << std::get<std::string>(token.data) << ")";
        return;
    }

    // Every remaining token type has a fixed spelling.
    const char* text = nullptr;
    switch (token.type) {
        case TokenType::Plus:          text = "+";        break;
        case TokenType::Minus:         text = "-";        break;
        case TokenType::DoublePlus:    text = "++";       break;
        case TokenType::DoubleMinus:   text = "--";       break;
        case TokenType::DoubleEqual:   text = "==";       break;
        case TokenType::Land:          text = "&&";       break;
        case TokenType::Lor:           text = "||";       break;
        case TokenType::Lt:            text = "<";        break;
        case TokenType::Gt:            text = ">";        break;
        case TokenType::Leq:           text = "<=";       break;
        case TokenType::Geq:           text = ">=";       break;
        case TokenType::NotEqual:      text = "!=";       break;
        case TokenType::Not:           text = "!";        break;
        case TokenType::Star:          text = "*";        break;
        case TokenType::Slash:         text = "/";        break;
        case TokenType::Percent:       text = "%";        break;
        case TokenType::Equal:         text = "=";        break;
        case TokenType::Semicolon:     text = ";";        break;
        case TokenType::LParenthese:   text = "(";        break;
        case TokenType::RParenthese:   text = ")";        break;
        case TokenType::LCurlyBracket: text = "{";        break;
        case TokenType::RCurlyBracket: text = "}";        break;
        case TokenType::If:            text = "If";       break;
        case TokenType::Else:          text = "Else";     break;
        case TokenType::While:         text = "While";    break;
        case TokenType::For:           text = "For";      break;
        case TokenType::Break:         text = "Break";    break;
        case TokenType::Continue:      text = "Continue"; break;
        case TokenType::Return:        text = "Return";   break;
        case TokenType::Comma:         text = "Comma";    break;
        default:                                          break;
    }

    if (text != nullptr) {
        std::cout << text;
    }
}
|
|
|
|
void _debug_print_tokens(vector<Token> tokens) {
|
|
for (Token token : tokens) {
|
|
_debug_print_token(token);
|
|
cout << " ";
|
|
}
|
|
cout << endl;
|
|
}
|
|
|
|
vector<Token> tokenize(vector<string> input, int initial_line) {
|
|
vector<Token> tokens;
|
|
|
|
for (int i = initial_line; i < int(input.size()); i++) {
|
|
string line = input[i];
|
|
int j = 0;
|
|
|
|
while (j < int(line.length())) {
|
|
CodePosition pos = { .line = i, .column = j };
|
|
string str = line.substr(j, string::npos);
|
|
smatch m;
|
|
|
|
if (regex_search(str, m, DOUBLE_REGEX, regex_constants::match_continuous)) {
|
|
Token token = {
|
|
.type = TokenType::Litteral,
|
|
.data = stod(m.str()),
|
|
.pos = pos
|
|
};
|
|
tokens.emplace_back(token);
|
|
j += m.str().length();
|
|
continue;
|
|
}
|
|
|
|
if (regex_search(str, m, INT_REGEX, regex_constants::match_continuous)) {
|
|
int val;
|
|
try { val = stoi(m.str()); }
|
|
catch (const out_of_range& e) {
|
|
throw SyntaxError(ErrorType::IntegerTooLarge, pos);
|
|
}
|
|
|
|
Token token = {
|
|
.type = TokenType::Litteral,
|
|
.data = val,
|
|
.pos = pos
|
|
};
|
|
tokens.emplace_back(token);
|
|
j += m.str().length();
|
|
continue;
|
|
}
|
|
|
|
bool matched = false;
|
|
for (auto simpleToken: simpleTokens) {
|
|
if (str.starts_with(get<0>(simpleToken))) {
|
|
Token token = { .type = get<1>(simpleToken), .pos = pos };
|
|
tokens.emplace_back(token);
|
|
j += get<0>(simpleToken).length();
|
|
matched = true;
|
|
break;
|
|
}
|
|
}
|
|
if (matched) continue;
|
|
|
|
if (regex_search(str, m, IDENTIFIER_REGEX, regex_constants::match_continuous)) {
|
|
Token token = {
|
|
.type = TokenType::Identifier,
|
|
.data = m.str(),
|
|
.pos = pos
|
|
};
|
|
tokens.emplace_back(token);
|
|
j += m.str().length();
|
|
continue;
|
|
}
|
|
|
|
if (isspace(str[0])) {
|
|
j += 1;
|
|
continue;
|
|
}
|
|
|
|
throw SyntaxError(ErrorType::UnknownToken, pos);
|
|
}
|
|
}
|
|
|
|
return tokens;
|
|
} |