Add tokenizer

This commit is contained in:
ala89 2023-10-27 16:56:54 +02:00
parent bc7c075f9a
commit f04271df19
3 changed files with 147 additions and 2 deletions

3
.gitignore vendored
View File

@ -1,3 +1,4 @@
*.o *.o
build/* build/*
.vscode .vscode
test

View File

@ -4,12 +4,13 @@
#include <vector> #include <vector>
using namespace std; using namespace std;
enum class TokenType { Type, Number, Plus, Minus, Star, Slash, Percent, Equal, Semicolon, LParenthese, RParenthese }; enum class TokenType { Type, Identifier, Number, Plus, Minus, Star, Slash, Percent, Equal, Semicolon, LParenthese, RParenthese };
enum class Type { Int }; enum class Type { Int };
union TokenData { union TokenData {
double number; double number;
Type type; Type type;
char* identifier;
}; };
struct Token { struct Token {

143
src/tokenize.cpp Normal file
View File

@ -0,0 +1,143 @@
#include <regex>
#include <vector>
#include <iostream>
#include <bits/stdc++.h>
#include "include/tokenize.h"
using namespace std;
// Token patterns. tokenize() applies each of them with
// regex_constants::match_continuous, so a pattern only matches when it
// starts exactly at the current scan position.

// Unsigned decimal literal with an optional fractional part: "42", "3.14".
std::regex NUMBER_REGEX ("\\d+(\\.\\d+)?");
// The `int` type keyword. The trailing word boundary keeps "integer" or
// "int_x" from matching while still accepting "int" followed by anything
// that is not a word character (whitespace, ')', ';', end of input, ...).
std::regex TYPE_INT_REGEX ("int\\b");
// Identifier: a letter or underscore followed by any number of word
// characters (letters, digits, underscore).
std::regex IDENTIFIER_REGEX ("[A-Za-z_]\\w*");
// Writes a one-line, space-separated dump of `tokens` to stdout, followed
// by a newline. Tokens that carry a payload are shown as Type(INT),
// Number(<value>) or Identifier(<name>); every other token is shown as its
// source character.
void print_tokens(vector<Token> tokens) {
    // Source spelling of a payload-free token, or nullptr for any other kind.
    const auto symbol_of = [](TokenType kind) -> const char* {
        switch (kind) {
            case TokenType::Plus:        return "+";
            case TokenType::Minus:       return "-";
            case TokenType::Star:        return "*";
            case TokenType::Slash:       return "/";
            case TokenType::Percent:     return "%";
            case TokenType::Equal:       return "=";
            case TokenType::Semicolon:   return ";";
            case TokenType::LParenthese: return "(";
            case TokenType::RParenthese: return ")";
            default:                     return nullptr;
        }
    };

    for (const Token& current : tokens) {
        if (current.type == TokenType::Type) {
            cout << "Type(INT)";
        } else if (current.type == TokenType::Number) {
            cout << "Number(" << current.data.number << ")";
        } else if (current.type == TokenType::Identifier) {
            cout << "Identifier(" << current.data.identifier << ")";
        } else if (const char* symbol = symbol_of(current.type)) {
            cout << symbol;
        }
        // Every token, recognised or not, is followed by a separator.
        cout << " ";
    }
    cout << endl;
}
vector<Token> tokenize(string str) {
vector<Token> tokens;
while (str.size() > 0) {
smatch m;
if (regex_search(str, m, NUMBER_REGEX, regex_constants::match_continuous)) {
Token token = {
.type = TokenType::Number,
.data = { .number = stof(m.str()) }
};
tokens.emplace_back(token);
str.erase(0, m.str().length());
}
else if (regex_search(str, m, TYPE_INT_REGEX, regex_constants::match_continuous)) {
Token token = {
.type = TokenType::Type,
.data = { .type = Type::Int }
};
tokens.emplace_back(token);
str.erase(0, m.str().length());
}
else if (regex_search(str, m, IDENTIFIER_REGEX, regex_constants::match_continuous)) {
char* identifier = new char[m.str().length() + 1];
strcpy(identifier, m.str().c_str());
Token token = {
.type = TokenType::Identifier,
.data = { .identifier = identifier }
};
tokens.emplace_back(token);
str.erase(0, m.str().length());
}
else if (str[0] == '+') {
Token token = { .type = TokenType::Plus };
tokens.emplace_back(token);
str.erase(0, 1);
}
else if (str[0] == '-') {
Token token = { .type = TokenType::Minus };
tokens.emplace_back(token);
str.erase(0, 1);
}
else if (str[0] == '*') {
Token token = { .type = TokenType::Star };
tokens.emplace_back(token);
str.erase(0, 1);
}
else if (str[0] == '/') {
Token token = { .type = TokenType::Slash };
tokens.emplace_back(token);
str.erase(0, 1);
}
else if (str[0] == '%') {
Token token = { .type = TokenType::Percent };
tokens.emplace_back(token);
str.erase(0, 1);
}
else if (str[0] == '=') {
Token token = { .type = TokenType::Equal };
tokens.emplace_back(token);
str.erase(0, 1);
}
else if (str[0] == ';') {
Token token = { .type = TokenType::Semicolon };
tokens.emplace_back(token);
str.erase(0, 1);
}
else if (str[0] == '(') {
Token token = { .type = TokenType::LParenthese };
tokens.emplace_back(token);
str.erase(0, 1);
}
else if (str[0] == ')') {
Token token = { .type = TokenType::RParenthese };
tokens.emplace_back(token);
str.erase(0, 1);
}
else if (isspace(str[0])) {
str.erase(0, 1);
}
else {
cerr << "Unknown token: " << str << endl;
break;
}
}
return tokens;
}