From f04271df1976087e7363bbfca0757871f1e710e9 Mon Sep 17 00:00:00 2001
From: ala89
Date: Fri, 27 Oct 2023 16:56:54 +0200
Subject: [PATCH] Add tokenizer

---
 .gitignore             |   3 +-
 src/include/tokenize.h |   3 +-
 src/tokenize.cpp       | 144 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 148 insertions(+), 2 deletions(-)
 create mode 100644 src/tokenize.cpp

diff --git a/.gitignore b/.gitignore
index 710fc64..9567a1a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 *.o
 build/*
-.vscode
\ No newline at end of file
+.vscode
+test
\ No newline at end of file
diff --git a/src/include/tokenize.h b/src/include/tokenize.h
index 2e08661..e06fd88 100644
--- a/src/include/tokenize.h
+++ b/src/include/tokenize.h
@@ -4,12 +4,13 @@
 #include
 using namespace std;
 
-enum class TokenType { Type, Number, Plus, Minus, Star, Slash, Percent, Equal, Semicolon, LParenthese, RParenthese };
+enum class TokenType { Type, Identifier, Number, Plus, Minus, Star, Slash, Percent, Equal, Semicolon, LParenthese, RParenthese };
 enum class Type { Int };
 
 union TokenData {
     double number;
     Type type;
+    char* identifier;
 };
 
 struct Token {
diff --git a/src/tokenize.cpp b/src/tokenize.cpp
new file mode 100644
index 0000000..ae8127d
--- /dev/null
+++ b/src/tokenize.cpp
@@ -0,0 +1,144 @@
+#include <cctype>
+#include <cstring>
+#include <iostream>
+#include <regex>
+#include <string>
+#include <vector>
+#include "include/tokenize.h"
+using namespace std;
+
+// Token patterns. They are only applied with match_continuous, so each one is
+// effectively anchored at the current scan position.
+regex NUMBER_REGEX ("\\d+(\\.\\d+)?");
+// \b rather than \s: "int" is the type keyword even when followed by '(' or by
+// the end of the input, while "integer" and "int_x" still lex as identifiers.
+regex TYPE_INT_REGEX ("int\\b");
+regex IDENTIFIER_REGEX ("[A-Za-z_]\\w*");
+
+// Builds a token of the given type with a zero-initialized payload.
+static Token make_token(TokenType type) {
+    Token token = {};
+    token.type = type;
+    return token;
+}
+
+// Maps a single-character operator or punctuator to its token type.
+// Returns false (leaving `type` untouched) when `c` is not a known symbol.
+static bool symbol_type(char c, TokenType& type) {
+    switch (c) {
+        case '+': type = TokenType::Plus; return true;
+        case '-': type = TokenType::Minus; return true;
+        case '*': type = TokenType::Star; return true;
+        case '/': type = TokenType::Slash; return true;
+        case '%': type = TokenType::Percent; return true;
+        case '=': type = TokenType::Equal; return true;
+        case ';': type = TokenType::Semicolon; return true;
+        case '(': type = TokenType::LParenthese; return true;
+        case ')': type = TokenType::RParenthese; return true;
+        default: return false;
+    }
+}
+
+// Writes the tokens to stdout on one line, each followed by a space, then a
+// newline. Debugging aid: numbers and identifiers are shown with their payload.
+void print_tokens(vector<Token> tokens) {
+    for (const Token& token : tokens) {
+        switch (token.type) {
+            case TokenType::Type:
+                cout << "Type(INT)";
+                break;
+            case TokenType::Number:
+                cout << "Number(" << token.data.number << ")";
+                break;
+            case TokenType::Identifier:
+                cout << "Identifier(" << token.data.identifier << ")";
+                break;
+            case TokenType::Plus:
+                cout << "+";
+                break;
+            case TokenType::Minus:
+                cout << "-";
+                break;
+            case TokenType::Star:
+                cout << "*";
+                break;
+            case TokenType::Slash:
+                cout << "/";
+                break;
+            case TokenType::Percent:
+                cout << "%";
+                break;
+            case TokenType::Equal:
+                cout << "=";
+                break;
+            case TokenType::Semicolon:
+                cout << ";";
+                break;
+            case TokenType::LParenthese:
+                cout << "(";
+                break;
+            case TokenType::RParenthese:
+                cout << ")";
+                break;
+        }
+        cout << " ";
+    }
+    cout << endl;
+}
+
+// Splits `str` into tokens: numbers, the `int` type keyword, identifiers and
+// the single-character symbols + - * / % = ; ( ). Whitespace is skipped.
+// On the first unrecognized character the remaining input is reported on
+// stderr and the tokens found so far are returned.
+// Identifier tokens point at a buffer allocated with new[]; nothing in this
+// file releases it, so the consumer of the tokens must delete[] it.
+vector<Token> tokenize(string str) {
+    vector<Token> tokens;
+
+    // Walk the input with a cursor instead of erasing the consumed prefix:
+    // erase(0, n) shifts the whole remainder, making the loop quadratic.
+    string::const_iterator cursor = str.cbegin();
+    const string::const_iterator last = str.cend();
+
+    while (cursor != last) {
+        smatch m;
+        TokenType symbol;  // set by symbol_type() when it returns true
+        if (regex_search(cursor, last, m, NUMBER_REGEX, regex_constants::match_continuous)) {
+            Token token = make_token(TokenType::Number);
+            // stod, not stof: the payload is a double and stof would round it
+            // through float first.
+            token.data.number = stod(m.str());
+            tokens.push_back(token);
+            cursor = m[0].second;
+        }
+        else if (regex_search(cursor, last, m, TYPE_INT_REGEX, regex_constants::match_continuous)) {
+            Token token = make_token(TokenType::Type);
+            token.data.type = Type::Int;
+            tokens.push_back(token);
+            cursor = m[0].second;
+        }
+        else if (regex_search(cursor, last, m, IDENTIFIER_REGEX, regex_constants::match_continuous)) {
+            const string name = m.str();
+            char* identifier = new char[name.length() + 1];
+            strcpy(identifier, name.c_str());
+            Token token = make_token(TokenType::Identifier);
+            token.data.identifier = identifier;
+            tokens.push_back(token);
+            cursor = m[0].second;
+        }
+        else if (symbol_type(*cursor, symbol)) {
+            tokens.push_back(make_token(symbol));
+            ++cursor;
+        }
+        else if (isspace(static_cast<unsigned char>(*cursor))) {
+            // Cast first: isspace is undefined for a negative char value.
+            ++cursor;
+        }
+        else {
+            cerr << "Unknown token: " << string(cursor, last) << endl;
+            break;
+        }
+    }
+
+    return tokens;
+}
\ No newline at end of file