From cb3444fcf39e704cf0dfe70577b9ae6b2c8ec209 Mon Sep 17 00:00:00 2001 From: ala89 Date: Wed, 15 Nov 2023 14:31:11 +0100 Subject: [PATCH] Add code position to tokens --- Makefile | 2 +- src/tokenize.cpp | 187 +++++++++++++++++++----------------- test/expr_arithmetiques.cpp | 2 +- test/tokenize.cpp | 4 +- test/variables.cpp | 2 +- 5 files changed, 105 insertions(+), 92 deletions(-) diff --git a/Makefile b/Makefile index a93c65d..a4c6d7c 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ TESTS_OBJ = $(TESTS_SRC:$(TEST_SRCDIR)/%.cpp=$(BUILDDIR)/test-%) LD_CXXFLAGS = # Compilation flag -CXXFLAGS = -Wall -Wextra -g -O3 -std=c++2a +CXXFLAGS = -Wall -Wextra -g -O0 -std=c++2a # Remove warnings about unused variables, functions, ... # -Wno-unused-parameter -Wno-unused-function -Wno-unused-variable -Wno-unused-but-set-variable # Compile with debug diff --git a/src/tokenize.cpp b/src/tokenize.cpp index 140d684..99fd77c 100644 --- a/src/tokenize.cpp +++ b/src/tokenize.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include "include/tokenize.h" using namespace std; @@ -63,95 +64,107 @@ void _debug_print_tokens(vector tokens) { cout << endl; } -vector tokenize(string str) { +vector tokenize(vector input) { vector tokens; - while (str.size() > 0) { - smatch m; - if (regex_search(str, m, NUMBER_REGEX, regex_constants::match_continuous)) { - Token token = { - .type = TokenType::Int, - .data = stoi(m.str()) - }; - tokens.emplace_back(token); - str.erase(0, m.str().length()); - } - else if (regex_search(str, m, TYPE_INT_REGEX, regex_constants::match_continuous)) { - Token token = { - .type = TokenType::Type, - .data = Type::Int - }; - tokens.emplace_back(token); - str.erase(0, m.str().length()); - } - else if (regex_search(str, m, IDENTIFIER_REGEX, regex_constants::match_continuous)) { - Token token = { - .type = TokenType::Identifier, - .data = m.str() - }; - tokens.emplace_back(token); - str.erase(0, m.str().length()); - } - else if (str.size() >= 2 && str[0] == '+' && str[1] == '+') { - Token token = { .type = TokenType::DoublePlus }; - tokens.emplace_back(token); - str.erase(0, 2); - } - else if (str.size() >= 2 && str[0] == '-' && str[1] == '-') { - Token token = { .type = TokenType::DoubleMinus }; - tokens.emplace_back(token); - str.erase(0, 2); - } - else if (str[0] == '+') { - Token token = { .type = TokenType::Plus }; - tokens.emplace_back(token); - str.erase(0, 1); - } - else if (str[0] == '-') { - Token token = { .type = TokenType::Minus }; - tokens.emplace_back(token); - str.erase(0, 1); - } - else if (str[0] == '*') { - Token token = { .type = TokenType::Star }; - tokens.emplace_back(token); - str.erase(0, 1); - } - else if (str[0] == '/') { - Token token = { .type = TokenType::Slash }; - tokens.emplace_back(token); - str.erase(0, 1); - } - else if (str[0] == '%') { - Token token = { .type = TokenType::Percent }; - tokens.emplace_back(token); - str.erase(0, 1); - } - else if (str[0] == '=') { - Token token = { .type = TokenType::Equal }; - tokens.emplace_back(token); - str.erase(0, 1); - } - else if (str[0] == ';') { - Token token = { .type = TokenType::Semicolon }; - tokens.emplace_back(token); - str.erase(0, 1); - } - else if (str[0] == '(') { - Token token = { .type = TokenType::LParenthese }; - tokens.emplace_back(token); - str.erase(0, 1); - } - else if (str[0] == ')') { - Token token = { .type = TokenType::RParenthese }; - tokens.emplace_back(token); - str.erase(0, 1); - } - else if (isspace(str[0]) || str[0] == '\0') { - str.erase(0, 1); - } - else { - throw TokenError("Unknown token {}"); + for (int i = 0; i < int(input.size()); i++) { + string line = input[i]; + int j = 0; + + CodePosition pos = { .line = i, .column = j }; + + while (j < int(line.length())) { + string str = line.substr(j, string::npos); + smatch m; + + if (regex_search(str, m, NUMBER_REGEX, regex_constants::match_continuous)) { + Token token = { + .type = TokenType::Int, + .data = stoi(m.str()), + .pos = pos + }; + tokens.emplace_back(token); + j += m.str().length(); + } + else if (regex_search(str, m, TYPE_INT_REGEX, regex_constants::match_continuous)) { + Token token = { + .type = TokenType::Type, + .data = Type::Int, + .pos = pos + }; + tokens.emplace_back(token); + j += m.str().length(); + } + else if (regex_search(str, m, IDENTIFIER_REGEX, regex_constants::match_continuous)) { + Token token = { + .type = TokenType::Identifier, + .data = m.str(), + .pos = pos + }; + tokens.emplace_back(token); + j += m.str().length(); + } + else if (str.length() >= 2 && str[0] == '+' && str[1] == '+') { + Token token = { .type = TokenType::DoublePlus, .pos = pos }; + tokens.emplace_back(token); + j += 2; + } + else if (str.length() >= 2 && str[0] == '-' && str[1] == '-') { + Token token = { .type = TokenType::DoubleMinus, .pos = pos }; + tokens.emplace_back(token); + j += 2; + } + else if (str[0] == '+') { + Token token = { .type = TokenType::Plus, .pos = pos }; + tokens.emplace_back(token); + j += 1; + } + else if (str[0] == '-') { + Token token = { .type = TokenType::Minus, .pos = pos }; + tokens.emplace_back(token); + j += 1; + } + else if (str[0] == '*') { + Token token = { .type = TokenType::Star, .pos = pos }; + tokens.emplace_back(token); + j += 1; + } + else if (str[0] == '/') { + Token token = { .type = TokenType::Slash, .pos = pos }; + tokens.emplace_back(token); + j += 1; + } + else if (str[0] == '%') { + Token token = { .type = TokenType::Percent, .pos = pos }; + tokens.emplace_back(token); + j += 1; + } + else if (str[0] == '=') { + Token token = { .type = TokenType::Equal, .pos = pos }; + tokens.emplace_back(token); + j += 1; + } + else if (str[0] == ';') { + Token token = { .type = TokenType::Semicolon, .pos = pos }; + tokens.emplace_back(token); + j += 1; + } + else if (str[0] == '(') { + Token token = { .type = TokenType::LParenthese, .pos = pos }; + tokens.emplace_back(token); + j += 1; + } + else if (str[0] == ')') { + Token token = { .type = TokenType::RParenthese, .pos = pos }; + tokens.emplace_back(token); + j += 1; + } + else if (isspace(str[0]) || str[0] == '\0') { + j += 1; + } + else { + throw TokenError("Unknown token {}"); + } } } diff --git a/test/expr_arithmetiques.cpp b/test/expr_arithmetiques.cpp index e95f09c..bee7cf7 100644 --- a/test/expr_arithmetiques.cpp +++ b/test/expr_arithmetiques.cpp @@ -5,7 +5,7 @@ #include "../src/include/interpreter.h" int execute(string s) { - vector tokens = tokenize(s); + vector tokens = tokenize({ s }); Node ast = parse(tokens); EvalResult res = eval(ast); diff --git a/test/tokenize.cpp b/test/tokenize.cpp index b05a5db..bc4edc7 100644 --- a/test/tokenize.cpp +++ b/test/tokenize.cpp @@ -13,7 +13,7 @@ int main() { { string input = "int a = x+++7;"; - vector tokens = tokenize(input); + vector tokens = tokenize({ input }); _TEST_ASSERT( _TEST_NO_EXCEPTION(tokens.size() == 8), @@ -97,7 +97,7 @@ int main() { string input = "int a = 10 @;"; _TEST_ASSERT( - _TEST_IS_EXCEPTION(tokenize(input), TokenError), + _TEST_IS_EXCEPTION(tokenize({ input }), TokenError), "Token invalide", true ) diff --git a/test/variables.cpp b/test/variables.cpp index dead4ff..0ed5c61 100644 --- a/test/variables.cpp +++ b/test/variables.cpp @@ -5,7 +5,7 @@ #include "../src/include/interpreter.h" int execute(string s) { - vector tokens = tokenize(s); + vector tokens = tokenize({ s }); Node ast = parse(tokens); EvalResult res = eval(ast);