diff --git a/CMakeLists.txt b/CMakeLists.txt index dd00341..53b241d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.25) -project(graphs VERSION 0.1.0) +project(graphs VERSION 0.1.1) option(ENABLE_ADDRSAN "Enable the address sanitizer" OFF) option(ENABLE_UBSAN "Enable the ub sanitizer" OFF) diff --git a/include/tokenizer.h b/include/tokenizer.h index 476e414..72c8f27 100644 --- a/include/tokenizer.h +++ b/include/tokenizer.h @@ -24,8 +24,9 @@ namespace proc { - enum class token_e + enum class state_t { + NONE, // Default state, no token found. SQUARE_OPEN, // [ SQUARE_CLOSE, // ] CURLY_OPEN, // { @@ -35,8 +36,8 @@ namespace proc SEMI, // ; COLON, // : COMMENT, // # or // - BLOCK_BEGIN, // /* or /** - BLOCK_CLOSE, // */ + COMMENT_BEGIN, // /* or /** + COMMENT_CLOSE, // */ STAR, // * TEXT, // any text inside quotes IDENT, // identifier @@ -49,7 +50,7 @@ namespace proc struct token_t { // the type of this token - token_e token; + state_t token; // position inside file blt::size_t token_pos; // all data associated with token. 
will contain all text if text or the token characters otherwise @@ -94,8 +95,44 @@ namespace proc const std::vector<token_t>& tokenize(); private: - std::string data; + [[nodiscard]] char peek(blt::size_t offset = 0) const + { + return data[current_pos + offset]; + } + + char advance() + { + return data[current_pos++]; + } + + bool has_next(blt::size_t size = 0) + { + return (current_pos + size) < data.size(); + } + + [[nodiscard]] bool is_digit(char c) const; + + void new_token() + { + if (state == state_t::NONE) + return; + tokens.push_back({state, begin, {&data[begin], current_pos - begin}}); + state = state_t::NONE; + } + + bool can_state(state_t s) + { + return s == state || state == state_t::NONE; + } + + private: + state_t state = state_t::NONE; blt::size_t current_pos = 0; + blt::size_t line_number = 1; + blt::size_t begin = current_pos; + + std::string data; + std::vector<token_t> tokens; }; } diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index e8950da..391dd7d 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -16,8 +16,56 @@ * along with this program. If not, see <https://www.gnu.org/licenses/>. 
*/ #include <tokenizer.h> +#include <blt/std/logging.h> -const std::vector<proc::token_t>& proc::tokenizer::tokenize() +namespace proc { - + const std::vector<token_t>& tokenizer::tokenize() + { + while (has_next()) + { + auto next = advance(); + if (std::isspace(next)) + { + if (next == '\n') + { + state = state_t::NEWLINE; + line_number++; + } + new_token(); + continue; + } + state_t determine = state_t::NONE; + if (is_digit(next)) + determine = state_t::VALUE; + else + { + switch (next) + { + case '[': + determine = state_t::SQUARE_OPEN; + break; + case ']': + determine = state_t::SQUARE_CLOSE; + break; + default: + BLT_ERROR("Failed to parse data, error found at character index %ld on line %ld", current_pos, line_number); + BLT_ERROR("Context:"); + BLT_ERROR(std::string_view(&data[std::max(0ul, current_pos - 40)], std::min(data.size() - current_pos, current_pos + 40))); + break; + } + } + if (!can_state(determine)) + { + begin = current_pos; + new_token(); + } + } + return tokens; + } + + bool tokenizer::is_digit(char c) const + { + return std::isdigit(c) || (state == state_t::VALUE && c == '.'); + } }