stupid tokenizer im just going to use a library

main
Brett 2024-07-27 19:16:06 -04:00
parent 5dbf56d65b
commit 04ff53315d
3 changed files with 93 additions and 8 deletions

View File

@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.25) cmake_minimum_required(VERSION 3.25)
project(graphs VERSION 0.1.0) project(graphs VERSION 0.1.1)
option(ENABLE_ADDRSAN "Enable the address sanitizer" OFF) option(ENABLE_ADDRSAN "Enable the address sanitizer" OFF)
option(ENABLE_UBSAN "Enable the ub sanitizer" OFF) option(ENABLE_UBSAN "Enable the ub sanitizer" OFF)

View File

@@ -24,8 +24,9 @@
namespace proc namespace proc
{ {
enum class token_e enum class state_t
{ {
NONE, // Default state, no token found.
SQUARE_OPEN, // [ SQUARE_OPEN, // [
SQUARE_CLOSE, // ] SQUARE_CLOSE, // ]
CURLY_OPEN, // { CURLY_OPEN, // {
@@ -35,8 +36,8 @@ namespace proc
SEMI, // ; SEMI, // ;
COLON, // : COLON, // :
COMMENT, // # or // COMMENT, // # or //
BLOCK_BEGIN, // /* or /** COMMENT_BEGIN, // /* or /**
BLOCK_CLOSE, // */ COMMENT_CLOSE, // */
STAR, // * STAR, // *
TEXT, // any text inside quotes TEXT, // any text inside quotes
IDENT, // identifier IDENT, // identifier
@@ -49,7 +50,7 @@ namespace proc
struct token_t struct token_t
{ {
// the type of this token // the type of this token
token_e token; state_t token;
// position inside file // position inside file
blt::size_t token_pos; blt::size_t token_pos;
// all data associated with token. will contain all text if text or the token characters otherwise // all data associated with token. will contain all text if text or the token characters otherwise
@@ -94,8 +95,44 @@ namespace proc
const std::vector<token_t>& tokenize(); const std::vector<token_t>& tokenize();
private: private:
std::string data; [[nodiscard]] char peek(blt::size_t offset = 0) const
{
return data[current_pos + offset];
}
char advance()
{
return data[current_pos++];
}
bool has_next(blt::size_t size = 0)
{
return (current_pos + size) < data.size();
}
[[nodiscard]] bool is_digit(char c) const;
void new_token()
{
if (state == state_t::NONE)
return;
tokens.push_back({state, begin, {&data[begin], current_pos - begin}});
state = state_t::NONE;
}
bool can_state(state_t s)
{
return s == state || state == state_t::NONE;
}
private:
state_t state = state_t::NONE;
blt::size_t current_pos = 0; blt::size_t current_pos = 0;
blt::size_t line_number = 1;
blt::size_t begin = current_pos;
std::string data;
std::vector<token_t> tokens; std::vector<token_t> tokens;
}; };
} }

View File

@@ -16,8 +16,56 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>. * along with this program. If not, see <https://www.gnu.org/licenses/>.
*/ */
#include <tokenizer.h> #include <tokenizer.h>
#include <cctype>
const std::vector<token_t>& proc::tokenizer::tokenize() namespace proc
{ {
const std::vector<token_t>& tokenizer::tokenize()
{
while (has_next())
{
auto next = advance();
if (std::isspace(next))
{
if (next == '\n')
{
state = state_t::NEWLINE;
line_number++;
}
new_token();
continue;
}
state_t determine = state_t::NONE;
if (is_digit(next))
determine = state_t::VALUE;
else
{
switch (next)
{
case '[':
determine = state_t::SQUARE_OPEN;
break;
case ']':
determine = state_t::SQUARE_CLOSE;
break;
default:
BLT_ERROR("Failed to parse data, error found at character index %ld on line %ld", current_pos, line_number);
BLT_ERROR("Context:");
BLT_ERROR(std::string_view(&data[std::max(0ul, current_pos - 40)], std::min(data.size() - current_pos, current_pos + 40)));
break;
}
}
if (!can_state(determine))
{
begin = current_pos;
new_token();
}
}
return tokens;
}
bool tokenizer::is_digit(char c) const
{
return std::isdigit(c) || (state == state_t::VALUE && c == '.');
}
} }