Stupid tokenizer; I'm just going to use a library

2024-07-27 19:16:06 -04:00 · 2024-07-27 19:16:06 -04:00 · 04ff53315d
parent 5dbf56d65b
commit 04ff53315d
3 changed files with 93 additions and 8 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.25)
-project(graphs VERSION 0.1.0)
+project(graphs VERSION 0.1.1)

 option(ENABLE_ADDRSAN "Enable the address sanitizer" OFF)
 option(ENABLE_UBSAN "Enable the ub sanitizer" OFF)
--- a/include/tokenizer.h
+++ b/include/tokenizer.h
@ -24,8 +24,9 @@

 namespace proc
 {
-    enum class token_e
+    enum class state_t
    {
+        NONE,           // Default state, no token found.
        SQUARE_OPEN,    // [
        SQUARE_CLOSE,   // ]
        CURLY_OPEN,     // {
@ -35,8 +36,8 @@ namespace proc
        SEMI,           // ;
        COLON,          // :
        COMMENT,        // # or //
-        BLOCK_BEGIN,    // /* or /**
-        BLOCK_CLOSE,    // */
+        COMMENT_BEGIN,    // /* or /**
+        COMMENT_CLOSE,    // */
        STAR,           // *
        TEXT,           // any text inside quotes
        IDENT,          // identifier
@ -49,7 +50,7 @@ namespace proc
    struct token_t
    {
        // the type of this token
-        token_e token;
+        state_t token;
        // position inside file
        blt::size_t token_pos;
        // all data associated with token. will contain all text if text or the token characters otherwise
@ -94,8 +95,44 @@ namespace proc
            const std::vector<token_t>& tokenize();
        
        private:
-            std::string data;
+            /**
+             * Looks at a character relative to the read cursor without consuming it.
+             *
+             * @param offset distance from current_pos to look ahead (0 is the next unread character)
+             * @return the character at current_pos + offset, or '\0' when that position is past the end of data
+             */
+            [[nodiscard]] char peek(blt::size_t offset = 0) const
+            {
+                // Compare against the remaining length instead of adding first, so a large offset cannot wrap around.
+                if (current_pos >= data.size() || offset >= data.size() - current_pos)
+                    return '\0';
+                return data[current_pos + offset];
+            }
+            
+            /**
+             * Consumes one character from data.
+             * No bounds check is performed here.
+             *
+             * @return the character that was at current_pos before the cursor moved forward by one
+             */
+            char advance()
+            {
+                const char consumed = data[current_pos];
+                ++current_pos;
+                return consumed;
+            }
+            
+            /**
+             * Checks whether the character `size` positions past the read cursor exists.
+             *
+             * @param size look-ahead distance from current_pos (0 asks about the next unread character)
+             * @return true when current_pos + size is a valid index into data
+             */
+            [[nodiscard]] bool has_next(blt::size_t size = 0) const
+            {
+                // Written as a subtraction so that a very large `size` cannot overflow the sum and wrap to a small index.
+                return current_pos < data.size() && size < data.size() - current_pos;
+            }
+            
+            [[nodiscard]] bool is_digit(char c) const;
+            
+            /**
+             * Finishes the token currently being built, if there is one.
+             *
+             * When state is anything other than NONE, a token is appended to `tokens` with the current state as its
+             * type, `begin` as its position, and its data built from the `current_pos - begin` characters of `data`
+             * that start at `begin`. The state is then reset to NONE. When state is already NONE nothing happens.
+             */
+            void new_token()
+            {
+                if (state != state_t::NONE)
+                {
+                    tokens.push_back({state, begin, {&data[begin], current_pos - begin}});
+                    state = state_t::NONE;
+                }
+            }
+            
+            /**
+             * Tells whether a character classified as `s` is compatible with the token in progress.
+             *
+             * @param s state the next character was classified as
+             * @return true when no token is in progress (state is NONE) or when `s` equals the current state
+             */
+            [[nodiscard]] bool can_state(state_t s) const
+            {
+                return state == state_t::NONE || s == state;
+            }
+        
+        private:
+            state_t state = state_t::NONE;
            blt::size_t current_pos = 0;
+            blt::size_t line_number = 1;
+            blt::size_t begin = current_pos;
+            
+            std::string data;
+            
            std::vector<token_t> tokens;
    };
 }
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@ -16,8 +16,56 @@
 *  along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
 #include <tokenizer.h>
+#include <cctype>

-const std::vector<token_t>& proc::tokenizer::tokenize()
+namespace proc
 {
+    /**
+     * Scans `data` from the current cursor position to its end and appends the tokens found to `tokens`.
+     *
+     * Whitespace separates tokens; each '\n' additionally produces a NEWLINE token and bumps `line_number`.
+     * Digits (and a '.' that follows digits, see is_digit) are grouped into a single VALUE token; '[' and ']'
+     * each produce their own single-character token. Any other character is reported through BLT_ERROR together
+     * with up to 40 characters of context on either side, and is then skipped.
+     *
+     * @return reference to the accumulated token list
+     */
+    const std::vector<token_t>& tokenizer::tokenize()
+    {
+        while (has_next())
+        {
+            // Look at the character without consuming it: new_token() ends a token at current_pos, so a pending
+            // token has to be closed while the cursor still sits on the character that terminates it.
+            const char next = peek();
+            // <cctype> classifiers are undefined for negative char values, hence the cast to unsigned char.
+            if (std::isspace(static_cast<unsigned char>(next)))
+            {
+                new_token();
+                begin = current_pos;
+                advance();
+                if (next == '\n')
+                {
+                    state = state_t::NEWLINE;
+                    line_number++;
+                    new_token();
+                }
+                continue;
+            }
+            state_t determine = state_t::NONE;
+            if (is_digit(next))
+                determine = state_t::VALUE;
+            else
+            {
+                switch (next)
+                {
+                    case '[':
+                        determine = state_t::SQUARE_OPEN;
+                        break;
+                    case ']':
+                        determine = state_t::SQUARE_CLOSE;
+                        break;
+                    default:
+                    {
+                        BLT_ERROR("Failed to parse data, error found at character index %ld on line %ld", current_pos, line_number);
+                        BLT_ERROR("Context:");
+                        // Clamp both ends by hand: the positions are unsigned, so `current_pos - 40` would wrap
+                        // around near the start of the input instead of going negative.
+                        const blt::size_t context_begin = current_pos > 40 ? current_pos - 40 : 0;
+                        const blt::size_t context_end = std::min(data.size(), current_pos + 40);
+                        BLT_ERROR(std::string_view(data.data() + context_begin, context_end - context_begin));
+                        break;
+                    }
+                }
+            }
+            // Close the token in progress when this character cannot extend it.
+            if (!can_state(determine))
+                new_token();
+            // Start a new token at this character; an unrecognised character leaves the state at NONE.
+            if (state == state_t::NONE)
+            {
+                begin = current_pos;
+                state = determine;
+            }
+            advance();
+            // Only VALUE spans several characters; every other token is complete after one character.
+            if (state != state_t::VALUE)
+                new_token();
+        }
+        // Emit a token that was still being built when the input ended.
+        new_token();
+        return tokens;
+    }
    
+    /**
+     * Decides whether `c` can be part of a numeric VALUE token.
+     *
+     * @param c character to classify
+     * @return true for a decimal digit, or for '.' while a VALUE token is already in progress
+     */
+    bool tokenizer::is_digit(char c) const
+    {
+        // <cctype> classifiers are undefined for negative char values, hence the cast to unsigned char.
+        if (std::isdigit(static_cast<unsigned char>(c)))
+            return true;
+        return state == state_t::VALUE && c == '.';
+    }
 }