stupid tokenizer im just going to use a library

main
Brett 2024-07-27 19:16:06 -04:00
parent 5dbf56d65b
commit 04ff53315d
3 changed files with 93 additions and 8 deletions

View File

@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.25) cmake_minimum_required(VERSION 3.25)
project(graphs VERSION 0.1.0) project(graphs VERSION 0.1.1)
option(ENABLE_ADDRSAN "Enable the address sanitizer" OFF) option(ENABLE_ADDRSAN "Enable the address sanitizer" OFF)
option(ENABLE_UBSAN "Enable the ub sanitizer" OFF) option(ENABLE_UBSAN "Enable the ub sanitizer" OFF)

View File

@@ -24,8 +24,9 @@
namespace proc namespace proc
{ {
enum class token_e enum class state_t
{ {
NONE, // Default state, no token found.
SQUARE_OPEN, // [ SQUARE_OPEN, // [
SQUARE_CLOSE, // ] SQUARE_CLOSE, // ]
CURLY_OPEN, // { CURLY_OPEN, // {
@@ -35,8 +36,8 @@ namespace proc
SEMI, // ; SEMI, // ;
COLON, // : COLON, // :
COMMENT, // # or // COMMENT, // # or //
BLOCK_BEGIN, // /* or /** COMMENT_BEGIN, // /* or /**
BLOCK_CLOSE, // */ COMMENT_CLOSE, // */
STAR, // * STAR, // *
TEXT, // any text inside quotes TEXT, // any text inside quotes
IDENT, // identifier IDENT, // identifier
@@ -49,7 +50,7 @@ namespace proc
struct token_t struct token_t
{ {
// the type of this token // the type of this token
token_e token; state_t token;
// position inside file // position inside file
blt::size_t token_pos; blt::size_t token_pos;
// all data associated with token. will contain all text if text or the token characters otherwise // all data associated with token. will contain all text if text or the token characters otherwise
@@ -94,8 +95,44 @@ namespace proc
const std::vector<token_t>& tokenize(); const std::vector<token_t>& tokenize();
private: private:
std::string data; [[nodiscard]] char peek(blt::size_t offset = 0) const
{
return data[current_pos + offset];
}
char advance()
{
return data[current_pos++];
}
bool has_next(blt::size_t size = 0)
{
return (current_pos + size) < data.size();
}
[[nodiscard]] bool is_digit(char c) const;
void new_token()
{
if (state == state_t::NONE)
return;
tokens.push_back({state, begin, {&data[begin], current_pos - begin}});
state = state_t::NONE;
}
bool can_state(state_t s)
{
return s == state || state == state_t::NONE;
}
private:
state_t state = state_t::NONE;
blt::size_t current_pos = 0; blt::size_t current_pos = 0;
blt::size_t line_number = 1;
blt::size_t begin = current_pos;
std::string data;
std::vector<token_t> tokens; std::vector<token_t> tokens;
}; };
} }

View File

@@ -16,8 +16,56 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>. * along with this program. If not, see <https://www.gnu.org/licenses/>.
*/ */
#include <tokenizer.h> #include <tokenizer.h>
#include <cctype>
const std::vector<token_t>& proc::tokenizer::tokenize() namespace proc
{ {
const std::vector<token_t>& tokenizer::tokenize()
{
while (has_next())
{
auto next = advance();
if (std::isspace(next))
{
if (next == '\n')
{
state = state_t::NEWLINE;
line_number++;
}
new_token();
continue;
}
state_t determine = state_t::NONE;
if (is_digit(next))
determine = state_t::VALUE;
else
{
switch (next)
{
case '[':
determine = state_t::SQUARE_OPEN;
break;
case ']':
determine = state_t::SQUARE_CLOSE;
break;
default:
BLT_ERROR("Failed to parse data, error found at character index %ld on line %ld", current_pos, line_number);
BLT_ERROR("Context:");
BLT_ERROR(std::string_view(&data[std::max(0ul, current_pos - 40)], std::min(data.size() - current_pos, current_pos + 40)));
break;
}
}
if (!can_state(determine))
{
begin = current_pos;
new_token();
}
}
return tokens;
}
bool tokenizer::is_digit(char c) const
{
return std::isdigit(c) || (state == state_t::VALUE && c == '.');
}
} }