From 68dc109dadc7c9955aa79ed7addbc178f7f9d805 Mon Sep 17 00:00:00 2001 From: Brett Laptop Date: Tue, 27 Aug 2024 16:31:34 -0400 Subject: [PATCH] adding rice because symbolic regression is too unstable --- .gitignore | 4 +- CMakeLists.txt | 3 +- examples/operations_common.h | 33 +++++ examples/rice_classification.cpp | 245 +++++++++++++++++++++++++++++++ examples/symbolic_regression.cpp | 22 +-- include/blt/gp/random.h | 88 +---------- lib/blt | 2 +- 7 files changed, 286 insertions(+), 111 deletions(-) create mode 100644 examples/operations_common.h create mode 100644 examples/rice_classification.cpp diff --git a/.gitignore b/.gitignore index c1d1eb6..c0f2ba8 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,5 @@ out/ massif.* callgrind.* *.out.* -<<<<<<< HEAD heaptrack.* -======= ->>>>>>> refs/remotes/origin/main +Rice_Cammeo_Osmancik.arff diff --git a/CMakeLists.txt b/CMakeLists.txt index f41377d..6ac753d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.25) -project(blt-gp VERSION 0.1.26) +project(blt-gp VERSION 0.1.27) include(CTest) @@ -106,6 +106,7 @@ endmacro() if (${BUILD_EXAMPLES}) blt_add_project(blt-symbolic-regression examples/symbolic_regression.cpp example) + blt_add_project(blt-rice-classification examples/rice_classification.cpp example) endif () diff --git a/examples/operations_common.h b/examples/operations_common.h new file mode 100644 index 0000000..5140595 --- /dev/null +++ b/examples/operations_common.h @@ -0,0 +1,33 @@ +#pragma once +/* + * Copyright (C) 2024 Brett Terpstra + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef BLT_GP_OPERATIONS_COMMON_H +#define BLT_GP_OPERATIONS_COMMON_H + +#include + +blt::gp::operation_t add([](float a, float b) { return a + b; }, "add"); +blt::gp::operation_t sub([](float a, float b) { return a - b; }, "sub"); +blt::gp::operation_t mul([](float a, float b) { return a * b; }, "mul"); +blt::gp::operation_t pro_div([](float a, float b) { return b == 0.0f ? 1.0f : a / b; }, "div"); +blt::gp::operation_t op_sin([](float a) { return std::sin(a); }, "sin"); +blt::gp::operation_t op_cos([](float a) { return std::cos(a); }, "cos"); +blt::gp::operation_t op_exp([](float a) { return std::exp(a); }, "exp"); +blt::gp::operation_t op_log([](float a) { return a == 0.0f ? 0.0f : std::log(a); }, "log"); + +#endif //BLT_GP_OPERATIONS_COMMON_H diff --git a/examples/rice_classification.cpp b/examples/rice_classification.cpp new file mode 100644 index 0000000..1a0a0f0 --- /dev/null +++ b/examples/rice_classification.cpp @@ -0,0 +1,245 @@ +/* + * + * Copyright (C) 2024 Brett Terpstra + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#include +#include +#include +#include +#include +#include +#include +#include "operations_common.h" +#include "blt/fs/loader.h" + + +//static constexpr long SEED = 41912; +static const unsigned long SEED = std::random_device()(); + +enum class rice_type_t +{ + Cammeo, + Osmancik +}; + +struct rice_record +{ + float area; + float perimeter; + float major_axis_length; + float minor_axis_length; + float eccentricity; + float convex_area; + float extent; + rice_type_t type; +}; + +std::vector fitness_cases; +std::vector testing_cases; + +blt::gp::prog_config_t config = blt::gp::prog_config_t() + .set_initial_min_tree_size(2) + .set_initial_max_tree_size(6) + .set_elite_count(2) + .set_crossover_chance(0.9) + .set_mutation_chance(0.1) + .set_reproduction_chance(0) + .set_max_generations(50) + .set_pop_size(500) + .set_thread_count(0); + +blt::gp::type_provider type_system; +blt::gp::gp_program program{type_system, SEED, config}; + +auto lit = blt::gp::operation_t([]() { + return program.get_random().get_float(-32000.0f, 32000.0f); +}, "lit").set_ephemeral(); + +blt::gp::operation_t op_area([](const rice_record& rice_data) { + return rice_data.area; +}, "area"); + +blt::gp::operation_t op_perimeter([](const rice_record& rice_data) { + return rice_data.perimeter; +}, "perimeter"); + +blt::gp::operation_t op_major_axis_length([](const rice_record& rice_data) { + return rice_data.major_axis_length; +}, "major_axis_length"); + +blt::gp::operation_t op_minor_axis_length([](const rice_record& rice_data) { + return rice_data.minor_axis_length; +}, "minor_axis_length"); + +blt::gp::operation_t op_eccentricity([](const rice_record& rice_data) { + return rice_data.eccentricity; +}, "eccentricity"); + +blt::gp::operation_t op_convex_area([](const rice_record& rice_data) { + return rice_data.convex_area; +}, "convex_area"); + +blt::gp::operation_t op_extent([](const rice_record& rice_data) { + return rice_data.extent; +}, "extent"); + +constexpr auto fitness_function = [](blt::gp::tree_t& current_tree, blt::gp::fitness_t& fitness, blt::size_t) { + constexpr double value_cutoff = 1.e15; + for (auto& fitness_case : fitness_cases) + { + auto diff = std::abs(fitness_case.y - current_tree.get_evaluation_value(&fitness_case)); + if (diff < value_cutoff) + { + fitness.raw_fitness += diff; + if (diff < 0.01) + fitness.hits++; + } else + fitness.raw_fitness += value_cutoff; + } + fitness.standardized_fitness = fitness.raw_fitness; + fitness.adjusted_fitness = (1.0 / (1.0 + fitness.standardized_fitness)); + return static_cast(fitness.hits) == fitness_cases.size(); +}; + +void load_rice_data(std::string_view rice_file_path) +{ + auto rice_file_data = blt::fs::getLinesFromFile(rice_file_path); + size_t index = 0; + while (!blt::string::contains(rice_file_data[index++], "@DATA")) + {} + std::vector c; + std::vector o; + for (std::string_view v : blt::itr_offset(rice_file_data, index)) + { + auto data = blt::string::split(v, ','); + rice_record r{std::stof(data[0]), std::stof(data[1]), std::stof(data[2]), std::stof(data[3]), std::stof(data[4]), std::stof(data[5]), + std::stof(data[6])}; + if (blt::string::contains(data[7], "Cammeo")) + { + r.type = rice_type_t::Cammeo; + c.push_back(r); + } else + { + r.type = rice_type_t::Osmancik; + o.push_back(r); + } + } + blt::size_t total_records = c.size() + o.size(); + blt::size_t training_size = total_records / 3; + for (blt::size_t i = 0; i < training_size; i++) + { + auto& random = program.get_random(); + auto& vec = random.choice() ? c : o; + auto pos = random.get_i64(0, static_cast(vec.size())); + fitness_cases.push_back(vec[pos]); + vec.erase(vec.begin() + pos); + } + testing_cases.insert(testing_cases.end(), c.begin(), c.end()); + testing_cases.insert(testing_cases.end(), o.begin(), o.end()); + std::shuffle(testing_cases.begin(), testing_cases.end(), program.get_random()); +} + +int main(int argc, const char** argv) +{ + blt::arg_parse parser; + parser.addArgument(blt::arg_builder{"-f", "--file"}.setHelp("File for rice data. Should be in .arff format.").setRequired().build()); + + auto args = parser.parse_args(argc, argv); + + auto rice_file_path = args.get("-f"); + + BLT_INFO("Starting BLT-GP Rice Classification Example"); + BLT_START_INTERVAL("Rice Classification", "Main"); + BLT_DEBUG("Setup Fitness cases"); + load_rice_data(rice_file_path); + + BLT_DEBUG("Setup Types and Operators"); + type_system.register_type(); + + blt::gp::operator_builder builder{type_system}; + program.set_operations(builder.build(add, sub, mul, pro_div, op_sin, op_cos, op_exp, op_log, lit, op_x)); + + BLT_DEBUG("Generate Initial Population"); + auto sel = blt::gp::select_tournament_t{}; + program.generate_population(type_system.get_type().id(), fitness_function, sel, sel, sel); + + BLT_DEBUG("Begin Generation Loop"); + while (!program.should_terminate()) + { + BLT_TRACE("------------{Begin Generation %ld}------------", program.get_current_generation()); + BLT_TRACE("Creating next generation"); + +#ifdef BLT_TRACK_ALLOCATIONS + auto gen_alloc = blt::gp::tracker.start_measurement(); +#endif + + BLT_START_INTERVAL("Rice Classification", "Gen"); + program.create_next_generation(); + BLT_END_INTERVAL("Rice Classification", "Gen"); + +#ifdef BLT_TRACK_ALLOCATIONS + blt::gp::tracker.stop_measurement(gen_alloc); + BLT_TRACE("Generation Allocated %ld times with a total of %s", gen_alloc.getAllocationDifference(), + blt::byte_convert_t(gen_alloc.getAllocatedByteDifference()).convert_to_nearest_type().to_pretty_string().c_str()); + auto fitness_alloc = blt::gp::tracker.start_measurement(); +#endif + + BLT_TRACE("Move to next generation"); + BLT_START_INTERVAL("Rice Classification", "Fitness"); + program.next_generation(); + BLT_TRACE("Evaluate Fitness"); + program.evaluate_fitness(); + BLT_END_INTERVAL("Rice Classification", "Fitness"); + +#ifdef BLT_TRACK_ALLOCATIONS + blt::gp::tracker.stop_measurement(fitness_alloc); + BLT_TRACE("Fitness Allocated %ld times with a total of %s", fitness_alloc.getAllocationDifference(), + blt::byte_convert_t(fitness_alloc.getAllocatedByteDifference()).convert_to_nearest_type().to_pretty_string().c_str()); +#endif + + BLT_TRACE("----------------------------------------------"); + std::cout << std::endl; + } + + BLT_END_INTERVAL("Rice Classification", "Main"); + + auto best = program.get_best_individuals<3>(); + + BLT_INFO("Best approximations:"); + for (auto& i_ref : best) + { + auto& i = i_ref.get(); + BLT_DEBUG("Fitness: %lf, stand: %lf, raw: %lf", i.fitness.adjusted_fitness, i.fitness.standardized_fitness, i.fitness.raw_fitness); + i.tree.print(program, std::cout); + std::cout << "\n"; + } + auto& stats = program.get_population_stats(); + BLT_INFO("Stats:"); + BLT_INFO("Average fitness: %lf", stats.average_fitness.load()); + BLT_INFO("Best fitness: %lf", stats.best_fitness.load()); + BLT_INFO("Worst fitness: %lf", stats.worst_fitness.load()); + BLT_INFO("Overall fitness: %lf", stats.overall_fitness.load()); + // TODO: make stats helper + + BLT_PRINT_PROFILE("Rice Classification", blt::PRINT_CYCLES | blt::PRINT_THREAD | blt::PRINT_WALL); + +#ifdef BLT_TRACK_ALLOCATIONS + BLT_TRACE("Total Allocations: %ld times with a total of %s", blt::gp::tracker.getAllocations(), + blt::byte_convert_t(blt::gp::tracker.getAllocatedBytes()).convert_to_nearest_type().to_pretty_string().c_str()); +#endif + + return 0; +} \ No newline at end of file diff --git a/examples/symbolic_regression.cpp b/examples/symbolic_regression.cpp index 7bb1160..3e2d24c 100644 --- a/examples/symbolic_regression.cpp +++ b/examples/symbolic_regression.cpp @@ -21,6 +21,7 @@ #include #include #include +#include "operations_common.h" //static constexpr long SEED = 41912; static const unsigned long SEED = std::random_device()(); @@ -40,21 +41,12 @@ blt::gp::prog_config_t config = blt::gp::prog_config_t() .set_mutation_chance(0.1) .set_reproduction_chance(0) .set_max_generations(50) - .set_pop_size(5000) + .set_pop_size(500) .set_thread_count(0); blt::gp::type_provider type_system; blt::gp::gp_program program{type_system, SEED, config}; -blt::gp::operation_t add([](float a, float b) { return a + b; }, "add"); -blt::gp::operation_t sub([](float a, float b) { return a - b; }, "sub"); -blt::gp::operation_t mul([](float a, float b) { return a * b; }, "mul"); -blt::gp::operation_t pro_div([](float a, float b) { return b == 0.0f ? 1.0f : a / b; }, "div"); -blt::gp::operation_t op_sin([](float a) { return std::sin(a); }, "sin"); -blt::gp::operation_t op_cos([](float a) { return std::cos(a); }, "cos"); -blt::gp::operation_t op_exp([](float a) { return std::exp(a); }, "exp"); -blt::gp::operation_t op_log([](float a) { return a == 0.0f ? 0.0f : std::log(a); }, "log"); - auto lit = blt::gp::operation_t([]() { return program.get_random().get_float(-320.0f, 320.0f); }, "lit").set_ephemeral(); @@ -107,7 +99,7 @@ int main() program.set_operations(builder.build(add, sub, mul, pro_div, op_sin, op_cos, op_exp, op_log, lit, op_x)); BLT_DEBUG("Generate Initial Population"); - auto sel = blt::gp::select_fitness_proportionate_t{}; + auto sel = blt::gp::select_tournament_t{}; program.generate_population(type_system.get_type().id(), fitness_function, sel, sel, sel); BLT_DEBUG("Begin Generation Loop"); @@ -174,14 +166,6 @@ int main() BLT_TRACE("Total Allocations: %ld times with a total of %s", blt::gp::tracker.getAllocations(), blt::byte_convert_t(blt::gp::tracker.getAllocatedBytes()).convert_to_nearest_type().to_pretty_string().c_str()); #endif - -// BLT_TRACE("Allocations:"); -// auto h = static_cast(blt::gp::hello.load()); -// auto u = static_cast(blt::gp::unhello.load()); -// BLT_TRACE("Allocated: %ld", h); -// BLT_TRACE("Deallocated: %ld", u); -// BLT_TRACE("Ratio: %lf Difference: %ld", static_cast(h) / static_cast(u), std::abs(h - u)); -// BLT_TRACE("Total Allocated Bytes: %ld", blt::gp::hello_bytes.load()); return 0; } \ No newline at end of file diff --git a/include/blt/gp/random.h b/include/blt/gp/random.h index eecc3bd..4875a57 100644 --- a/include/blt/gp/random.h +++ b/include/blt/gp/random.h @@ -25,93 +25,7 @@ namespace blt::gp { -#define BLT_RANDOM_FUNCTION blt::random::murmur_random64 -#define BLT_RANDOM_FLOAT blt::random::murmur_float64 -#define BLT_RANDOM_DOUBLE blt::random::murmur_double64 - - class random_t - { - public: - explicit random_t(blt::u64 seed): seed(seed) - {} - - void set_seed(blt::u64 s) - { - seed = s; - } - - float get_float() - { - return BLT_RANDOM_FLOAT(seed); - } - - double get_double() - { - return BLT_RANDOM_DOUBLE(seed); - } - - // [min, max) - double get_double(double min, double max) - { - return BLT_RANDOM_FUNCTION(seed, min, max); - } - - // [min, max) - float get_float(float min, float max) - { - return BLT_RANDOM_FUNCTION(seed, min, max); - } - - i32 get_i32(i32 min, i32 max) - { - return BLT_RANDOM_FUNCTION(seed, min, max); - } - - u32 get_u32(u32 min, u32 max) - { - return BLT_RANDOM_FUNCTION(seed, min, max); - } - - i64 get_i64(i64 min, i64 max) - { - return BLT_RANDOM_FUNCTION(seed, min, max); - } - - u64 get_u64(u64 min, u64 max) - { - return BLT_RANDOM_FUNCTION(seed, min, max); - } - - blt::size_t get_size_t(blt::size_t min, blt::size_t max) - { - return BLT_RANDOM_FUNCTION(seed, min, max); - } - - bool choice() - { - return BLT_RANDOM_DOUBLE(seed) < 0.5; - } - - bool choice(double cutoff) - { - return BLT_RANDOM_DOUBLE(seed) <= cutoff; - } - - template - auto& select(Container& container) - { - return container[get_u64(0, container.size())]; - } - - template - const auto& select(const Container& container) - { - return container[get_u64(0, container.size())]; - } - - private: - blt::u64 seed; - }; + using random_t = blt::random::random_t; } diff --git a/lib/blt b/lib/blt index 6632d04..b6354be 160000 --- a/lib/blt +++ b/lib/blt @@ -1 +1 @@ -Subproject commit 6632d045286b42d257eb3783e96256c13b588186 +Subproject commit b6354bed7846078e863767ce5afc7daa53b93988