everything slow!

2024-07-11 21:14:23 -04:00 · 2024-07-11 21:14:23 -04:00 · ee3dc8d766
parent 63d6e89136
commit ee3dc8d766
14 changed files with 36427 additions and 93 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.25)
-project(blt-gp VERSION 0.0.60)
+project(blt-gp VERSION 0.0.61)

 include(CTest)

--- a/callgrind.out.14232
+++ b/callgrind.out.14232
--- a/examples/gp_symbolic_regression_example.cpp
+++ b/examples/gp_symbolic_regression_example.cpp
@ -16,9 +16,11 @@
 *  along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
 #include <blt/gp/program.h>
+#include <blt/profiling/profiler_v2.h>
 #include <blt/gp/tree.h>
 #include <blt/std/logging.h>
 #include <iostream>
+#include <thread>

 static constexpr long SEED = 41912;

@ -34,10 +36,11 @@ blt::gp::prog_config_t config = blt::gp::prog_config_t()
        .set_initial_max_tree_size(6)
        .set_elite_count(0)
        .set_max_generations(50)
-        .set_pop_size(500);
+        .set_pop_size(500)
+        .set_thread_count(0);

 blt::gp::type_provider type_system;
-blt::gp::gp_program program(type_system, blt::gp::random_t{SEED}, config); // NOLINT
+blt::gp::gp_program program{type_system, SEED, config};

 blt::gp::operation_t add([](float a, float b) { return a + b; }, "add");
 blt::gp::operation_t sub([](float a, float b) { return a - b; }, "sub");
@ -55,7 +58,7 @@ blt::gp::operation_t op_x([](const context& context) {
    return context.x;
 }, "x");

-constexpr auto fitness_function = [](blt::gp::tree_t& current_tree, blt::gp::fitness_t& fitness, blt::size_t index) {
+constexpr auto fitness_function = [](blt::gp::tree_t& current_tree, blt::gp::fitness_t& fitness, blt::size_t) {
    constexpr double value_cutoff = 1.e15;
    for (auto& fitness_case : fitness_cases)
    {
@ -80,6 +83,7 @@ float example_function(float x)

 int main()
 {
+    BLT_START_INTERVAL("Symbolic Regression", "Main");
    for (auto& fitness_case : fitness_cases)
    {
        constexpr float range = 10;
@ -110,11 +114,17 @@ int main()
    
    while (!program.should_terminate())
    {
+        BLT_START_INTERVAL("Symbolic Regression", "Gen");
        program.create_next_generation(blt::gp::select_tournament_t{}, blt::gp::select_tournament_t{}, blt::gp::select_tournament_t{});
+        BLT_END_INTERVAL("Symbolic Regression", "Gen");
+        BLT_START_INTERVAL("Symbolic Regression", "Fitness");
        program.next_generation();
        program.evaluate_fitness();
+        BLT_END_INTERVAL("Symbolic Regression", "Fitness");
    }
    
+    BLT_END_INTERVAL("Symbolic Regression", "Main");
+    
    auto best = program.get_best_individuals<3>();
    
    BLT_INFO("Best approximations:");
@ -125,8 +135,15 @@ int main()
        i.tree.print(program, std::cout);
        std::cout << "\n";
    }
-    BLT_INFO("");
+    auto& stats = program.get_population_stats();
+    BLT_INFO("Stats:");
+    BLT_INFO("Average fitness: %lf", stats.average_fitness.load());
+    BLT_INFO("Best fitness: %lf", stats.best_fitness.load());
+    BLT_INFO("Worst fitness: %lf", stats.worst_fitness.load());
+    BLT_INFO("Overall fitness: %lf", stats.overall_fitness.load());
    // TODO: make stats helper
    
+    BLT_PRINT_PROFILE("Symbolic Regression", blt::PRINT_CYCLES | blt::PRINT_THREAD | blt::PRINT_WALL);
+    
    return 0;
 }
--- a/examples/gp_test_2.cpp
+++ b/examples/gp_test_2.cpp
@ -24,7 +24,7 @@
 static constexpr long SEED = 41912;

 blt::gp::type_provider type_system;
-blt::gp::gp_program program(type_system, blt::gp::random_t{SEED}); // NOLINT
+blt::gp::gp_program program(type_system, SEED); // NOLINT

 blt::gp::operation_t add([](float a, float b) {
    BLT_TRACE("a: %f + b: %f = %f", a, b, a + b);
--- a/examples/gp_test_3.cpp
+++ b/examples/gp_test_3.cpp
@ -23,7 +23,7 @@
 static constexpr long SEED = 41912;

 blt::gp::type_provider type_system;
-blt::gp::gp_program program(type_system, blt::gp::random_t{SEED}); // NOLINT
+blt::gp::gp_program program(type_system, SEED); // NOLINT

 blt::gp::operation_t add([](float a, float b) { return a + b; });
 blt::gp::operation_t sub([](float a, float b) { return a - b; });
--- a/examples/gp_test_4.cpp
+++ b/examples/gp_test_4.cpp
@ -23,7 +23,7 @@
 static constexpr long SEED = 41912;

 blt::gp::type_provider type_system;
-blt::gp::gp_program program(type_system, blt::gp::random_t{SEED}); // NOLINT
+blt::gp::gp_program program(type_system, SEED); // NOLINT

 blt::gp::operation_t add([](float a, float b) { return a + b; });
 blt::gp::operation_t sub([](float a, float b) { return a - b; });
--- a/examples/gp_test_5.cpp
+++ b/examples/gp_test_5.cpp
@ -44,7 +44,7 @@ static constexpr long SEED = 41912;


 blt::gp::type_provider type_system;
-blt::gp::gp_program program(type_system, blt::gp::random_t{SEED}); // NOLINT
+blt::gp::gp_program program(type_system, SEED); // NOLINT

 blt::gp::operation_t add([](float a, float b) { return a + b; }, "add"); // 0
 blt::gp::operation_t sub([](float a, float b) { return a - b; }, "sub"); // 1
--- a/examples/gp_test_6.cpp
+++ b/examples/gp_test_6.cpp
@ -42,7 +42,7 @@ static constexpr long SEED = 41912;


 blt::gp::type_provider type_system;
-blt::gp::gp_program program(type_system, blt::gp::random_t{SEED}); // NOLINT
+blt::gp::gp_program program(type_system, SEED); // NOLINT

 blt::gp::operation_t add([](float a, float b) { return a + b; }, "add"); // 0
 blt::gp::operation_t sub([](float a, float b) { return a - b; }, "sub"); // 1
--- a/examples/gp_test_7.cpp
+++ b/examples/gp_test_7.cpp
@ -24,7 +24,7 @@ static constexpr long SEED = 41912;
 blt::gp::prog_config_t config = blt::gp::prog_config_t().set_elite_count(2);

 blt::gp::type_provider type_system;
-blt::gp::gp_program program(type_system, blt::gp::random_t{SEED}, config); // NOLINT
+blt::gp::gp_program program(type_system, SEED, config); // NOLINT
 std::array<float, 500> result_container;

 blt::gp::operation_t add([](float a, float b) { return a + b; }, "add"); // 0
@ -59,7 +59,7 @@ void print_best()
        auto& tree = v.tree;
        auto size = tree.get_values().size();
        BLT_TRACE("%lf [index %ld] (fitness: %lf, raw: %lf) (depth: %ld) (blocks: %ld) (size: t: %ld m: %ld u: %ld r: %ld) filled: %f%%",
-                  tree.get_evaluation_value<float>(nullptr), i, v.standardized_fitness, v.raw_fitness,
+                  tree.get_evaluation_value<float>(nullptr), i, v.fitness.adjusted_fitness, v.fitness.raw_fitness,
                  tree.get_depth(program), size.blocks, size.total_size_bytes, size.total_no_meta_bytes, size.total_used_bytes,
                  size.total_remaining_bytes,
                  static_cast<double>(size.total_used_bytes) / static_cast<double>(size.total_no_meta_bytes));
--- a/include/blt/gp/config.h
+++ b/include/blt/gp/config.h
@ -20,6 +20,7 @@
 #define BLT_GP_CONFIG_H

 #include <utility>
+#include <thread>
 #include <blt/std/types.h>
 #include <blt/gp/generators.h>
 #include <blt/gp/transformers.h>
@ -47,7 +48,11 @@ namespace blt::gp
        std::reference_wrapper<crossover_t> crossover;
        std::reference_wrapper<population_initializer_t> pop_initializer;
        
-        // default config (ramped half-and-half init) or for buildering
+        blt::size_t threads = std::thread::hardware_concurrency() - 1;
+        // number of individuals each thread claims from the evaluation queue at a time. this controls work granularity and can be tuned for better performance!
+        blt::size_t evaluation_size = 4;
+                
+        // default config (ramped half-and-half init); also the starting point for chained set_* builder calls
        prog_config_t();
        
        // default config with a user specified initializer
@ -60,6 +65,7 @@ namespace blt::gp
        prog_config_t& set_pop_size(blt::size_t pop)
        {
            population_size = pop;
+            //evaluation_size = (population_size / threads) / 2;
            return *this;
        }
        
@ -122,6 +128,19 @@ namespace blt::gp
            try_mutation_on_crossover_failure = new_try_mutation_on_crossover_failure;
            return *this;
        }
+        
+        /**
+         * Stores the number of worker threads the program should create.
+         * gp_program::create_threads() replaces a count of 0 with std::thread::hardware_concurrency() - 1.
+         * evaluation_size is not rescaled here; the (population_size / threads) / 2 heuristic is disabled.
+         *
+         * @param t requested worker thread count
+         * @return this config, so setter calls can be chained
+         */
+        prog_config_t& set_thread_count(blt::size_t t)
+        {
+            this->threads = t;
+            return *this;
+        }
+        
+        /**
+         * Sets how many individuals a thread claims from the evaluation queue at a time.
+         *
+         * @param s chunk size. A value of 0 is raised to 1: gp_program::execute_thread() subtracts the chunk size
+         *          from the remaining-work counter, so a zero-sized chunk would never make progress.
+         * @return this config, so setter calls can be chained
+         */
+        prog_config_t& set_evaluation_size(blt::size_t s)
+        {
+            evaluation_size = s == 0 ? 1 : s;
+            return *this;
+        }
    };
 }

--- a/include/blt/gp/program.h
+++ b/include/blt/gp/program.h
@ -30,6 +30,9 @@
 #include <algorithm>
 #include <memory>
 #include <array>
+#include <thread>
+#include <mutex>
+#include <atomic>

 #include <blt/std/ranges.h>
 #include <blt/std/hashmap.h>
@ -235,16 +238,16 @@ namespace blt::gp
             * call to one of the evaluator functions. This was the nicest way to provide this as C++ lacks reflection
             *
             * @param system type system to use in tree generation
-             * @param engine random engine to use throughout the program. TODO replace this with something better
+             * @param seed seed for the random engine used throughout the program.
             * @param context_size number of arguments which are always present as "context" to the GP system / operators
             */
-            explicit gp_program(type_provider& system, random_t engine):
-                    system(system), engine(engine)
-            {}
+            explicit gp_program(type_provider& system, blt::u64 seed):
+                    system(system), seed(seed)
+            { create_threads(); }
            
-            explicit gp_program(type_provider& system, random_t engine, prog_config_t config):
-                    system(system), engine(engine), config(config)
-            {}
+            explicit gp_program(type_provider& system, blt::u64 seed, prog_config_t config):
+                    system(system), seed(seed), config(config)
+            { create_threads(); }
            
            template<typename Crossover, typename Mutation, typename Reproduction, typename CreationFunc = decltype(default_next_pop_creator<Crossover, Mutation, Reproduction>)>
            void create_next_generation(Crossover&& crossover_selection, Mutation&& mutation_selection, Reproduction&& reproduction_selection,
@ -262,7 +265,7 @@ namespace blt::gp
            
            void evaluate_fitness()
            {
-                evaluate_fitness_func();
+                evaluate_fitness_internal();
            }
            
            /**
@ -280,10 +283,10 @@ namespace blt::gp
            {
                current_pop = config.pop_initializer.get().generate(
                        {*this, root_type, config.population_size, config.initial_min_tree_size, config.initial_max_tree_size});
-                evaluate_fitness_func = [this, &fitness_function]() {
-                    evaluate_fitness_internal(fitness_function);
+                evaluate_fitness_func = [&fitness_function](tree_t& current_tree, fitness_t& fitness, blt::size_t index) {
+                    fitness_function(current_tree, fitness, index);
                };
-                evaluate_fitness_func();
+                evaluate_fitness_internal();
            }
            
            void next_generation()
@ -343,11 +346,13 @@ namespace blt::gp
                return current_generation >= config.max_generations;
            }
            
-            [[nodiscard]] inline random_t& get_random()
+            /**
+             * Tells a worker thread whether it should leave its run loop.
+             *
+             * Only the lifetime flag is consulted. The destructor sets it and then joins every worker, so tying the
+             * answer to should_terminate() as well would leave the workers spinning (and the join blocked forever)
+             * whenever the program is destroyed before max_generations has been reached.
+             *
+             * @return true once the owning gp_program has begun destruction
+             */
+            [[nodiscard]] bool should_thread_terminate() const
            {
-                return engine;
+                return thread_helper.lifetime_over;
            }
            
+            [[nodiscard]] random_t& get_random() const;
+            
            [[nodiscard]] inline type_provider& get_typesystem()
            {
                return system;
@ -358,17 +363,17 @@ namespace blt::gp
                // we wanted a terminal, but could not find one, so we will select from a function that has a terminal
                if (storage.terminals[id].empty())
                    return select_non_terminal_too_deep(id);
-                return storage.terminals[id][engine.get_size_t(0, storage.terminals[id].size())];
+                return get_random().select(storage.terminals[id]);
            }
            
            inline operator_id select_non_terminal(type_id id)
            {
-                return storage.non_terminals[id][engine.get_size_t(0, storage.non_terminals[id].size())];
+                return get_random().select(storage.non_terminals[id]);
            }
            
            inline operator_id select_non_terminal_too_deep(type_id id)
            {
-                return storage.operators_ordered_terminals[id][engine.get_size_t(0, storage.operators_ordered_terminals[id].size())].first;
+                return get_random().select(storage.operators_ordered_terminals[id]).first;
            }
            
            inline operator_info& get_operator_info(operator_id id)
@ -408,29 +413,52 @@ namespace blt::gp
            
            [[nodiscard]] inline auto get_current_generation() const
            {
-                return current_generation;
+                return current_generation.load();
+            }
+            
+            /**
+             * @return a mutable reference to the program's current population statistics
+             */
+            [[nodiscard]] inline auto& get_population_stats()
+            {
+                auto& stats = current_stats;
+                return stats;
+            }
+            
+            /**
+             * Flags the program's lifetime as over so workers can observe it, then waits for every joinable worker thread.
+             */
+            ~gp_program()
+            {
+                thread_helper.lifetime_over = true;
+                for (auto& worker : thread_helper.threads)
+                {
+                    // skip anything that cannot be joined
+                    if (!worker->joinable())
+                        continue;
+                    worker->join();
+                }
+            }
        
        private:
            type_provider& system;
            
-            blt::gp::stack_allocator alloc;
-            
            operator_storage storage;
            population_t current_pop;
            population_stats current_stats;
            population_t next_pop;
-            blt::size_t current_generation = 0;
+            std::atomic_uint64_t current_generation = 0;
            
-            random_t engine;
+            blt::u64 seed;
            prog_config_t config;
            
+            // shared state used to coordinate the fitness-evaluation worker threads
+            struct concurrency_storage
+            {
+                // worker threads spawned by create_threads(); joined in the destructor
+                std::vector<std::unique_ptr<std::thread>> threads;
+                // held while a range of individuals is claimed from evaluation_left, and while a new pass is set up
+                std::mutex evaluation_control;
+                // number of individuals not yet claimed for evaluation in the current pass
+                std::atomic_uint64_t evaluation_left = 0;
+                // set to config.threads + 1 when a pass starts; decremented by each thread that finishes draining the queue
+                std::atomic_uint64_t threads_left = 0;
+                
+                // set to true by the destructor before it joins the workers
+                std::atomic_bool lifetime_over = false;
+            } thread_helper;
+            
            // for convenience, shouldn't decrease performance too much
-            std::function<void()> evaluate_fitness_func;
+            std::function<void(tree_t&, fitness_t&, blt::size_t)> evaluate_fitness_func;
            
            inline selector_args get_selector_args()
            {
-                return {*this, next_pop, current_pop, current_stats, config, engine};
+                return {*this, next_pop, current_pop, current_stats, config, get_random()};
            }
            
            template<typename Return, blt::size_t size, typename Accessor, blt::size_t... indexes>
@ -440,10 +468,46 @@ namespace blt::gp
                return Return{accessor(arr, indexes)...};
            }
            
-            template<typename Callable>
-            void evaluate_fitness_internal(Callable&& fitness_function)
+            void create_threads();
+            
+            void execute_thread();
+            
+            void evaluate_fitness_internal()
            {
-                current_stats = {};
+                current_stats.clear();
+                {
+                    std::scoped_lock lock(thread_helper.evaluation_control);
+                    thread_helper.evaluation_left = current_pop.get_individuals().size();
+                    thread_helper.threads_left = config.threads + 1;
+                }
+                
+                while (thread_helper.threads_left > 0)
+                    execute_thread();
+                
+//                for (auto& ind : current_pop.get_individuals())
+//                {
+//                    if (ind.fitness.adjusted_fitness > current_stats.best_fitness)
+//                    {
+//                        current_stats.best_fitness = ind.fitness.adjusted_fitness;
+//                    }
+//
+//                    if (ind.fitness.adjusted_fitness < current_stats.worst_fitness)
+//                    {
+//                        current_stats.worst_fitness = ind.fitness.adjusted_fitness;
+//                    }
+//
+//                    current_stats.overall_fitness = current_stats.overall_fitness + ind.fitness.adjusted_fitness;
+//                }
+                
+                current_stats.average_fitness = current_stats.overall_fitness / static_cast<double>(config.population_size);
+//
+//                BLT_INFO("Stats:");
+//                BLT_INFO("Average fitness: %lf", current_stats.average_fitness.load());
+//                BLT_INFO("Best fitness: %lf", current_stats.best_fitness.load());
+//                BLT_INFO("Worst fitness: %lf", current_stats.worst_fitness.load());
+//                BLT_INFO("Overall fitness: %lf", current_stats.overall_fitness.load());
+                
+                /*current_stats = {};
                for (const auto& ind : blt::enumerate(current_pop.get_individuals()))
                {
                    fitness_function(ind.second.tree, ind.second.fitness, ind.first);
@ -461,51 +525,7 @@ namespace blt::gp
                    
                    current_stats.overall_fitness += ind.second.fitness.adjusted_fitness;
                }
-                current_stats.average_fitness /= static_cast<double>(config.population_size);
-//                double min = 0;
-//                double max = 0;
-//                for (auto& ind : current_pop.get_individuals())
-//                {
-//                    if (ind.raw_fitness < min)
-//                        min = ind.raw_fitness;
-//                    if (ind.raw_fitness > max)
-//                        max = ind.raw_fitness;
-//                }
-//
-//                double overall_fitness = 0;
-//                double best_fitness = 2;
-//                double worst_fitness = 0;
-//                individual* best = nullptr;
-//                individual* worst = nullptr;
-//
-//                auto diff = -min;
-//                for (auto& ind : current_pop.get_individuals())
-//                {
-//                    // make standardized fitness [0, +inf)
-//                    ind.standardized_fitness = ind.raw_fitness + diff;
-//                    //BLT_WARN(ind.standardized_fitness);
-//                    if (larger_better)
-//                        ind.standardized_fitness = (max + diff) - ind.standardized_fitness;
-//                    //BLT_WARN(ind.standardized_fitness);
-//                    //ind.adjusted_fitness = (1.0 / (1.0 + ind.standardized_fitness));
-//
-//                    if (ind.standardized_fitness > worst_fitness)
-//                    {
-//                        worst_fitness = ind.standardized_fitness;
-//                        worst = &ind;
-//                    }
-//
-//                    if (ind.standardized_fitness < best_fitness)
-//                    {
-//                        best_fitness = ind.standardized_fitness;
-//                        best = &ind;
-//                    }
-//
-//                    overall_fitness += ind.standardized_fitness / static_cast<double>(config.population_size);
-//                }
-//
-//                current_stats = {overall_fitness, overall_fitness, best_fitness, worst_fitness, best,
-//                                 worst};
+                current_stats.average_fitness = current_stats.overall_fitness / static_cast<double>(config.population_size);*/
            }
    };
    
--- a/include/blt/gp/tree.h
+++ b/include/blt/gp/tree.h
@ -27,6 +27,7 @@
 #include <utility>
 #include <stack>
 #include <ostream>
+#include <atomic>

 namespace blt::gp
 {
@ -143,14 +144,20 @@ namespace blt::gp
    
    struct population_stats
    {
-        double overall_fitness = 0;
-        double average_fitness = 0;
-        double best_fitness = 0;
-        double worst_fitness = 1;
-        // these will never be null unless your pop is not initialized / fitness eval was not called!
-        individual* best_individual = nullptr;
-        individual* worst_individual = nullptr;
+        std::atomic<double> overall_fitness = 0;
+        std::atomic<double> average_fitness = 0;
+        std::atomic<double> best_fitness = 0;
+        std::atomic<double> worst_fitness = 1;
        std::vector<double> normalized_fitness;
+        
+        void clear()
+        {
+            overall_fitness = 0;
+            average_fitness = 0;
+            best_fitness = 0;
+            worst_fitness = 0;
+            normalized_fitness.clear();
+        }
    };
    
    class population_t
--- a/lib/blt
+++ b/lib/blt
@ -1 +1 @@
-Subproject commit 456eeb12ac416a4ac4b5e72213f5a93fa576607c
+Subproject commit c5f3d9ba3b805d16c44cca020eeeec8abcee443f
--- a/src/program.cpp
+++ b/src/program.cpp
@ -16,10 +16,10 @@
 *  along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
 #include <blt/gp/program.h>
+#include <iostream>

 namespace blt::gp
 {
-    
    // default static references for mutation, crossover, and initializer
    // this is largely to not break the tests :3
    // it's also to allow for quick setup of a gp program if you don't care how crossover or mutation is handled
@ -43,4 +43,68 @@ namespace blt::gp
    prog_config_t::prog_config_t(size_t populationSize):
            population_size(populationSize), mutator(s_mutator), crossover(s_crossover), pop_initializer(s_init)
    {}
+    
+    /**
+     * Returns the random engine belonging to the calling thread.
+     *
+     * The engine has thread-local static storage, so it is constructed from `seed` once per thread, on that
+     * thread's first call, and reused afterwards.
+     * NOTE(review): every thread therefore starts from the same seed, and the engine is shared by all gp_program
+     * instances used on that thread regardless of their own seed - confirm this is intended.
+     */
+    random_t& gp_program::get_random() const
+    {
+        static thread_local random_t per_thread_engine{seed};
+        return per_thread_engine;
+    }
+    
+    /**
+     * Spawns the worker threads used for fitness evaluation.
+     *
+     * A configured thread count of 0 means "choose automatically": one worker per hardware thread, minus one for the
+     * calling thread, which also runs execute_thread() during evaluation. std::thread::hardware_concurrency() may
+     * return 0 when the value cannot be determined, so the subtraction is guarded; otherwise it would wrap around to
+     * an enormous thread count.
+     *
+     * Each worker keeps calling execute_thread() until should_thread_terminate() reports true.
+     */
+    void gp_program::create_threads()
+    {
+        if (config.threads == 0)
+        {
+            const auto hardware_threads = std::thread::hardware_concurrency();
+            config.set_thread_count(hardware_threads > 1 ? hardware_threads - 1 : 0);
+        }
+        for (blt::size_t i = 0; i < config.threads; i++)
+        {
+            // make_unique keeps the thread object owned at all times, even if growing the vector throws
+            thread_helper.threads.push_back(std::make_unique<std::thread>([this]() {
+                while (!should_thread_terminate())
+                {
+                    execute_thread();
+                }
+                std::cout << "Ending Thread!" << std::endl;
+            }));
+        }
+    }
+    
+    /**
+     * Runs one evaluation pass on the calling thread, if there is work to do.
+     *
+     * While individuals remain unclaimed, the thread takes the evaluation_control lock, claims up to
+     * config.evaluation_size indices from the tail of the remaining range, releases the lock and evaluates them with
+     * evaluate_fitness_func. After each individual it folds the adjusted fitness into the shared best / worst /
+     * overall statistics using compare-exchange loops. When no work is left it decrements threads_left once.
+     *
+     * NOTE(review): a thread that never observes evaluation_left > 0 during a pass does not decrement threads_left,
+     * while evaluate_fitness_internal() sets it to config.threads + 1 and waits for it to reach zero. If other
+     * threads drain the queue first, that wait cannot complete. Fixing this needs a change to the shared bookkeeping,
+     * not just to this function.
+     */
+    void gp_program::execute_thread()
+    {
+        if (thread_helper.evaluation_left > 0)
+        {
+            // a chunk size of zero would never shrink evaluation_left and this loop would spin forever
+            const blt::size_t chunk_size = config.evaluation_size == 0 ? 1 : config.evaluation_size;
+            while (thread_helper.evaluation_left > 0)
+            {
+                blt::size_t begin = 0;
+                blt::size_t end = 0;
+                {
+                    std::scoped_lock lock(thread_helper.evaluation_control);
+                    // read the counter once under the lock; both operands of std::min now share one type, which keeps
+                    // this portable to platforms where std::uint64_t and size_t are distinct types
+                    const blt::size_t remaining = thread_helper.evaluation_left.load();
+                    const blt::size_t size = std::min(remaining, chunk_size);
+                    end = remaining;
+                    begin = remaining - size;
+                    thread_helper.evaluation_left -= size;
+                }
+                // another thread may have taken the last chunk between the unlocked check and the lock; then begin == end
+                for (blt::size_t i = begin; i < end; i++)
+                {
+                    auto& ind = current_pop.get_individuals()[i];
+                    
+                    evaluate_fitness_func(ind.tree, ind.fitness, i);
+                    
+                    // raise best_fitness if this individual beats it; retry while other threads interfere
+                    auto old_best = current_stats.best_fitness.load();
+                    while (ind.fitness.adjusted_fitness > old_best &&
+                           !current_stats.best_fitness.compare_exchange_weak(old_best, ind.fitness.adjusted_fitness,
+                                                                             std::memory_order_release,
+                                                                             std::memory_order_relaxed));
+                    
+                    // lower worst_fitness if this individual is below it
+                    auto old_worst = current_stats.worst_fitness.load();
+                    while (ind.fitness.adjusted_fitness < old_worst &&
+                           !current_stats.worst_fitness.compare_exchange_weak(old_worst, ind.fitness.adjusted_fitness,
+                                                                              std::memory_order_release, std::memory_order_relaxed));
+                    
+                    // atomic add on a double, spelled as a compare-exchange loop
+                    auto old_overall = current_stats.overall_fitness.load();
+                    while (!current_stats.overall_fitness.compare_exchange_weak(old_overall, ind.fitness.adjusted_fitness + old_overall,
+                                                                                std::memory_order_release, std::memory_order_relaxed));
+                }
+            }
+            thread_helper.threads_left--;
+        }
+    }
 }