skip to Main Content

I’m trying to optimize the running time of my C++ code by utilizing multiple threads. I have tried a few different solutions.

I have this code using boost:

#include <iostream>
#include <vector>
#include <chrono>
#include <atomic>
#include <thread>
#include <boost/asio.hpp>
#include <boost/bind/bind.hpp>

void Test::boost_worker_task() {
    char new_state[3][3];
    MyEngine::random_start_state(new_state);
    MyEngine::solve_game(new_state);
    ++games_solved_counter;
}

void Test::run(const unsigned int games_to_solve, const bool use_mul_thread) {
    MyEngine::init_rand();
    const auto start = std::chrono::high_resolution_clock::now();

    const unsigned int num_threads = use_mul_thread ? std::thread::hardware_concurrency() : 1;
    std::cout << "Using " << num_threads << " threads to solve " << games_to_solve << " games" << std::endl;

    boost::asio::io_service io_service;
    boost::asio::thread_pool pool(num_threads);

    for (unsigned int i = 0; i < games_to_solve; ++i) {
            io_service.post([] { return Test::boost_worker_task(); });
    }

    // Run and wait for all tasks to complete
    io_service.run();
    pool.join();

    const auto end = std::chrono::high_resolution_clock::now();
    const std::chrono::duration<double, std::milli> elapsed = end - start;

    std::cout << "Solved " << games_solved_counter << " games!" << std::endl;
    std::cout << "Elapsed time: " << elapsed.count() / 1000.0 << " seconds" << std::endl;
    std::cout << "Elapsed time: " << elapsed.count() << " millisecondsn" << std::endl;
}

As well as this code:

#include <iostream>
#include <vector>
#include <thread>
#include <mutex>
#include <chrono>
#include <atomic>
#include <future>

std::atomic<int> games_solved(0);

void TestMulThread::worker_task(const unsigned int num_iterations, std::mutex& games_solved_mutex) {
    for (unsigned int i = 0; i < num_iterations; ++i) {
        char new_state[3][3];
        MyEngine::random_start_state(new_state);
        MyEngine::solve_game(new_state);

        ++games_solved;
    }
}
void TestMulThread::run(const unsigned int total_games_to_solve) {
    MyEngine::init_rand();
    const auto start_time = std::chrono::high_resolution_clock::now();
    const unsigned int num_threads = std::thread::hardware_concurrency();
    const unsigned int games_per_thread = total_games_to_solve / num_threads;
    const unsigned int remaining_games = total_games_to_solve % games_per_thread;
        std::cout << "Using " << num_threads << " threads to solve " << total_games_to_solve << " games" << std::endl;

    // Distribute the remaining games
    std::vector<unsigned int> games_for_each_thread(num_threads, games_per_thread);
    for (unsigned int i = 0; i < remaining_games; ++i) {
        games_for_each_thread[i]++;
    }

    std::vector<std::future<void>> futures;
    std::mutex games_solved_mutex;

    for (unsigned int i = 0; i < num_threads; ++i) {
        futures.push_back(std::async(std::launch::async, worker_task, games_for_each_thread[i], std::ref(games_solved_mutex)));
    }

    for (auto& future : futures) {
        future.get();
    }

    const auto end_time = std::chrono::high_resolution_clock::now();
    const auto elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();
    std::cout << "Solved " << games_solved << " games!" << std::endl;
    std::cout << "Elapsed time: " << elapsed_time / 1000.0 << " seconds" << std::endl;
    std::cout << "Elapsed time: " << elapsed_time << " millisecondsn" << std::endl;
}

My issues are twofold:

  1. The boost version is running much slower than the second code. It’s even running slower than using a simple for loop and not trying to utilize several threads. I understand that trying to run tasks in parallel can lead to overhead of different sorts but the second code is running super fast and I would like to understand why.

  2. The second code snippet is running really fast when using Visual studio (compiling with Release and -O2 flags as C++ optimization), but when I compile and run the same code on my Linux machine using g++ it’s again running slower than using a for loop to run the same amount of games. I’ve tried compiling with a few different settings, like:

g++ -O2 -o test Test.cpp -std=c++20 -lpthread
g++ -O2 -o test Test.cpp -std=c++20 -pthread

Any ideas as to why this is the case?

Thanks!

2

Answers


  1. Chosen as BEST ANSWER

    @sehe

    That's interesting. However, if you don't mind having a look at this modified code:

    #include "test.h"
    #include <boost/asio.hpp>
    #include <iostream>
    #include <random>
    #include <thread>
    #include <math.h>
    
    #define USE_SINGLE_THREAD 1
    
    using namespace std::chrono_literals;
    static constexpr auto now = std::chrono::high_resolution_clock::now;
    
    void MyEngine::init_rand() {}
    void MyEngine::random_start_state(State&) {}
    void MyEngine::solve_game(State&) {
        thread_local double result = 0.0;
        result += sqrt(M_PI);
    
        thread_local std::mt19937 rng(std::random_device{}());
        thread_local std::uniform_int_distribution<std::mt19937::result_type> dist(1, 100000);
        const auto chance_to_print = dist(rng) == 1;
        if (chance_to_print)
            std::cout << "I'm running!" << std::endl;
    };
    
    void TestMulThread::worker_task(unsigned num_iterations) {
        MyEngine::init_rand();
        for (unsigned int i = 0; i < num_iterations; ++i) {
            char new_state[3][3];
            MyEngine::random_start_state(new_state);
            MyEngine::solve_game(new_state);
    
            ++games_solved;
        }
    }
    
    void TestMulThread::run(unsigned total_games_to_solve) {
        auto     start_time      = now();
        unsigned num_tasks       = std::thread::hardware_concurrency();
    #if USE_SINGLE_THREAD
        num_tasks       = 1;
    #endif
        unsigned games_per_task  = total_games_to_solve / num_tasks;
        unsigned remaining_games = total_games_to_solve % games_per_task;
        std::cout << "Using " << num_tasks << " tasks to solve " << total_games_to_solve << " games" << std::endl;
    
        // Distribute the remaining games
        std::vector buckets(num_tasks, games_per_task);
        for (unsigned int i = 0; i < remaining_games; ++i) {
            buckets[i]++;
        }
    
        std::vector<std::future<void>> futures;
    
        for (unsigned i = 0; i < num_tasks; ++i)
            futures.push_back(std::async(std::launch::async, worker_task, buckets[i]));
    
        for (auto& future : futures)
            future.get();
    
        auto elapsed = now() - start_time;
        std::cout << "Solved " << games_solved << " games!" << std::endl;
        std::cout << "Elapsed time: " << elapsed / 1.s << " seconds" << std::endl;
        std::cout << "Elapsed time: " << elapsed / 1ms << " millisecondsn" << std::endl;
    }
    
    void Test::boost_worker_task() {
        char new_state[3][3];
        MyEngine::random_start_state(new_state);
        MyEngine::solve_game(new_state);
        ++games_solved_counter;
    }
    
    void Test::run(unsigned games_to_solve, bool threaded) {
        auto const start = now();
    
    #if USE_SINGLE_THREAD
        threaded = false;
    #endif
        unsigned num_threads = threaded ? std::thread::hardware_concurrency() : 1;
        std::cout << "Using " << num_threads << " threads to solve " << games_to_solve << " games" << std::endl;
    
        boost::asio::thread_pool pool(num_threads);
    
        for (unsigned i = 0; i < games_to_solve; ++i)
            post(pool, Test::boost_worker_task);
    
        pool.join();
    
        auto elapsed = now() - start;
    
        std::cout << "Solved " << games_solved_counter << " games!" << std::endl;
        std::cout << "Elapsed time: " << elapsed / 1.s << " seconds" << std::endl;
        std::cout << "Elapsed time: " << elapsed / 1ms << " millisecondsn" << std::endl;
    }
    
    int main() { //
        auto N = 1'000'000u;
        Test::run(N, true);
        TestMulThread::run(N);
    }
    

    I've changed the solve_game implementation to somewhat resemble the task I do in the original code. I also added a chance to print to verify that the tasks are running. These are the surprising results I get:

    When using USE_SINGLE_THREAD 1:

    Using 1 threads to solve 1000000 games
    I'm running!
    ...
    Solved 1000000 games!
    Elapsed time: 0.126946 seconds
    Elapsed time: 126 milliseconds
    
    Using 1 tasks to solve 1000000 games
    I'm running!
    ...
    Solved 1000000 games!
    Elapsed time: 0.00492185 seconds
    Elapsed time: 4 milliseconds
    

    When using USE_SINGLE_THREAD 0:

    Using 16 threads to solve 1000000 games
    I'm running!
    ...
    Solved 1000000 games!
    Elapsed time: 1.31997 seconds
    Elapsed time: 1319 milliseconds
    
    Using 16 tasks to solve 1000000 games
    I'm running!
    ...
    Solved 1000000 games!
    Elapsed time: 0.0145032 seconds
    Elapsed time: 14 milliseconds
    

    How I compile it on linux: g++ -Ofast -o test test.cpp -lboost_thread -lboost_system && ./test


  2. The first program never uses the threadpool. Except to join.

    If you fix at least the first, the performance matches between the two:

    I’ve also removed cruft that didn’t get used or was unrelated to the question. Also reworded the chrono hell to be more elegant. Notably shifted the random generator to each task because they are NOT threadsafe! Finally renamed "threads" to "tasks" in the std::async version because std::async guarantees little about thread scheduling in the standard.

    Live On Coliru

    • File test.h

       #pragma once
       #include <atomic>
      
       namespace Test {
           static void boost_worker_task();
           static void run(unsigned, bool);
      
           static inline std::atomic_uint games_solved_counter{0};
       }; // namespace Test
      
       namespace TestMulThread {
           static void worker_task(unsigned);
           static void run(unsigned);
      
           static inline std::atomic_uint games_solved{0};
       }; // namespace TestMulThread
      
       struct MyEngine {
           using State = char[3][3];
           static void init_rand();
           static void random_start_state(State&);
           static void solve_game(State&);
       };
      
    • File test.cpp

       #include "test.h"
       #include <boost/asio.hpp>
       #include <iostream>
       #include <random>
       #include <thread>
       using namespace std::chrono_literals;
       static constexpr auto now = std::chrono::high_resolution_clock::now;
      
       void MyEngine::init_rand() {}
       void MyEngine::random_start_state(State&) {}
       void MyEngine::solve_game(State&) {
           thread_local static std::mt19937                  prng{std::random_device{}()};
           thread_local static std::uniform_int_distribution work(3, 8);
           std::this_thread::sleep_for(1ms * work(prng));
       };
      
       void TestMulThread::worker_task(unsigned num_iterations) {
           MyEngine::init_rand();
           for (unsigned int i = 0; i < num_iterations; ++i) {
               char new_state[3][3];
               MyEngine::random_start_state(new_state);
               MyEngine::solve_game(new_state);
      
               ++games_solved;
           }
       }
      
       void TestMulThread::run(unsigned total_games_to_solve) {
           auto     start_time      = now();
           unsigned num_tasks       = std::thread::hardware_concurrency();
           unsigned games_per_task  = total_games_to_solve / num_tasks;
           unsigned remaining_games = total_games_to_solve % games_per_task;
           std::cout << "Using " << num_tasks << " tasks to solve " << total_games_to_solve << " games" << std::endl;
      
           // Distribute the remaining games
           std::vector buckets(num_tasks, games_per_task);
           for (unsigned int i = 0; i < remaining_games; ++i) {
               buckets[i]++;
           }
      
           std::vector<std::future<void>> futures;
      
           for (unsigned i = 0; i < num_tasks; ++i)
               futures.push_back(std::async(std::launch::async, worker_task, buckets[i]));
      
           for (auto& future : futures)
               future.get();
      
           auto elapsed = now() - start_time;
           std::cout << "Solved " << games_solved << " games!" << std::endl;
           std::cout << "Elapsed time: " << elapsed / 1.s << " seconds" << std::endl;
           std::cout << "Elapsed time: " << elapsed / 1ms << " millisecondsn" << std::endl;
       }
      
       void Test::boost_worker_task() {
           char new_state[3][3];
           MyEngine::random_start_state(new_state);
           MyEngine::solve_game(new_state);
           ++games_solved_counter;
       }
      
       void Test::run(unsigned games_to_solve, bool threaded) {
           auto const start = now();
      
           unsigned num_threads = threaded ? std::thread::hardware_concurrency() : 1;
           std::cout << "Using " << num_threads << " threads to solve " << games_to_solve << " games" << std::endl;
      
           boost::asio::thread_pool pool(num_threads);
      
           for (unsigned i = 0; i < games_to_solve; ++i)
               post(pool, Test::boost_worker_task);
      
           // Run and wait for all tasks to complete
           pool.join();
      
           auto elapsed = now() - start;
      
           std::cout << "Solved " << games_solved_counter << " games!" << std::endl;
           std::cout << "Elapsed time: " << elapsed / 1.s << " seconds" << std::endl;
           std::cout << "Elapsed time: " << elapsed / 1ms << " millisecondsn" << std::endl;
       }
      
       int main() { //
           auto N = 10'000u;
           Test::run(N, true);
           TestMulThread::run(N);
       }
      

    Prints, online:

    Using 4 threads to solve 10000 games
    Solved 10000 games!
    Elapsed time: 14.0106 seconds
    Elapsed time: 14010 milliseconds
    
    Using 4 tasks to solve 10000 games
    Solved 10000 games!
    Elapsed time: 14.2782 seconds
    Elapsed time: 14278 milliseconds
    

    And locally for me:

    Using 8 threads to solve 10000 games
    Solved 10000 games!
    Elapsed time: 6.96683 seconds
    Elapsed time: 6966 milliseconds
    
    Using 8 threads to solve 10000 games
    Solved 10000 games!
    Elapsed time: 7.04044 seconds
    Elapsed time: 7040 milliseconds
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search