// Extracted from the following git patch (header kept for provenance):
//
//   From 3e9274b6f91d7f3817f29fd8d539e068481931ce Mon Sep 17 00:00:00 2001
//   From: Jason Turner
//   Date: Sat, 30 Mar 2024 08:18:49 -0600
//   Subject: [PATCH] Create game_of_life.cpp for AdaptiveCpp example
//   ---
//    parallel_algorithms/game_of_life.cpp | 302 +++++++++++++++++++++++++++
//    1 file changed, 302 insertions(+)
//    create mode 100644 parallel_algorithms/game_of_life.cpp
//
//   diff --git a/parallel_algorithms/game_of_life.cpp b/parallel_algorithms/game_of_life.cpp
//   new file mode 100644
//   index 0000000..8af3033
//   --- /dev/null
//   +++ b/parallel_algorithms/game_of_life.cpp
//   @@ -0,0 +1,302 @@

#include <algorithm>
#include <array>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <execution>
#include <iostream>
#include <limits>
#include <memory>
#include <optional>
#include <string>
#include <utility>

// This is a simple conway's game-of-life implementation
// that is constexpr friendly and can work as a benchmark
// for parallel computation models in C++
//
// Notes I learned along the way while learning AdaptiveCpp
//
// AMD GPU Install notes:
// * AMD focuses on LTS ubuntu releases, if you have a different release,
//   expect a little pain
// * I had good luck installing the AMDGPU Installer option here:
//   https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#amdgpu-ubuntu
// * The rocm-gdb package would not install on my OS because of some outdated
//   dependencies
// * The amdgpu-install tool will set up the apt repositories that you need
// * If your OS is fully supported, just install the copy level package
// * Honestly, I just kept installing random ROCm packages until I got things
//   working,
//   which was I think everything except for the gdb package that I could not
//   install
//
// After You've Installed ROCm
// * add yourself to the render group
// * consider rebooting probably
// * run `rocminfo` and make sure it sees your GPUs
//
// Other GPUs:
// * I have no input here
//
// Use the "automatic installation script" to install llvm >= 14
// * https://apt.llvm.org/
// * You probably want to install "all"
// ```sh
// wget
// https://apt.llvm.org/llvm.sh
// chmod +x llvm.sh
// sudo ./llvm.sh all
// ```
//
// Now Build And Install AdaptiveCpp
// * https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/installing.md#a-standard-installation
// * Run `acpp-info` and make sure you get output similar to what `rocminfo`
//   gave you
//
// Install nvtop to monitor GPU usage and make sure this is doing what you want.
//
// To Compare with GCC
// * install libtbb-dev
//
// Theoretically you are ready to go now?!
//
//
// To compile with all optimizations and parallel std lib support enabled:
//
// ```sh
// # AdaptiveCpp
// acpp -std=c++23 ./game_of_life.cpp -O3 -march=native --acpp-stdpar
//
// # gcc/clang. If you don't have tbb installed/linked it falls back to single
// # threaded silently
// g++ -std=c++23 ./game_of_life.cpp -O3 -march=native -ltbb
// clang++ -std=c++23 ./game_of_life.cpp -O3 -march=native -ltbb
//
// # Depending on clang version you might need to add -fexperimental-library
// ```
//
// Run, watch nvtop, htop, run with /usr/bin/time to see total CPU utilization,
// etc and see how it scales on your platform

// Handy modulo operator that wraps around automatically.
//
// Unlike the built-in `%`, whose result takes the sign of the dividend, this
// returns a value in [0, divisor) for any positive divisor, so -1 maps to
// divisor - 1. The arithmetic is done in the usual promoted type of the two
// operands, which is also the return type.
template <typename Dividend, typename Divisor>
[[nodiscard]] constexpr auto floor_modulo(Dividend dividend, Divisor divisor) {
  return ((dividend % divisor) + divisor) % divisor;
}

// This is probably unnecessary, but the min_int
// utilities exist to make the `Point` type as compact as possible
// so that we only use int16 if that's all we need, for example
//
// Returns a value-initialized object of the narrowest signed fixed-width
// integer type (int8 .. int64) whose maximum is >= `value`. Only the return
// *type* is interesting; see min_int_t below.
template <auto value> constexpr auto min_int() {
  if constexpr (value <= std::numeric_limits<std::int8_t>::max()) {
    return std::int8_t{};
  } else if constexpr (value <= std::numeric_limits<std::int16_t>::max()) {
    return std::int16_t{};
  } else if constexpr (value <= std::numeric_limits<std::int32_t>::max()) {
    return std::int32_t{};
  } else {
    return std::int64_t{};
  }
}

// The narrowest signed integer type able to represent `value`.
template <auto value> using min_int_t = decltype(min_int<value>());

// templated on size mostly to give the compiler extra hints
// about the code, so it knows what it
can unroll, etc. +template struct GameBoard { + // These are the properly sized things necessary to hold coordinates + // that work with this particular size of board + using x_index_t = min_int_t; + using y_index_t = min_int_t; + + static constexpr x_index_t width = Width; + static constexpr y_index_t height = Height; + + std::array data; + + struct Point { + x_index_t x; + y_index_t y; + [[nodiscard]] constexpr Point operator+(Point rhs) const { + return Point{static_cast(x + rhs.x), + static_cast(y + rhs.y)}; + } + }; + + // The 8 relative positions for neighbors for a given point + constexpr static std::array neighbors{ + Point{-1, -1}, Point{0, -1}, Point{1, -1}, Point{-1, 0}, + Point{1, 0}, Point{-1, 1}, Point{0, 1}, Point{1, 1}}; + + // Takes the input point, wraps it veritcally/horizontally and takes + // the new location and maps that to the linear address of the point + // in the underlying array + [[nodiscard]] constexpr static std::size_t index(Point p) { + return static_cast(floor_modulo(p.y, height) * width + + floor_modulo(p.x, width)); + } + + [[nodiscard]] constexpr bool operator[](Point p) const noexcept { + return data[index(p)]; + } + + constexpr void set(Point p) noexcept { data[index(p)] = true; } + + [[nodiscard]] constexpr std::size_t count_neighbors(Point p) const { + return static_cast( + std::count_if(neighbors.begin(), neighbors.end(), + [&](auto offset) { return (*this)[p + offset]; })); + } + + // Pre-compute all of the Point coordinates that exist in this particular + // gameboard. We use this later to iterate over every location in the + // gameboard. 
+ [[nodiscard]] static auto make_indexes() { + auto result = std::make_unique>(); + + std::size_t output_index = 0; + + for (y_index_t y = 0; y < height; ++y) { + for (x_index_t x = 0; x < width; ++x) { + (*result)[output_index] = Point{x, y}; + ++output_index; + } + } + return result; + }; + + // https://en.wikipedia.org/wiki/Conway's_Game_of_Life#Examples_of_patterns + + // Add a glider at a given location on the game board + constexpr void add_glider(Point p) { + set(p); + set(p + Point{1, 1}); + set(p + Point{2, 1}); + set(p + Point{0, 2}); + set(p + Point{1, 2}); + } +}; + +template +constexpr void iterate_board(const BoardType &input, BoardType &output, + auto &indices) { + + const auto rules = [&](const auto &index) { + const auto neighbor_count = input.count_neighbors(index); + const auto is_alive = input[index]; + + if (is_alive) { + if (neighbor_count < 2) { + return false; + } else if (neighbor_count <= 3) { + return true; + } else { + return false; + } + } else { + if (neighbor_count == 3) { + return true; + } else { + return false; + } + } + + return true; + }; + + if consteval { + std::transform(indices.begin(), indices.end(), output.data.begin(), rules); + } else { + // std::execution::par_unseq to tell the runtime that it can parallelize + // this + std::transform(std::execution::par_unseq, indices.begin(), indices.end(), + output.data.begin(), rules); + } +} + +struct Timer { + std::string m_event; + std::chrono::steady_clock::time_point m_start_time; + Timer(std::string event) : m_event(std::move(event)) { + std::cout << m_event << " timing started\n"; + // get start time at last moment possible + m_start_time = std::chrono::steady_clock::now(); + } + + ~Timer() { + const std::chrono::duration milliseconds = + std::chrono::steady_clock::now() - m_start_time; + std::cout << m_event << " timing ended " << milliseconds << "\n"; + } + + auto seconds() const { + return std::chrono::duration(std::chrono::steady_clock::now() - + m_start_time); + } +}; + 
+template auto print_board(const BoardType &board) { + for (int y = 0; y < board.height; ++y) { + for (int x = 0; x < board.width; ++x) { + if (board[typename BoardType::Point(x, y)]) { + putchar('*'); + } else { + putchar(' '); + } + } + putchar('\n'); + } +} + +template +void run_board() { + using board_type = GameBoard; + + std::cout << "Running Time Test: " << Width << "x" << Height << " " + << Iterations << " iterations. " << Width * Height * Iterations + << " cell computations\n"; + + // I would consider putting these on the stack, but the GPU engine + // requires pointers that it knows how to work with. With AdaptiveCpp + // it swaps out malloc and owns these pointers in a way that can be used + // with the GPU automagically + + std::optional timer{"Setup"}; + auto board1 = std::make_unique(); + board1->add_glider(typename board_type::Point(1, 3)); + board1->add_glider(typename board_type::Point(10, 1)); + auto board2 = std::make_unique(); + + const auto indices = board_type::make_indexes(); + timer.reset(); // show how long setup took + + { + Timer timer2{"Running Board"}; + for (int i = 0; i < Iterations; ++i) { + // just swapping buffers back and forth + iterate_board(*board1, *board2, *indices); + std::swap(board1, board2); + } + std::cout << "Cells Per Second: " + << static_cast(Width * Height * Iterations) / + timer2.seconds().count() + << '\n'; + } + + if (Width <= 100 && Height <= 30) { + print_board(*board1); + } else { + // this exists solely to make sure the compiler doesn't optimize out the + // actual work + if ((*board1)[typename board_type::Point(0, 0)]) { + std::cout << "0,0 is Set!\n"; + } else { + std::cout << "0,0 is Not Set!\n"; + } + } +} + +int main() { + run_board<10, 10, 5'000'000>(); + run_board<100, 10, 500'000>(); + run_board<100, 100, 50'000>(); + run_board<100, 1000, 5'000>(); + run_board<1000, 1000, 500>(); + run_board<10000, 1000, 50>(); + run_board<10000, 10000, 5>(); +}