From 3e9274b6f91d7f3817f29fd8d539e068481931ce Mon Sep 17 00:00:00 2001
From: Jason Turner <jason@emptycrate.com>
Date: Sat, 30 Mar 2024 08:18:49 -0600
Subject: [PATCH] Create game_of_life.cpp for AdaptiveCpp example

---
 parallel_algorithms/game_of_life.cpp | 302 +++++++++++++++++++++++++++
 1 file changed, 302 insertions(+)
 create mode 100644 parallel_algorithms/game_of_life.cpp
diff --git a/parallel_algorithms/game_of_life.cpp b/parallel_algorithms/game_of_life.cpp
new file mode 100644
index 0000000..8af3033
--- /dev/null
+++ b/parallel_algorithms/game_of_life.cpp
@@ -0,0 +1,302 @@
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <cstdint>
+#include <execution>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <optional>
+
+// This is a simple conway's game-of-life implementation
+// that is constexpr friendly and can work as a benchmark
+// for parallel computation models in C++
+//
+// Notes I learned along the way while learning AdaptiveCpp
+//
+// AMD GPU Install notes:
+//  * AMD focuses on LTS ubuntu releases, if you have a different release,
+//  expect a little pain
+//  * I had good luck installing the AMDGPU Installer option here:
+//    https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#amdgpu-ubuntu
+//  * The rocm-gdb package would not install on my OS because of some outdated
+//  dependencies
+//  * The amdgpu-install tool will set up the apt repositories that you need
+//  * If your OS is fully supported, just install the copy level package
+//  * Honestly, I just kept installing random ROCm packages until I got things
+//  working,
+//    which was I think everything except for the gdb package that I could not
+//    install
+//
+//     After You've Installed ROCm
+//      * add yourself to the render group
+//      * consider rebooting probably
+//      * run `rocminfo` and make sure it sees your GPUs
+//
+// Other GPUs:
+//  * I have no input here
+//
+// Use the "automatic installation script" to install llvm >= 14
+//  * https://apt.llvm.org/
+//  * You probably want to install "all"
+//   ```sh
+//   wget https://apt.llvm.org/llvm.sh
+//   chmod +x llvm.sh
+//   sudo ./llvm.sh <version number> all
+//   ```
+//
+// Now Build And Install AdaptiveCpp
+//  * https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/installing.md#a-standard-installation
+//  * Run `acpp-info` and make sure you get output similar to what `rocminfo`
+//  gave you
+//
+// Install nvtop to monitor GPU usage and make sure this is doing what you want.
+//
+// To Compare with GCC
+//   * install libttb-dev
+//
+// Theoretically you are ready to go now?!
+//
+//
+// To compile with all optimizations and parallel std lib support enabled:
+//
+// ```sh
+// # AdaptiveCpp
+// acpp -std=c++23 ./game_of_life.cpp -O3 -march=native --acpp-stdpar
+//
+// # gcc/clang. If you don't have ttb installed/linked it falls back to single
+// threaded silently g++ -std=c++23 ./game_of_life.cpp -O3 -march=native -lttb
+// clang++ -std=c++23 ./game_of_life.cpp -O3 -march=native -lttb
+//
+// # Depending on clang version you might need to add -fexperimental-library
+// ```
+//
+// Run, watch nvtop, htop, run with /usr/bin/time to see total CPU utilization,
+// etc and see how it scales on your platform
+
+// Handy modulo operator that wraps around automatically
+[[nodiscard]] constexpr auto floor_modulo(auto dividend, auto divisor) {
+  return ((dividend % divisor) + divisor) % divisor;
+}
+
+// This is probably unnecessary, but the min_int
+// utilities exist to make the `Point` type as compact as possible
+// so that we only use int16 if that's all we need, for example
+template <std::size_t value> auto min_int() {
+  if constexpr (value <= std::numeric_limits<std::int8_t>::max()) {
+    return std::int8_t{};
+  } else if constexpr (value <= std::numeric_limits<std::int16_t>::max()) {
+    return std::int16_t{};
+  } else if constexpr (value <= std::numeric_limits<std::int32_t>::max()) {
+    return std::int32_t{};
+  } else {
+    return std::int64_t{};
+  }
+}
+
+template <std::size_t value> using min_int_t = decltype(min_int<value>());
+
+// templated on size mostly to give the compiler extra hints
+// about the code, so it knows what it can unroll, etc.
+template <std::size_t Width, std::size_t Height> struct GameBoard {
+  // These are the properly sized things necessary to hold coordinates
+  // that work with this particular size of board
+  using x_index_t = min_int_t<Width>;
+  using y_index_t = min_int_t<Height>;
+
+  static constexpr x_index_t width = Width;
+  static constexpr y_index_t height = Height;
+
+  std::array<bool, Width * Height> data;
+
+  struct Point {
+    x_index_t x;
+    y_index_t y;
+    [[nodiscard]] constexpr Point operator+(Point rhs) const {
+      return Point{static_cast<x_index_t>(x + rhs.x),
+                   static_cast<y_index_t>(y + rhs.y)};
+    }
+  };
+
+  // The 8 relative positions for neighbors for a given point
+  constexpr static std::array<Point, 8> neighbors{
+      Point{-1, -1}, Point{0, -1}, Point{1, -1}, Point{-1, 0},
+      Point{1, 0},   Point{-1, 1}, Point{0, 1},  Point{1, 1}};
+
+  // Takes the input point, wraps it veritcally/horizontally and takes
+  // the new location and maps that to the linear address of the point
+  // in the underlying array
+  [[nodiscard]] constexpr static std::size_t index(Point p) {
+    return static_cast<std::size_t>(floor_modulo(p.y, height) * width +
+                                    floor_modulo(p.x, width));
+  }
+
+  [[nodiscard]] constexpr bool operator[](Point p) const noexcept {
+    return data[index(p)];
+  }
+
+  constexpr void set(Point p) noexcept { data[index(p)] = true; }
+
+  [[nodiscard]] constexpr std::size_t count_neighbors(Point p) const {
+    return static_cast<std::size_t>(
+        std::count_if(neighbors.begin(), neighbors.end(),
+                      [&](auto offset) { return (*this)[p + offset]; }));
+  }
+
+  // Pre-compute all of the Point coordinates that exist in this particular
+  // gameboard. We use this later to iterate over every location in the
+  // gameboard.
+  [[nodiscard]] static auto make_indexes() {
+    auto result = std::make_unique<std::array<Point, Width * Height>>();
+
+    std::size_t output_index = 0;
+
+    for (y_index_t y = 0; y < height; ++y) {
+      for (x_index_t x = 0; x < width; ++x) {
+        (*result)[output_index] = Point{x, y};
+        ++output_index;
+      }
+    }
+    return result;
+  };
+
+  // https://en.wikipedia.org/wiki/Conway's_Game_of_Life#Examples_of_patterns
+
+  // Add a glider at a given location on the game board
+  constexpr void add_glider(Point p) {
+    set(p);
+    set(p + Point{1, 1});
+    set(p + Point{2, 1});
+    set(p + Point{0, 2});
+    set(p + Point{1, 2});
+  }
+};
+
+template <typename BoardType>
+constexpr void iterate_board(const BoardType &input, BoardType &output,
+                             auto &indices) {
+
+  const auto rules = [&](const auto &index) {
+    const auto neighbor_count = input.count_neighbors(index);
+    const auto is_alive = input[index];
+
+    if (is_alive) {
+      if (neighbor_count < 2) {
+        return false;
+      } else if (neighbor_count <= 3) {
+        return true;
+      } else {
+        return false;
+      }
+    } else {
+      if (neighbor_count == 3) {
+        return true;
+      } else {
+        return false;
+      }
+    }
+
+    return true;
+  };
+
+  if consteval {
+    std::transform(indices.begin(), indices.end(), output.data.begin(), rules);
+  } else {
+    // std::execution::par_unseq to tell the runtime that it can parallelize
+    // this
+    std::transform(std::execution::par_unseq, indices.begin(), indices.end(),
+                   output.data.begin(), rules);
+  }
+}
+
+struct Timer {
+  std::string m_event;
+  std::chrono::steady_clock::time_point m_start_time;
+  Timer(std::string event) : m_event(std::move(event)) {
+    std::cout << m_event << " timing started\n";
+    // get start time at last moment possible
+    m_start_time = std::chrono::steady_clock::now();
+  }
+
+  ~Timer() {
+    const std::chrono::duration<double, std::milli> milliseconds =
+        std::chrono::steady_clock::now() - m_start_time;
+    std::cout << m_event << " timing ended " << milliseconds << "\n";
+  }
+
+  auto seconds() const {
+    return std::chrono::duration<double>(std::chrono::steady_clock::now() -
+                                         m_start_time);
+  }
+};
+
+template <typename BoardType> auto print_board(const BoardType &board) {
+  for (int y = 0; y < board.height; ++y) {
+    for (int x = 0; x < board.width; ++x) {
+      if (board[typename BoardType::Point(x, y)]) {
+        putchar('*');
+      } else {
+        putchar(' ');
+      }
+    }
+    putchar('\n');
+  }
+}
+
+template <std::size_t Width, std::size_t Height, std::size_t Iterations>
+void run_board() {
+  using board_type = GameBoard<Width, Height>;
+
+  std::cout << "Running Time Test: " << Width << "x" << Height << " "
+            << Iterations << " iterations. " << Width * Height * Iterations
+            << " cell computations\n";
+
+  // I would consider putting these on the stack, but the GPU engine
+  // requires pointers that it knows how to work with. With AdaptiveCpp
+  // it swaps out malloc and owns these pointers in a way that can be used
+  // with the GPU automagically
+
+  std::optional<Timer> timer{"Setup"};
+  auto board1 = std::make_unique<board_type>();
+  board1->add_glider(typename board_type::Point(1, 3));
+  board1->add_glider(typename board_type::Point(10, 1));
+  auto board2 = std::make_unique<board_type>();
+
+  const auto indices = board_type::make_indexes();
+  timer.reset(); // show how long setup took
+
+  {
+    Timer timer2{"Running Board"};
+    for (int i = 0; i < Iterations; ++i) {
+      // just swapping buffers back and forth
+      iterate_board(*board1, *board2, *indices);
+      std::swap(board1, board2);
+    }
+    std::cout << "Cells Per Second: "
+              << static_cast<double>(Width * Height * Iterations) /
+                     timer2.seconds().count()
+              << '\n';
+  }
+
+  if (Width <= 100 && Height <= 30) {
+    print_board(*board1);
+  } else {
+    // this exists solely to make sure the compiler doesn't optimize out the
+    // actual work
+    if ((*board1)[typename board_type::Point(0, 0)]) {
+      std::cout << "0,0 is Set!\n";
+    } else {
+      std::cout << "0,0 is Not Set!\n";
+    }
+  }
+}
+
+int main() {
+  run_board<10, 10, 5'000'000>();
+  run_board<100, 10, 500'000>();
+  run_board<100, 100, 50'000>();
+  run_board<100, 1000, 5'000>();
+  run_board<1000, 1000, 500>();
+  run_board<10000, 1000, 50>();
+  run_board<10000, 10000, 5>();
+}