From ab3a275dee3cb901c71fe95a9ed4ba7d54cc465a Mon Sep 17 00:00:00 2001
From: Juniper Tyree <50025784+juntyr@users.noreply.github.com>
Date: Mon, 20 May 2024 18:13:14 +0300
Subject: [PATCH] Upgrade to async-capable (for kernel launches and memory
 transfers) rust-cuda (#276)

* Initial progress towards using rust-cuda with async

Some progress

Further async integration progress, rustcoalescence fails to compile

Some progress with dispatch coercion

Small cleanup

Cleanup cuda algorithm coercion

Some more cleanup

Add back missing Backup for SeaHash and WyHash rngs

Fix CUDA kernel extraneous pub exports

Minor improvement of the event buffer hack

Remove unused control_flow_enum feature

Revert Copy for [Indexed]Location

Revert new clone

Update to rust-cuda with async kernel launch async return

Update to latest rust-cuda

Fix rustfmt

Temporary fix to allow CUDA algorithm linking

Small cleanup, mostly of unused clippy allows

Small improvement to CUDA EventBuffer

Try trait-based kernel signature check

Update rust-toolchain

Fix clippy lints

Try with const match instead

Try with memcmp intrinsic

Try out experimental const-type-layout with compression

Try interning all const layout strings

Try check

Try check again

* Small fixes after rebase

* Use cuda with (hopefully) much faster type checks

* Try with llvm-bitcode-linker

* Upgrade to latest ptx-builder

* Fix codecov action

* Fix RUSTFLAGS merging for ptx kernels in codecov CI

https://blog.rust-lang.org/inside-rust/2023/08/24/cargo-config-merging.html
https://doc.rust-lang.org/cargo/reference/config.html#command-line-overrides

* Update const-type-layout
---
 .github/workflows/ci.yml                      |  21 -
 .github/workflows/coverage.yml                |  35 +-
 .github/workflows/rustdoc.yml                 |   7 -
 .gitpod.Dockerfile                            |  18 +-
 Cargo.lock                                    | 387 ++++++++++++++---
 README.md                                     |   5 -
 necsim/core/Cargo.toml                        |   6 +-
 necsim/core/bond/Cargo.toml                   |   2 +-
 necsim/core/bond/src/closed_open_unit_f64.rs  |   1 +
 necsim/core/bond/src/closed_unit_f64.rs       |   1 +
 necsim/core/bond/src/non_negative_f64.rs      |   1 +
 necsim/core/bond/src/non_positive_f64.rs      |   1 +
 necsim/core/bond/src/open_closed_unit_f64.rs  |   1 +
 necsim/core/bond/src/positive_f64.rs          |   1 +
 necsim/core/src/cogs/coalescence_sampler.rs   |   1 -
 necsim/core/src/event.rs                      |   1 -
 necsim/core/src/landscape/extent.rs           |   6 +-
 necsim/core/src/landscape/location.rs         |  14 +-
 necsim/core/src/landscape/mod.rs              |   2 +-
 necsim/core/src/lib.rs                        |   1 -
 necsim/core/src/lineage.rs                    |  13 +-
 necsim/core/src/reporter/boolean.rs           |   2 +-
 necsim/core/src/reporter/mod.rs               |   6 +-
 necsim/core/src/simulation/builder.rs         |   2 +-
 necsim/core/src/simulation/mod.rs             |  12 +-
 necsim/impls/cuda/Cargo.toml                  |   6 +-
 necsim/impls/cuda/src/cogs/maths.rs           |  11 +-
 necsim/impls/cuda/src/cogs/rng.rs             |  47 +-
 necsim/impls/cuda/src/event_buffer.rs         | 199 ++++++---
 necsim/impls/cuda/src/utils.rs                |   2 +-
 necsim/impls/cuda/src/value_buffer.rs         |  49 ++-
 necsim/impls/no-std/Cargo.toml                |   6 +-
 necsim/impls/no-std/src/alias/mod.rs          |  11 +-
 necsim/impls/no-std/src/array2d.rs            |   4 +-
 .../independent/event_time_sampler/const.rs   |   2 +-
 .../independent/event_time_sampler/exp.rs     |   2 +-
 .../independent/event_time_sampler/fixed.rs   |   2 +-
 .../event_time_sampler/geometric.rs           |   2 +-
 .../independent/event_time_sampler/poisson.rs |   2 +-
 .../active_lineage_sampler/independent/mod.rs |   7 +-
 .../cogs/coalescence_sampler/independent.rs   |   2 +-
 .../almost_infinite_clark2dt.rs               |   2 +-
 .../almost_infinite_normal.rs                 |   2 +-
 .../in_memory/packed_alias/mod.rs             |   2 +-
 .../src/cogs/dispersal_sampler/non_spatial.rs |   2 +-
 .../dispersal_sampler/spatially_implicit.rs   |   2 +-
 .../cogs/dispersal_sampler/trespassing/mod.rs |   2 +-
 .../dispersal_sampler/trespassing/uniform.rs  |   2 +-
 .../cogs/dispersal_sampler/wrapping_noise.rs  |   2 +-
 .../no-std/src/cogs/emigration_exit/never.rs  |   2 +-
 .../src/cogs/event_sampler/independent.rs     |   9 +-
 .../no-std/src/cogs/event_sampler/tracking.rs |   2 +
 .../src/cogs/habitat/almost_infinite.rs       |   2 +-
 .../no-std/src/cogs/habitat/in_memory.rs      |   3 +-
 .../no-std/src/cogs/habitat/non_spatial.rs    |   3 +-
 .../src/cogs/habitat/spatially_implicit.rs    |   2 +-
 .../src/cogs/habitat/wrapping_noise/mod.rs    |   2 +-
 .../src/cogs/immigration_entry/never.rs       |   2 +-
 .../src/cogs/lineage_store/independent.rs     |   2 +-
 .../impls/no-std/src/cogs/maths/intrinsics.rs |   2 -
 necsim/impls/no-std/src/cogs/rng/seahash.rs   |   2 +-
 necsim/impls/no-std/src/cogs/rng/wyhash.rs    |   2 +-
 .../spatially_implicit.rs                     |   2 +-
 .../cogs/speciation_probability/uniform.rs    |   2 +-
 .../src/cogs/turnover_rate/in_memory.rs       |   2 +-
 .../no-std/src/cogs/turnover_rate/uniform.rs  |   2 +-
 .../independent/individuals.rs                |   2 +-
 .../parallelisation/independent/landscape.rs  |   2 +-
 .../independent/monolithic/mod.rs             |   2 +-
 .../parallelisation/monolithic/averaging.rs   |   2 +-
 .../parallelisation/monolithic/lockstep.rs    |   4 +-
 .../parallelisation/monolithic/monolithic.rs  |   2 +-
 .../parallelisation/monolithic/optimistic.rs  |   2 +-
 .../monolithic/optimistic_lockstep.rs         |   6 +-
 .../src/event_log/replay/sorted_segments.rs   |   1 +
 necsim/partitioning/mpi/src/partition/mod.rs  |   4 +-
 necsim/plugins/core/src/import/combinator.rs  |   4 +-
 rust-toolchain                                |   2 +-
 rustcoalescence/algorithms/cuda/Cargo.toml    |   2 +-
 .../algorithms/cuda/cpu-kernel/Cargo.toml     |   2 +-
 .../algorithms/cuda/cpu-kernel/src/lib.rs     | 177 +-------
 .../algorithms/cuda/cpu-kernel/src/link.rs    | 107 +----
 .../algorithms/cuda/cpu-kernel/src/patch.rs   | 402 +++---------------
 .../cuda/gpu-kernel/.cargo/config.toml        |   2 +-
 .../algorithms/cuda/gpu-kernel/Cargo.toml     |   6 +-
 .../algorithms/cuda/gpu-kernel/src/lib.rs     | 142 +++----
 rustcoalescence/algorithms/cuda/src/cuda.rs   |   4 +-
 rustcoalescence/algorithms/cuda/src/error.rs  |   2 +-
 rustcoalescence/algorithms/cuda/src/info.rs   |   2 +-
 .../algorithms/cuda/src/initialiser/fixup.rs  |  26 +-
 .../cuda/src/initialiser/genesis.rs           |  24 +-
 .../algorithms/cuda/src/initialiser/mod.rs    |  26 +-
 .../algorithms/cuda/src/initialiser/resume.rs |  26 +-
 rustcoalescence/algorithms/cuda/src/launch.rs | 163 ++++---
 rustcoalescence/algorithms/cuda/src/lib.rs    | 291 +++++++------
 .../cuda/src/parallelisation/monolithic.rs    | 193 ++++-----
 .../gillespie/src/event_skipping/mod.rs       |  30 +-
 .../gillespie/src/gillespie/classical/mod.rs  |  22 +-
 .../algorithms/gillespie/src/gillespie/mod.rs |   3 +
 .../gillespie/src/gillespie/turnover/mod.rs   |  29 +-
 .../algorithms/independent/src/lib.rs         |  31 +-
 rustcoalescence/algorithms/src/lib.rs         |  17 +-
 .../dispatch/valid/algorithm_scenario.rs      |  52 ++-
 .../src/cli/simulate/dispatch/valid/info.rs   |  16 +-
 .../src/cli/simulate/dispatch/valid/launch.rs |  14 +-
 .../src/cli/simulate/dispatch/valid/rng.rs    |  14 +-
 106 files changed, 1380 insertions(+), 1438 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4241206c0..1451b64a5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -32,13 +32,6 @@ jobs:
         run: |
           sudo apt-get update -q
           sudo apt-get install libopenmpi-dev -y --no-install-recommends
-
-      - name: Install the rust-ptx-linker
-        run: |
-          wget https://apt.llvm.org/llvm.sh && chmod +x llvm.sh
-          sudo ./llvm.sh $(rustc --version -v | grep -oP "LLVM version: \K\d+")
-          rm llvm.sh
-          cargo install rust-ptx-linker --git https://github.com/juntyr/rust-ptx-linker --force
       
       - name: Check the default features
         run: cargo check
@@ -64,13 +57,6 @@ jobs:
         run: |
           sudo apt-get update -q
           sudo apt-get install libopenmpi-dev -y --no-install-recommends
-
-      - name: Install the rust-ptx-linker
-        run: |
-          wget https://apt.llvm.org/llvm.sh && chmod +x llvm.sh
-          sudo ./llvm.sh $(rustc --version -v | grep -oP "LLVM version: \K\d+")
-          rm llvm.sh
-          cargo install rust-ptx-linker --git https://github.com/juntyr/rust-ptx-linker --force
       
       - name: Run the test-suite
         run: cargo test --workspace --no-fail-fast
@@ -104,13 +90,6 @@ jobs:
           sudo apt-get update -q
           sudo apt-get install libopenmpi-dev -y --no-install-recommends
 
-      - name: Install the rust-ptx-linker
-        run: |
-          wget https://apt.llvm.org/llvm.sh && chmod +x llvm.sh
-          sudo ./llvm.sh $(rustc --version -v | grep -oP "LLVM version: \K\d+")
-          rm llvm.sh
-          cargo install rust-ptx-linker --git https://github.com/juntyr/rust-ptx-linker --force
-
       - name: Check the code style for the default features
         run: cargo clippy -- -D warnings
       
diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
index 101d07515..6bcc5138b 100644
--- a/.github/workflows/coverage.yml
+++ b/.github/workflows/coverage.yml
@@ -31,26 +31,26 @@ jobs:
           sudo apt-get update -q
           sudo apt-get install libopenmpi-dev -y --no-install-recommends
 
-      - name: Install the Rust toolchain
+      - name: Generate the coverage data
         run: |
-          cargo install grcov --force --locked
-          rustup component add llvm-tools-preview
-
-      - name: Install the rust-ptx-linker
+          cargo clean
+          cargo \
+            --config "target.'cfg(all())'.rustflags=['-Cinstrument-coverage']" \
+            test --workspace --all-targets
+        env:
+          CARGO_INCREMENTAL: 0
+          RUSTDOCFLAGS: -Cinstrument-coverage
+          LLVM_PROFILE_FILE: coverage/coverage-%p-%m.profraw
+      
+      - name: Download grcov
         run: |
-          wget https://apt.llvm.org/llvm.sh && chmod +x llvm.sh
-          sudo ./llvm.sh $(rustc --version -v | grep -oP "LLVM version: \K\d+")
-          rm llvm.sh
-          cargo install rust-ptx-linker --git https://github.com/juntyr/rust-ptx-linker --force
-
-      - name: Generate the code coverage
+          curl -sL https://github.com/mozilla/grcov/releases/download/v0.8.18/grcov-x86_64-unknown-linux-gnu.tar.bz2 | tar jxf -
+          chmod +x ./grcov
+      
+      - name: Generate the coverage reports
         run: |
-          RUSTFLAGS="-Cinstrument-coverage" \
-          LLVM_PROFILE_FILE="codecov-%p-%m.profraw" \
-          cargo test --workspace
-
-          grcov . -s . --binary-path ./target/debug/ \
-            -t cobertura -o cobertura.xml --branch \
+          ./grcov . -s . --binary-path ./target/debug/deps \
+            -t lcov -o coverage.lcov --branch \
             --keep-only "necsim/*" \
             --keep-only "rustcoalescence/*" \
             --ignore-not-existing \
@@ -62,4 +62,5 @@ jobs:
         uses: codecov/codecov-action@v1
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
+          files: coverage.lcov
           fail_ci_if_error: true
diff --git a/.github/workflows/rustdoc.yml b/.github/workflows/rustdoc.yml
index 55a80c515..b03fe3334 100644
--- a/.github/workflows/rustdoc.yml
+++ b/.github/workflows/rustdoc.yml
@@ -21,13 +21,6 @@ jobs:
           sudo apt-get update -q
           sudo apt-get install libopenmpi-dev -y --no-install-recommends
 
-      - name: Install the rust-ptx-linker
-        run: |
-          wget https://apt.llvm.org/llvm.sh && chmod +x llvm.sh
-          sudo ./llvm.sh $(rustc --version -v | grep -oP "LLVM version: \K\d+")
-          rm llvm.sh
-          cargo install rust-ptx-linker --git https://github.com/juntyr/rust-ptx-linker --force
-
       - name: Build the Documentation
         run: |
           RUSTDOCFLAGS="\
diff --git a/.gitpod.Dockerfile b/.gitpod.Dockerfile
index 8f03bc5a6..ba1ab8901 100644
--- a/.gitpod.Dockerfile
+++ b/.gitpod.Dockerfile
@@ -8,15 +8,14 @@ RUN echo "debconf debconf/frontend select Noninteractive" | sudo debconf-set-sel
     echo "keyboard-configuration keyboard-configuration/layout select 'English (US)'" | sudo debconf-set-selections && \
     echo "keyboard-configuration keyboard-configuration/layoutcode select 'us'" | sudo debconf-set-selections && \
     echo "resolvconf resolvconf/linkify-resolvconf boolean false" | sudo debconf-set-selections && \
-    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin && \
-    sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
-    sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub && \
-    sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /" && \
+    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb -O cuda_keyring.deb && \
+    sudo dpkg -i cuda_keyring.deb && \
+    rm cuda_keyring.deb && \
+    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
+    sudo mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
+    sudo add-apt-repository deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ / && \
     sudo apt-get update -q && \
-    sudo apt-get install cuda -y --no-install-recommends && \
-    wget https://apt.llvm.org/llvm.sh && chmod +x llvm.sh && \
-    sudo ./llvm.sh $(rustc --version -v | grep -oP "LLVM version: \K\d+") && \
-    rm llvm.sh && \
+    sudo apt-get install cuda-12-3 -y --no-install-recommends && \
     sudo apt-get clean autoclean && \
     sudo apt-get autoremove -y && \
     sudo rm -rf /var/lib/{apt,dpkg,cache,log}/
@@ -31,6 +30,5 @@ RUN sudo apt-get update -q && \
     sudo apt-get autoremove -y && \
     sudo rm -rf /var/lib/{apt,dpkg,cache,log}/
 
-RUN cargo install rust-ptx-linker --git https://github.com/juntyr/rust-ptx-linker --force && \
-    cargo install cargo-mpirun --force && \
+RUN cargo install cargo-mpirun --force && \
     cargo install cargo-reaper --git https://github.com/juntyr/grim-reaper --force
diff --git a/Cargo.lock b/Cargo.lock
index 9cda7c632..aa89df4c6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -175,7 +175,7 @@ dependencies = [
  "regex",
  "rustc-hash",
  "shlex",
- "syn 2.0.64",
+ "syn 2.0.65",
  "which",
 ]
 
@@ -236,7 +236,7 @@ checksum = "4da9a32f3fed317401fa3c862968128267c3106685286e15d5aaa3d7389c2f60"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.64",
+ "syn 2.0.65",
 ]
 
 [[package]]
@@ -280,9 +280,9 @@ dependencies = [
 
 [[package]]
 name = "cc"
-version = "1.0.97"
+version = "1.0.98"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "099a5357d84c4c61eb35fc8eafa9a79a902c2f76911e5747ced4e032edd8d9b4"
+checksum = "41c270e7540d725e65ac7f1b212ac8ce349719624d7bcff99f8e2e488e8cf03f"
 dependencies = [
  "jobserver",
  "libc",
@@ -355,7 +355,7 @@ dependencies = [
  "heck",
  "proc-macro2",
  "quote",
- "syn 2.0.64",
+ "syn 2.0.65",
 ]
 
 [[package]]
@@ -382,9 +382,9 @@ dependencies = [
 
 [[package]]
 name = "const-type-layout"
-version = "0.3.0"
+version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0de72aa3d2f0cfa2d220013066afd32a9e716447c74205d9c2c5944b2cac37f3"
+checksum = "5daceeb879dcbf74fb11d2aba295197eccecaae7b65e19698a3540d53d7345da"
 dependencies = [
  "const-type-layout-derive",
 ]
@@ -423,9 +423,9 @@ dependencies = [
 
 [[package]]
 name = "crc32fast"
-version = "1.4.0"
+version = "1.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b3855a8a784b474f333699ef2bbca9db2c4a1f6d9088a90a2d25b1eb53111eaa"
+checksum = "58ebf8d6963185c7625d2c3c3962d99eb8936637b1427536d21dc36ae402ebad"
 dependencies = [
  "cfg-if",
 ]
@@ -505,7 +505,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "strsim 0.11.1",
- "syn 2.0.64",
+ "syn 2.0.65",
 ]
 
 [[package]]
@@ -527,7 +527,7 @@ checksum = "733cabb43482b1a1b53eee8583c2b9e8684d592215ea83efd305dd31bc2f0178"
 dependencies = [
  "darling_core 0.20.9",
  "quote",
- "syn 2.0.64",
+ "syn 2.0.65",
 ]
 
 [[package]]
@@ -538,7 +538,7 @@ checksum = "4e018fccbeeb50ff26562ece792ed06659b9c2dae79ece77c4456bb10d9bf79b"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.64",
+ "syn 2.0.65",
 ]
 
 [[package]]
@@ -580,7 +580,7 @@ dependencies = [
  "darling 0.20.9",
  "proc-macro2",
  "quote",
- "syn 2.0.64",
+ "syn 2.0.65",
 ]
 
 [[package]]
@@ -600,7 +600,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b"
 dependencies = [
  "derive_builder_core 0.20.0",
- "syn 2.0.64",
+ "syn 2.0.65",
 ]
 
 [[package]]
@@ -611,7 +611,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.64",
+ "syn 2.0.65",
 ]
 
 [[package]]
@@ -626,6 +626,12 @@ version = "1.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b"
 
+[[package]]
+name = "equivalent"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
+
 [[package]]
 name = "erased-serde"
 version = "0.4.5"
@@ -670,6 +676,15 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1912868bad388722991f80323855d922e32b09ad00d76a13a98e465358765079"
 
+[[package]]
+name = "find_cuda_helper"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f9f9e65c593dd01ac77daad909ea4ad17f0d6d1776193fc8ea766356177abdad"
+dependencies = [
+ "glob",
+]
+
 [[package]]
 name = "findshlibs"
 version = "0.10.2"
@@ -704,6 +719,19 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ee1b05cbd864bcaecbd3455d6d967862d446e4ebfc3c2e5e5b9841e53cba6673"
 
+[[package]]
+name = "generator"
+version = "0.7.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5cc16584ff22b460a382b7feec54b23d2908d858152e5739a120b949293bd74e"
+dependencies = [
+ "cc",
+ "libc",
+ "log",
+ "rustversion",
+ "windows",
+]
+
 [[package]]
 name = "getrandom"
 version = "0.2.15"
@@ -734,7 +762,7 @@ checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.64",
+ "syn 2.0.65",
 ]
 
 [[package]]
@@ -811,6 +839,16 @@ version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
 
+[[package]]
+name = "indexmap"
+version = "2.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26"
+dependencies = [
+ "equivalent",
+ "hashbrown",
+]
+
 [[package]]
 name = "is_terminal_polyfill"
 version = "1.70.0"
@@ -912,6 +950,29 @@ version = "0.4.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c"
 
+[[package]]
+name = "loom"
+version = "0.5.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff50ecb28bb86013e935fb6683ab1f6d3a20016f123c76fd4c27470076ac30f5"
+dependencies = [
+ "cfg-if",
+ "generator",
+ "pin-utils",
+ "scoped-tls",
+ "tracing",
+ "tracing-subscriber",
+]
+
+[[package]]
+name = "matchers"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558"
+dependencies = [
+ "regex-automata 0.1.10",
+]
+
 [[package]]
 name = "memchr"
 version = "2.7.2"
@@ -964,7 +1025,7 @@ source = "git+https://github.com/juntyr/rsmpi?rev=2988f56#2988f56e350311acc04119
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.64",
+ "syn 2.0.65",
 ]
 
 [[package]]
@@ -1196,6 +1257,16 @@ dependencies = [
  "minimal-lexical",
 ]
 
+[[package]]
+name = "nu-ansi-term"
+version = "0.46.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
+dependencies = [
+ "overload",
+ "winapi",
+]
+
 [[package]]
 name = "num-traits"
 version = "0.2.19"
@@ -1211,6 +1282,21 @@ version = "1.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
 
+[[package]]
+name = "oneshot"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6f6640c6bda7731b1fdbab747981a0f896dd1fedaf9f4a53fa237a04a84431f4"
+dependencies = [
+ "loom",
+]
+
+[[package]]
+name = "overload"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
+
 [[package]]
 name = "pcg_rand"
 version = "0.13.0"
@@ -1224,6 +1310,18 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "pin-project-lite"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02"
+
+[[package]]
+name = "pin-utils"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
+
 [[package]]
 name = "pkg-config"
 version = "0.3.30"
@@ -1252,7 +1350,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5f12335488a2f3b0a83b14edad48dca9879ce89b2edd10e80237e4e852dd645e"
 dependencies = [
  "proc-macro2",
- "syn 2.0.64",
+ "syn 2.0.65",
 ]
 
 [[package]]
@@ -1281,9 +1379,9 @@ dependencies = [
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.82"
+version = "1.0.83"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8ad3d49ab951a01fbaafe34f2ec74122942fe18a3f9814c3268f1bb72042131b"
+checksum = "0b33eb56c327dec362a9e55b3ad14f9d2f0904fb5a5b03b513ab5465399e9f43"
 dependencies = [
  "unicode-ident",
 ]
@@ -1300,8 +1398,8 @@ dependencies = [
 
 [[package]]
 name = "ptx-builder"
-version = "0.5.3"
-source = "git+https://github.com/juntyr/rust-ptx-builder?rev=1f1f49d#1f1f49df761e919f721ef234722ee7b2cfcf9104"
+version = "0.6.0"
+source = "git+https://github.com/juntyr/rust-ptx-builder?rev=aeb3b68#aeb3b68a85e3a5ee10757b357104e554ed44729f"
 dependencies = [
  "anyhow",
  "colored",
@@ -1360,8 +1458,17 @@ checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c"
 dependencies = [
  "aho-corasick",
  "memchr",
- "regex-automata",
- "regex-syntax",
+ "regex-automata 0.4.6",
+ "regex-syntax 0.8.3",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
+dependencies = [
+ "regex-syntax 0.6.29",
 ]
 
 [[package]]
@@ -1372,9 +1479,15 @@ checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea"
 dependencies = [
  "aho-corasick",
  "memchr",
- "regex-syntax",
+ "regex-syntax 0.8.3",
 ]
 
+[[package]]
+name = "regex-syntax"
+version = "0.6.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
+
 [[package]]
 name = "regex-syntax"
 version = "0.8.3"
@@ -1410,24 +1523,39 @@ dependencies = [
 [[package]]
 name = "rust-cuda"
 version = "0.1.0"
-source = "git+https://github.com/juntyr/rust-cuda?rev=f395253#f395253bb244827bc46600ef5ee87f687eda249b"
+source = "git+https://github.com/juntyr/rust-cuda?rev=697dcf5#697dcf54bf362cd08e28d282fa947823382b49ff"
 dependencies = [
  "const-type-layout",
  "final",
+ "oneshot",
+ "regex",
  "rust-cuda-derive",
- "rust-cuda-ptx-jit",
+ "rust-cuda-kernel",
  "rustacuda",
  "rustacuda_core",
  "rustacuda_derive",
+ "safer_owning_ref",
 ]
 
 [[package]]
 name = "rust-cuda-derive"
 version = "0.1.0"
-source = "git+https://github.com/juntyr/rust-cuda?rev=f395253#f395253bb244827bc46600ef5ee87f687eda249b"
+source = "git+https://github.com/juntyr/rust-cuda?rev=697dcf5#697dcf54bf362cd08e28d282fa947823382b49ff"
+dependencies = [
+ "proc-macro-error",
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "rust-cuda-kernel"
+version = "0.1.0"
+source = "git+https://github.com/juntyr/rust-cuda?rev=697dcf5#697dcf54bf362cd08e28d282fa947823382b49ff"
 dependencies = [
  "cargo_metadata",
  "colored",
+ "find_cuda_helper",
  "lazy_static",
  "proc-macro-error",
  "proc-macro2",
@@ -1438,23 +1566,13 @@ dependencies = [
  "serde_json",
  "strip-ansi-escapes",
  "syn 1.0.109",
-]
-
-[[package]]
-name = "rust-cuda-ptx-jit"
-version = "0.1.0"
-source = "git+https://github.com/juntyr/rust-cuda?rev=f395253#f395253bb244827bc46600ef5ee87f687eda249b"
-dependencies = [
- "lazy_static",
- "regex",
- "rustacuda",
+ "thiserror",
 ]
 
 [[package]]
 name = "rustacuda"
 version = "0.1.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "47208516ab5338b592d63560e90eaef405d0ec880347eaf7742d893b0a31e228"
+source = "git+https://github.com/juntyr/RustaCUDA?rev=c6ea7cc#c6ea7ccf24b15c4edbd5576852a8dcdc7df272b0"
 dependencies = [
  "bitflags 1.3.2",
  "cuda-driver-sys",
@@ -1465,14 +1583,12 @@ dependencies = [
 [[package]]
 name = "rustacuda_core"
 version = "0.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d3858b08976dc2f860c5efbbb48cdcb0d4fafca92a6ac0898465af16c0dbe848"
+source = "git+https://github.com/juntyr/RustaCUDA?rev=c6ea7cc#c6ea7ccf24b15c4edbd5576852a8dcdc7df272b0"
 
 [[package]]
 name = "rustacuda_derive"
 version = "0.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "43ce8670a1a1d0fc2514a3b846dacdb65646f9bd494b6674cfacbb4ce430bd7e"
+source = "git+https://github.com/juntyr/RustaCUDA?rev=c6ea7cc#c6ea7ccf24b15c4edbd5576852a8dcdc7df272b0"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -1651,12 +1767,33 @@ dependencies = [
  "windows-sys 0.52.0",
 ]
 
+[[package]]
+name = "rustversion"
+version = "1.0.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6"
+
 [[package]]
 name = "ryu"
 version = "1.0.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
 
+[[package]]
+name = "safer_owning_ref"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af21b9de2df966f61c07b5b541c81c98225b86e48ababd43366a642654de30ef"
+dependencies = [
+ "stable_deref_trait",
+]
+
+[[package]]
+name = "scoped-tls"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294"
+
 [[package]]
 name = "seahash"
 version = "4.1.0"
@@ -1689,7 +1826,7 @@ checksum = "6048858004bcff69094cd972ed40a32500f153bd3be9f716b2eed2e8217c4838"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.64",
+ "syn 2.0.65",
 ]
 
 [[package]]
@@ -1724,6 +1861,15 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "serde_spanned"
+version = "0.6.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79e674e01f999af37c49f70a6ede167a8a60b2503e56c5599532a65baa5969a0"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "serde_state"
 version = "0.4.8"
@@ -1733,6 +1879,15 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "sharded-slab"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
+dependencies = [
+ "lazy_static",
+]
+
 [[package]]
 name = "shell-words"
 version = "1.1.0"
@@ -1766,6 +1921,12 @@ version = "1.13.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
 
+[[package]]
+name = "stable_deref_trait"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
+
 [[package]]
 name = "streaming-iterator"
 version = "0.1.9"
@@ -1806,9 +1967,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.64"
+version = "2.0.65"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7ad3dee41f36859875573074334c200d1add8e4a87bb37113ebd31d926b7b11f"
+checksum = "d2863d96a84c6439701d7a38f9de935ec562c8832cc55d1dde0f513b52fad106"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -1832,7 +1993,17 @@ checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.64",
+ "syn 2.0.65",
+]
+
+[[package]]
+name = "thread_local"
+version = "1.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c"
+dependencies = [
+ "cfg-if",
+ "once_cell",
 ]
 
 [[package]]
@@ -1857,11 +2028,97 @@ dependencies = [
 
 [[package]]
 name = "toml"
-version = "0.5.11"
+version = "0.8.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4e43f8cc456c9704c851ae29c67e17ef65d2c30017c17a9765b89c382dc8bba"
+dependencies = [
+ "serde",
+ "serde_spanned",
+ "toml_datetime",
+ "toml_edit",
+]
+
+[[package]]
+name = "toml_datetime"
+version = "0.6.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4badfd56924ae69bcc9039335b2e017639ce3f9b001c393c1b2d1ef846ce2cbf"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "toml_edit"
+version = "0.22.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234"
+checksum = "c127785850e8c20836d49732ae6abfa47616e60bf9d9f57c43c250361a9db96c"
 dependencies = [
+ "indexmap",
  "serde",
+ "serde_spanned",
+ "toml_datetime",
+ "winnow",
+]
+
+[[package]]
+name = "tracing"
+version = "0.1.40"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
+dependencies = [
+ "pin-project-lite",
+ "tracing-attributes",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-attributes"
+version = "0.1.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.65",
+]
+
+[[package]]
+name = "tracing-core"
+version = "0.1.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
+dependencies = [
+ "once_cell",
+ "valuable",
+]
+
+[[package]]
+name = "tracing-log"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3"
+dependencies = [
+ "log",
+ "once_cell",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-subscriber"
+version = "0.3.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b"
+dependencies = [
+ "matchers",
+ "nu-ansi-term",
+ "once_cell",
+ "regex",
+ "sharded-slab",
+ "smallvec",
+ "thread_local",
+ "tracing",
+ "tracing-core",
+ "tracing-log",
 ]
 
 [[package]]
@@ -1908,6 +2165,12 @@ version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
 
+[[package]]
+name = "valuable"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
+
 [[package]]
 name = "vcpkg"
 version = "0.2.15"
@@ -1967,7 +2230,7 @@ dependencies = [
  "once_cell",
  "proc-macro2",
  "quote",
- "syn 2.0.64",
+ "syn 2.0.65",
  "wasm-bindgen-shared",
 ]
 
@@ -1989,7 +2252,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.64",
+ "syn 2.0.65",
  "wasm-bindgen-backend",
  "wasm-bindgen-shared",
 ]
@@ -2040,6 +2303,15 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
 
+[[package]]
+name = "windows"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f"
+dependencies = [
+ "windows-targets 0.48.5",
+]
+
 [[package]]
 name = "windows-sys"
 version = "0.48.0"
@@ -2179,6 +2451,15 @@ version = "0.52.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0"
 
+[[package]]
+name = "winnow"
+version = "0.6.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3c52e9c97a68071b23e836c9380edae937f17b9c4667bd021973efc689f618d"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "zerocopy"
 version = "0.7.34"
@@ -2196,5 +2477,5 @@ checksum = "15e934569e47891f7d9411f1a451d947a60e000ab3bd24fbb970f000387d1b3b"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.64",
+ "syn 2.0.65",
 ]
diff --git a/README.md b/README.md
index 0698d53eb..927e82462 100644
--- a/README.md
+++ b/README.md
@@ -41,11 +41,6 @@ First, you need to clone the necsim-rust GitHub repository:
 ```
 necsim-rust is written in the [Rust language](https://www.rust-lang.org/tools/install), which must be installed in your `PATH` first. necsim-rust includes a `rust-toolchain` file that configures Rust to use a working nightly toolchain version and install all components required for compilation. If you want to use necsim-rust on a target different than `x86_64-unknown-linux-gnu`, please update the [rust-toolchain](rust-toolchain) config file accordingly.
 
-If you also want to use the CUDA-based algorithm, it is **required** that you also install the following:
-```shell
-> cargo install ptx-linker --force --locked
-```
-
 ## Installation
 
 To install `rustcoalescence`, you need to decide which algorithms you want to compile with it. You can enable the provided algorithms by enabling their corresponding features. For instance, to compile all CPU-based algorithms with all scenarios, you can use
diff --git a/necsim/core/Cargo.toml b/necsim/core/Cargo.toml
index f481e5de1..ce503bce7 100644
--- a/necsim/core/Cargo.toml
+++ b/necsim/core/Cargo.toml
@@ -15,12 +15,12 @@ cuda = ["rust-cuda"]
 necsim-core-maths = { path = "maths" }
 necsim-core-bond = { path = "bond" }
 
-const-type-layout = { version = "0.3.0", features = ["derive"] }
+const-type-layout = { version = "0.3.1", features = ["derive"] }
 contracts = "0.6.3"
 serde = { version = "1.0", default-features = false, features = ["derive"] }
 
 [target.'cfg(target_os = "cuda")'.dependencies]
-rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "f395253", features = ["derive"], optional = true }
+rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "697dcf5", features = ["derive"], optional = true }
 
 [target.'cfg(not(target_os = "cuda"))'.dependencies]
-rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "f395253", features = ["derive", "host"], optional = true }
+rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "697dcf5", features = ["derive", "host"], optional = true }
diff --git a/necsim/core/bond/Cargo.toml b/necsim/core/bond/Cargo.toml
index fc004b4da..445bbc9a2 100644
--- a/necsim/core/bond/Cargo.toml
+++ b/necsim/core/bond/Cargo.toml
@@ -13,5 +13,5 @@ default = []
 [dependencies]
 necsim-core-maths = { path = "../maths" }
 
-const-type-layout = { version = "0.3.0", features = ["derive"] }
+const-type-layout = { version = "0.3.1", features = ["derive"] }
 serde = { version = "1.0", default-features = false, features = ["derive"] }
diff --git a/necsim/core/bond/src/closed_open_unit_f64.rs b/necsim/core/bond/src/closed_open_unit_f64.rs
index 0d2155c13..e6424106a 100644
--- a/necsim/core/bond/src/closed_open_unit_f64.rs
+++ b/necsim/core/bond/src/closed_open_unit_f64.rs
@@ -88,6 +88,7 @@ impl ClosedOpenUnitF64 {
 }
 
 impl PartialEq for ClosedOpenUnitF64 {
+    #[allow(clippy::unconditional_recursion)]
     fn eq(&self, other: &Self) -> bool {
         self.0.eq(&other.0)
     }
diff --git a/necsim/core/bond/src/closed_unit_f64.rs b/necsim/core/bond/src/closed_unit_f64.rs
index 664c9f20e..d5c0bdc02 100644
--- a/necsim/core/bond/src/closed_unit_f64.rs
+++ b/necsim/core/bond/src/closed_unit_f64.rs
@@ -122,6 +122,7 @@ impl From<ClosedOpenUnitF64> for ClosedUnitF64 {
 }
 
 impl PartialEq for ClosedUnitF64 {
+    #[allow(clippy::unconditional_recursion)]
     fn eq(&self, other: &Self) -> bool {
         self.0.eq(&other.0)
     }
diff --git a/necsim/core/bond/src/non_negative_f64.rs b/necsim/core/bond/src/non_negative_f64.rs
index cf60da503..89e3ea295 100644
--- a/necsim/core/bond/src/non_negative_f64.rs
+++ b/necsim/core/bond/src/non_negative_f64.rs
@@ -161,6 +161,7 @@ impl From<ClosedOpenUnitF64> for NonNegativeF64 {
 }
 
 impl PartialEq for NonNegativeF64 {
+    #[allow(clippy::unconditional_recursion)]
     fn eq(&self, other: &Self) -> bool {
         self.0.eq(&other.0)
     }
diff --git a/necsim/core/bond/src/non_positive_f64.rs b/necsim/core/bond/src/non_positive_f64.rs
index 62807c4bf..2e7cce0e8 100644
--- a/necsim/core/bond/src/non_positive_f64.rs
+++ b/necsim/core/bond/src/non_positive_f64.rs
@@ -94,6 +94,7 @@ impl NonPositiveF64 {
 }
 
 impl PartialEq for NonPositiveF64 {
+    #[allow(clippy::unconditional_recursion)]
     fn eq(&self, other: &Self) -> bool {
         self.0.eq(&other.0)
     }
diff --git a/necsim/core/bond/src/open_closed_unit_f64.rs b/necsim/core/bond/src/open_closed_unit_f64.rs
index a82fdfc37..b4b3441dc 100644
--- a/necsim/core/bond/src/open_closed_unit_f64.rs
+++ b/necsim/core/bond/src/open_closed_unit_f64.rs
@@ -94,6 +94,7 @@ impl OpenClosedUnitF64 {
 }
 
 impl PartialEq for OpenClosedUnitF64 {
+    #[allow(clippy::unconditional_recursion)]
     fn eq(&self, other: &Self) -> bool {
         self.0.eq(&other.0)
     }
diff --git a/necsim/core/bond/src/positive_f64.rs b/necsim/core/bond/src/positive_f64.rs
index ac8bf4090..de5766741 100644
--- a/necsim/core/bond/src/positive_f64.rs
+++ b/necsim/core/bond/src/positive_f64.rs
@@ -127,6 +127,7 @@ impl From<NonZeroU64> for PositiveF64 {
 }
 
 impl PartialEq for PositiveF64 {
+    #[allow(clippy::unconditional_recursion)]
     fn eq(&self, other: &Self) -> bool {
         self.0.eq(&other.0)
     }
diff --git a/necsim/core/src/cogs/coalescence_sampler.rs b/necsim/core/src/cogs/coalescence_sampler.rs
index 93af7bc92..f4d0aa4da 100644
--- a/necsim/core/src/cogs/coalescence_sampler.rs
+++ b/necsim/core/src/cogs/coalescence_sampler.rs
@@ -28,7 +28,6 @@ pub trait CoalescenceSampler<M: MathsCore, H: Habitat<M>, S: LineageStore<M, H>>
     ) -> (IndexedLocation, LineageInteraction);
 }
 
-#[allow(clippy::unsafe_derive_deserialize)]
 #[derive(Debug, PartialEq, Serialize, Deserialize, TypeLayout)]
 #[repr(transparent)]
 pub struct CoalescenceRngSample(ClosedOpenUnitF64);
diff --git a/necsim/core/src/event.rs b/necsim/core/src/event.rs
index 40108ae85..af42ac633 100644
--- a/necsim/core/src/event.rs
+++ b/necsim/core/src/event.rs
@@ -55,7 +55,6 @@ pub struct Dispersal {
 }
 
 #[allow(clippy::module_name_repetitions)]
-#[allow(clippy::unsafe_derive_deserialize)]
 #[derive(Debug, Clone, Serialize, Deserialize, TypeLayout)]
 #[repr(C)]
 pub struct SpeciationEvent {
diff --git a/necsim/core/src/landscape/extent.rs b/necsim/core/src/landscape/extent.rs
index 063a8c445..c38e7afc3 100644
--- a/necsim/core/src/landscape/extent.rs
+++ b/necsim/core/src/landscape/extent.rs
@@ -2,12 +2,14 @@ use necsim_core_bond::OffByOneU32;
 
 use super::Location;
 
-#[allow(clippy::module_name_repetitions, clippy::unsafe_derive_deserialize)]
+#[allow(clippy::module_name_repetitions)]
 #[derive(PartialEq, Eq, Clone, Debug, serde::Deserialize, serde::Serialize, TypeLayout)]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", cuda(ignore))]
 #[serde(rename = "Extent")]
 #[serde(deny_unknown_fields)]
-#[repr(C)]
 pub struct LandscapeExtent {
+    #[cfg_attr(feature = "cuda", cuda(embed))]
     origin: Location,
     width: OffByOneU32,
     height: OffByOneU32,
diff --git a/necsim/core/src/landscape/location.rs b/necsim/core/src/landscape/location.rs
index c3686e5c6..6bcc520a6 100644
--- a/necsim/core/src/landscape/location.rs
+++ b/necsim/core/src/landscape/location.rs
@@ -2,12 +2,14 @@ use serde::{Deserialize, Serialize};
 
 use crate::cogs::Backup;
 
-#[allow(clippy::unsafe_derive_deserialize)]
+#[allow(clippy::module_name_repetitions)]
 #[derive(
     Eq, PartialEq, PartialOrd, Ord, Clone, Hash, Debug, Serialize, Deserialize, TypeLayout,
 )]
-#[serde(deny_unknown_fields)]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[repr(C)]
+#[cfg_attr(feature = "cuda", cuda(ignore))]
+#[serde(deny_unknown_fields)]
 pub struct Location {
     x: u32,
     y: u32,
@@ -46,10 +48,13 @@ impl From<IndexedLocation> for Location {
 #[derive(
     Eq, PartialEq, PartialOrd, Ord, Clone, Hash, Debug, Serialize, Deserialize, TypeLayout,
 )]
-#[allow(clippy::module_name_repetitions, clippy::unsafe_derive_deserialize)]
-#[serde(from = "IndexedLocationRaw", into = "IndexedLocationRaw")]
+#[allow(clippy::module_name_repetitions)]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[repr(C)]
+#[cfg_attr(feature = "cuda", cuda(ignore))]
+#[serde(from = "IndexedLocationRaw", into = "IndexedLocationRaw")]
 pub struct IndexedLocation {
+    #[cfg_attr(feature = "cuda", cuda(embed))]
     location: Location,
     index: u32,
 }
@@ -74,7 +79,6 @@ impl IndexedLocation {
 #[derive(Serialize, Deserialize)]
 #[serde(deny_unknown_fields)]
 #[serde(rename = "IndexedLocation")]
-#[repr(C)]
 struct IndexedLocationRaw {
     x: u32,
     y: u32,
diff --git a/necsim/core/src/landscape/mod.rs b/necsim/core/src/landscape/mod.rs
index 6c05344ca..41a00b87f 100644
--- a/necsim/core/src/landscape/mod.rs
+++ b/necsim/core/src/landscape/mod.rs
@@ -1,6 +1,6 @@
 mod extent;
 mod location;
 
-#[allow(clippy::useless_attribute, clippy::module_name_repetitions)]
+#[allow(clippy::module_name_repetitions)]
 pub use extent::{LandscapeExtent, LocationIterator};
 pub use location::{IndexedLocation, Location};
diff --git a/necsim/core/src/lib.rs b/necsim/core/src/lib.rs
index 86e145fbc..a8da66266 100644
--- a/necsim/core/src/lib.rs
+++ b/necsim/core/src/lib.rs
@@ -1,7 +1,6 @@
 #![deny(clippy::pedantic)]
 #![no_std]
 #![feature(const_type_name)]
-#![feature(control_flow_enum)]
 #![feature(min_specialization)]
 
 #[doc(hidden)]
diff --git a/necsim/core/src/lineage.rs b/necsim/core/src/lineage.rs
index 8e20ba0a5..398973fd0 100644
--- a/necsim/core/src/lineage.rs
+++ b/necsim/core/src/lineage.rs
@@ -16,6 +16,7 @@ use crate::{
 };
 
 #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, TypeLayout)]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[repr(transparent)]
 pub struct GlobalLineageReference(u64);
 
@@ -94,21 +95,29 @@ impl From<Option<GlobalLineageReference>> for LineageInteraction {
     }
 }
 
-#[allow(clippy::unsafe_derive_deserialize)]
+#[allow(clippy::module_name_repetitions)]
 #[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, TypeLayout)]
-#[serde(deny_unknown_fields)]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[repr(C)]
+#[cfg_attr(feature = "cuda", cuda(ignore))]
+#[serde(deny_unknown_fields)]
 pub struct Lineage {
+    #[cfg_attr(feature = "cuda", cuda(embed))]
+    #[cfg_attr(feature = "cuda", cuda(ignore))]
     #[serde(alias = "id", alias = "ref")]
     pub global_reference: GlobalLineageReference,
+    #[cfg_attr(feature = "cuda", cuda(ignore))]
     #[serde(alias = "time")]
     pub last_event_time: NonNegativeF64,
+    #[cfg_attr(feature = "cuda", cuda(embed))]
+    #[cfg_attr(feature = "cuda", cuda(ignore))]
     #[serde(alias = "loc")]
     pub indexed_location: IndexedLocation,
 }
 
 impl Lineage {
     #[must_use]
+    #[allow(clippy::no_effect_underscore_binding)]
     #[debug_ensures(
         ret.indexed_location == old(indexed_location.clone()),
         "stores the indexed_location"
diff --git a/necsim/core/src/reporter/boolean.rs b/necsim/core/src/reporter/boolean.rs
index 372b43db1..686330300 100644
--- a/necsim/core/src/reporter/boolean.rs
+++ b/necsim/core/src/reporter/boolean.rs
@@ -5,7 +5,7 @@ mod private {
     impl Sealed for super::False {}
 }
 
-pub trait Boolean: private::Sealed {
+pub trait Boolean: 'static + private::Sealed {
     const VALUE: bool;
 }
 
diff --git a/necsim/core/src/reporter/mod.rs b/necsim/core/src/reporter/mod.rs
index 821ae269f..a934f58b1 100644
--- a/necsim/core/src/reporter/mod.rs
+++ b/necsim/core/src/reporter/mod.rs
@@ -12,11 +12,11 @@ use used::MaybeUsed;
 pub mod boolean;
 pub mod used;
 
-#[allow(clippy::useless_attribute, clippy::module_name_repetitions)]
+#[allow(clippy::module_name_repetitions)]
 pub use combinator::ReporterCombinator;
-#[allow(clippy::useless_attribute, clippy::module_name_repetitions)]
+#[allow(clippy::module_name_repetitions)]
 pub use filter::FilteredReporter;
-#[allow(clippy::useless_attribute, clippy::module_name_repetitions)]
+#[allow(clippy::module_name_repetitions)]
 pub use null::NullReporter;
 
 pub trait Reporter: core::fmt::Debug {
diff --git a/necsim/core/src/simulation/builder.rs b/necsim/core/src/simulation/builder.rs
index eda540f91..6156662c2 100644
--- a/necsim/core/src/simulation/builder.rs
+++ b/necsim/core/src/simulation/builder.rs
@@ -86,7 +86,7 @@ impl<
 }
 
 #[derive(Debug, TypeLayout)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[cfg_attr(feature = "cuda", cuda(free = "M"))]
 #[repr(C)]
 pub struct Simulation<
diff --git a/necsim/core/src/simulation/mod.rs b/necsim/core/src/simulation/mod.rs
index c5356f1a2..29368e5a6 100644
--- a/necsim/core/src/simulation/mod.rs
+++ b/necsim/core/src/simulation/mod.rs
@@ -18,7 +18,7 @@ use crate::{
     reporter::Reporter,
 };
 
-#[allow(clippy::useless_attribute, clippy::module_name_repetitions)]
+#[allow(clippy::module_name_repetitions)]
 pub use builder::{Simulation, SimulationBuilder};
 use necsim_core_bond::{NonNegativeF64, PositiveF64};
 
@@ -51,7 +51,7 @@ impl<
 
     #[inline]
     pub fn simulate_incremental_early_stop<
-        F: FnMut(&Self, u64, PositiveF64) -> ControlFlow<(), ()>,
+        F: FnMut(&Self, u64, PositiveF64, &P) -> ControlFlow<(), ()>,
         P: Reporter,
     >(
         &mut self,
@@ -69,13 +69,17 @@ impl<
                 .map(|lineage| (lineage.event_time, lineage.tie_breaker));
 
             let self_ptr = self as *const Self;
+            let reporter_ptr = reporter as *const P;
 
             let old_rng = unsafe { self.rng.backup_unchecked() };
             let mut early_stop_flow = ControlFlow::Continue(());
 
             let early_peek_stop = |next_event_time| {
                 // Safety: We are only passing in an immutable reference
-                early_stop_flow = early_stop(unsafe { &*self_ptr }, steps, next_event_time);
+                early_stop_flow =
+                    early_stop(unsafe { &*self_ptr }, steps, next_event_time, unsafe {
+                        &*reporter_ptr
+                    });
 
                 if early_stop_flow.is_break() {
                     return ControlFlow::Break(());
@@ -131,6 +135,6 @@ impl<
 
     #[inline]
     pub fn simulate<P: Reporter>(mut self, reporter: &mut P) -> (NonNegativeF64, u64) {
-        self.simulate_incremental_early_stop(|_, _, _| ControlFlow::Continue(()), reporter)
+        self.simulate_incremental_early_stop(|_, _, _, _| ControlFlow::Continue(()), reporter)
     }
 }
diff --git a/necsim/impls/cuda/Cargo.toml b/necsim/impls/cuda/Cargo.toml
index 5288ebe32..37412bab1 100644
--- a/necsim/impls/cuda/Cargo.toml
+++ b/necsim/impls/cuda/Cargo.toml
@@ -10,12 +10,12 @@ edition = "2021"
 [dependencies]
 necsim-core = { path = "../../core", features = ["cuda"] }
 
-const-type-layout = { version = "0.3.0", features = ["derive"] }
+const-type-layout = { version = "0.3.1", features = ["derive"] }
 contracts = "0.6.3"
 serde = { version = "1.0", default-features = false, features = ["derive"] }
 
 [target.'cfg(target_os = "cuda")'.dependencies]
-rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "f395253", features = ["derive"] }
+rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "697dcf5", features = ["derive"] }
 
 [target.'cfg(not(target_os = "cuda"))'.dependencies]
-rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "f395253", features = ["derive", "host"] }
+rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "697dcf5", features = ["derive", "host"] }
diff --git a/necsim/impls/cuda/src/cogs/maths.rs b/necsim/impls/cuda/src/cogs/maths.rs
index 11c49ffc1..4b5df0d36 100644
--- a/necsim/impls/cuda/src/cogs/maths.rs
+++ b/necsim/impls/cuda/src/cogs/maths.rs
@@ -36,11 +36,14 @@ impl MathsCore for NvptxMathsCore {
         }
         #[cfg(not(target_os = "cuda"))]
         {
-            extern "C" {
-                fn nvptx_maths_core_ln_on_cpu(_x: f64) -> !;
-            }
+            // extern "C" {
+            //     fn nvptx_maths_core_ln_on_cpu(_x: f64) -> !;
+            // }
+
+            // unsafe { nvptx_maths_core_ln_on_cpu(x) }
 
-            unsafe { nvptx_maths_core_ln_on_cpu(x) }
+            // TODO: re-enable the extern guard above to disallow NvptxMathsCore::ln on CPU
+            unsafe { core::intrinsics::logf64(x) }
         }
     }
 
diff --git a/necsim/impls/cuda/src/cogs/rng.rs b/necsim/impls/cuda/src/cogs/rng.rs
index bc34a8f0f..8237ed1cf 100644
--- a/necsim/impls/cuda/src/cogs/rng.rs
+++ b/necsim/impls/cuda/src/cogs/rng.rs
@@ -3,49 +3,48 @@ use core::marker::PhantomData;
 use necsim_core::cogs::{MathsCore, PrimeableRng, RngCore};
 
 use const_type_layout::TypeGraphLayout;
-use rust_cuda::safety::StackOnly;
+use rust_cuda::{
+    safety::{PortableBitSemantics, StackOnly},
+    utils::adapter::RustToCudaWithPortableBitCloneSemantics,
+};
 
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
 
 #[allow(clippy::module_name_repetitions)]
-#[derive(Debug, rust_cuda::common::LendRustToCuda)]
+#[derive(Debug, Clone, rust_cuda::lend::LendRustToCuda)]
 #[cuda(free = "M", free = "R")]
 pub struct CudaRng<M: MathsCore, R>
 where
-    R: RngCore<M> + StackOnly + TypeGraphLayout,
+    R: RngCore<M> + StackOnly + PortableBitSemantics + TypeGraphLayout,
 {
-    inner: R,
+    #[cuda(embed)]
+    inner: RustToCudaWithPortableBitCloneSemantics<R>,
     marker: PhantomData<M>,
 }
 
-impl<M: MathsCore, R: RngCore<M> + StackOnly + TypeGraphLayout> Clone for CudaRng<M, R> {
-    fn clone(&self) -> Self {
-        Self {
-            inner: self.inner.clone(),
-            marker: PhantomData::<M>,
-        }
-    }
-}
-
-impl<M: MathsCore, R: RngCore<M> + StackOnly + TypeGraphLayout> From<R> for CudaRng<M, R> {
+impl<M: MathsCore, R: RngCore<M> + StackOnly + PortableBitSemantics + TypeGraphLayout> From<R>
+    for CudaRng<M, R>
+{
     #[must_use]
     #[inline]
     fn from(rng: R) -> Self {
         Self {
-            inner: rng,
+            inner: rng.into(),
             marker: PhantomData::<M>,
         }
     }
 }
 
-impl<M: MathsCore, R: RngCore<M> + StackOnly + TypeGraphLayout> RngCore<M> for CudaRng<M, R> {
+impl<M: MathsCore, R: RngCore<M> + StackOnly + PortableBitSemantics + TypeGraphLayout> RngCore<M>
+    for CudaRng<M, R>
+{
     type Seed = <R as RngCore<M>>::Seed;
 
     #[must_use]
     #[inline]
     fn from_seed(seed: Self::Seed) -> Self {
         Self {
-            inner: R::from_seed(seed),
+            inner: R::from_seed(seed).into(),
             marker: PhantomData::<M>,
         }
     }
@@ -57,8 +56,8 @@ impl<M: MathsCore, R: RngCore<M> + StackOnly + TypeGraphLayout> RngCore<M> for C
     }
 }
 
-impl<M: MathsCore, R: PrimeableRng<M> + StackOnly + TypeGraphLayout> PrimeableRng<M>
-    for CudaRng<M, R>
+impl<M: MathsCore, R: PrimeableRng<M> + StackOnly + PortableBitSemantics + TypeGraphLayout>
+    PrimeableRng<M> for CudaRng<M, R>
 {
     #[inline]
     fn prime_with(&mut self, location_index: u64, time_index: u64) {
@@ -66,17 +65,19 @@ impl<M: MathsCore, R: PrimeableRng<M> + StackOnly + TypeGraphLayout> PrimeableRn
     }
 }
 
-impl<M: MathsCore, R: RngCore<M> + StackOnly + TypeGraphLayout> Serialize for CudaRng<M, R> {
+impl<M: MathsCore, R: RngCore<M> + StackOnly + PortableBitSemantics + TypeGraphLayout> Serialize
+    for CudaRng<M, R>
+{
     fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
         self.inner.serialize(serializer)
     }
 }
 
-impl<'de, M: MathsCore, R: RngCore<M> + StackOnly + TypeGraphLayout> Deserialize<'de>
-    for CudaRng<M, R>
+impl<'de, M: MathsCore, R: RngCore<M> + StackOnly + PortableBitSemantics + TypeGraphLayout>
+    Deserialize<'de> for CudaRng<M, R>
 {
     fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
-        let inner = R::deserialize(deserializer)?;
+        let inner = R::deserialize(deserializer)?.into();
 
         Ok(Self {
             inner,
diff --git a/necsim/impls/cuda/src/event_buffer.rs b/necsim/impls/cuda/src/event_buffer.rs
index 6fb9f314f..1a08d85ca 100644
--- a/necsim/impls/cuda/src/event_buffer.rs
+++ b/necsim/impls/cuda/src/event_buffer.rs
@@ -1,13 +1,22 @@
-use core::fmt;
+use core::{
+    fmt,
+    ops::{Deref, DerefMut},
+};
 
+use const_type_layout::TypeGraphLayout;
 #[cfg(not(target_os = "cuda"))]
-use rust_cuda::rustacuda::{
+use rust_cuda::deps::rustacuda::{
     error::CudaResult,
     function::{BlockSize, GridSize},
 };
 
-use rust_cuda::utils::{
-    aliasing::SplitSliceOverCudaThreadsDynamicStride, exchange::buffer::CudaExchangeBuffer,
+use rust_cuda::{
+    lend::RustToCudaProxy,
+    safety::{PortableBitSemantics, SafeMutableAliasing, StackOnly},
+    utils::{
+        aliasing::SplitSliceOverCudaThreadsDynamicStride,
+        exchange::buffer::{CudaExchangeBuffer, CudaExchangeItem},
+    },
 };
 
 use necsim_core::{
@@ -24,11 +33,16 @@ use necsim_core::impl_report;
 use super::utils::MaybeSome;
 
 #[allow(clippy::module_name_repetitions, clippy::type_complexity)]
-#[derive(rust_cuda::common::LendRustToCuda)]
+#[derive(rust_cuda::lend::LendRustToCuda)]
 #[cuda(free = "ReportSpeciation", free = "ReportDispersal")]
 pub struct EventBuffer<ReportSpeciation: Boolean, ReportDispersal: Boolean> {
+    #[cfg(not(target_os = "cuda"))]
     #[cuda(embed)]
     event_mask: SplitSliceOverCudaThreadsDynamicStride<CudaExchangeBuffer<bool, true, true>>,
+    #[cfg(target_os = "cuda")]
+    #[cuda(embed = "SplitSliceOverCudaThreadsDynamicStride<CudaExchangeBuffer<bool, true, true>>")]
+    event_mask: CudaExchangeSlice<CudaExchangeItem<bool, true, true>>,
+    #[cfg(not(target_os = "cuda"))]
     #[cuda(embed)]
     event_buffer: SplitSliceOverCudaThreadsDynamicStride<
         CudaExchangeBuffer<
@@ -37,14 +51,49 @@ pub struct EventBuffer<ReportSpeciation: Boolean, ReportDispersal: Boolean> {
             true,
         >,
     >,
-    max_events: usize,
-    event_counter: usize,
+    #[cfg(target_os = "cuda")]
+    #[cuda(embed = "SplitSliceOverCudaThreadsDynamicStride<
+    CudaExchangeBuffer<
+        MaybeSome<<EventBuffer<ReportSpeciation, ReportDispersal> as EventType>::Event>,
+        false,
+        true,
+    >,
+>")]
+    event_buffer: CudaExchangeSlice<
+        CudaExchangeItem<
+            MaybeSome<<EventBuffer<ReportSpeciation, ReportDispersal> as EventType>::Event>,
+            false,
+            true,
+        >,
+    >,
+}
+
+// Safety:
+// - no mutable aliasing occurs since all parts implement SafeMutableAliasing
+// - dropping does not trigger (de)alloc since EventBuffer doesn't impl Drop and
+//   all parts implement SafeMutableAliasing
+// - EventBuffer has no shallow mutable state
+unsafe impl<ReportSpeciation: Boolean, ReportDispersal: Boolean> SafeMutableAliasing
+    for EventBuffer<ReportSpeciation, ReportDispersal>
+where
+    SplitSliceOverCudaThreadsDynamicStride<CudaExchangeBuffer<bool, true, true>>:
+        SafeMutableAliasing,
+    SplitSliceOverCudaThreadsDynamicStride<
+        CudaExchangeBuffer<
+            MaybeSome<<EventBuffer<ReportSpeciation, ReportDispersal> as EventType>::Event>,
+            false,
+            true,
+        >,
+    >: SafeMutableAliasing,
+{
 }
 
 pub trait EventType {
     type Event: 'static
-        + rust_cuda::const_type_layout::TypeGraphLayout
+        + Sync
+        + rust_cuda::deps::const_type_layout::TypeGraphLayout
         + rust_cuda::safety::StackOnly
+        + rust_cuda::safety::PortableBitSemantics
         + Into<TypedEvent>
         + Into<PackedEvent>
         + Clone;
@@ -76,10 +125,7 @@ impl<ReportSpeciation: Boolean, ReportDispersal: Boolean> fmt::Debug
     for EventBuffer<ReportSpeciation, ReportDispersal>
 {
     fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
-        fmt.debug_struct("EventBuffer")
-            .field("max_events", &self.max_events)
-            .field("event_counter", &self.event_counter)
-            .finish_non_exhaustive()
+        fmt.debug_struct("EventBuffer").finish_non_exhaustive()
     }
 }
 
@@ -120,8 +166,6 @@ impl<ReportSpeciation: Boolean, ReportDispersal: Boolean>
                 CudaExchangeBuffer::from_vec(event_buffer)?,
                 max_events,
             ),
-            max_events,
-            event_counter: 0_usize,
         })
     }
 
@@ -146,9 +190,31 @@ impl<ReportSpeciation: Boolean, ReportDispersal: Boolean>
             mask.write(false);
         }
     }
+}
+
+#[cfg(target_os = "cuda")]
+impl<ReportSpeciation: Boolean, ReportDispersal: Boolean>
+    EventBuffer<ReportSpeciation, ReportDispersal>
+{
+    #[must_use]
+    pub fn can_buffer_next_event(&self) -> bool {
+        !self.event_buffer.is_empty()
+    }
 
-    pub fn max_events_per_individual(&self) -> usize {
-        self.max_events
+    fn report_event(
+        &mut self,
+        event: impl Into<<EventBuffer<ReportSpeciation, ReportDispersal> as EventType>::Event>,
+    ) {
+        if let ([mask, mask_rest @ ..], [buffer, buffer_rest @ ..]) = (
+            core::mem::take(&mut *self.event_mask),
+            core::mem::take(&mut *self.event_buffer),
+        ) {
+            mask.write(true);
+            buffer.write(MaybeSome::Some(event.into()));
+
+            *self.event_mask = mask_rest;
+            *self.event_buffer = buffer_rest;
+        }
     }
 }
 
@@ -167,19 +233,11 @@ impl<ReportSpeciation: Boolean, ReportDispersal: Boolean> Reporter
 impl Reporter for EventBuffer<False, True> {
     impl_report!(
         #[debug_requires(
-            self.event_counter < self.max_events,
+            self.can_buffer_next_event(),
             "does not report extraneous dispersal events"
         )]
         dispersal(&mut self, event: Used) {
-            if let Some(mask) = self.event_mask.get_mut(self.event_counter) {
-                mask.write(true);
-
-                unsafe {
-                    self.event_buffer.get_unchecked_mut(self.event_counter)
-                }.write(MaybeSome::Some(event.clone().into()));
-            }
-
-            self.event_counter += 1;
+            self.report_event(event.clone());
         }
     );
 }
@@ -188,19 +246,14 @@ impl Reporter for EventBuffer<False, True> {
 impl Reporter for EventBuffer<True, False> {
     impl_report!(
         #[debug_requires(
-            self.event_counter == 0,
+            self.can_buffer_next_event(),
             "does not report extraneous speciation events"
         )]
         speciation(&mut self, event: Used) {
-            if let Some(mask) = self.event_mask.get_mut(0) {
-                mask.write(true);
+            self.report_event(event.clone());
 
-                unsafe {
-                    self.event_buffer.get_unchecked_mut(0)
-                }.write(MaybeSome::Some(event.clone()));
-            }
-
-            self.event_counter = self.max_events;
+            *self.event_mask = &mut [];
+            *self.event_buffer = &mut [];
         }
     );
 }
@@ -209,37 +262,75 @@ impl Reporter for EventBuffer<True, False> {
 impl Reporter for EventBuffer<True, True> {
     impl_report!(
         #[debug_requires(
-            self.event_counter < self.max_events,
+            self.can_buffer_next_event(),
             "does not report extraneous speciation events"
         )]
         speciation(&mut self, event: Used) {
-            if let Some(mask) = self.event_mask.get_mut(self.event_counter) {
-                mask.write(true);
-
-                unsafe {
-                    self.event_buffer.get_unchecked_mut(self.event_counter)
-                }.write(MaybeSome::Some(event.clone().into()));
-            }
+            self.report_event(event.clone());
 
-            self.event_counter = self.max_events;
+            *self.event_mask = &mut [];
+            *self.event_buffer = &mut [];
         }
     );
 
     impl_report!(
         #[debug_requires(
-            self.event_counter < self.max_events,
+            self.can_buffer_next_event(),
             "does not report extraneous dispersal events"
         )]
         dispersal(&mut self, event: Used) {
-            if let Some(mask) = self.event_mask.get_mut(self.event_counter) {
-                mask.write(true);
-
-                unsafe {
-                    self.event_buffer.get_unchecked_mut(self.event_counter)
-                }.write(MaybeSome::Some(event.clone().into()));
-            }
-
-            self.event_counter += 1;
+            self.report_event(event.clone());
         }
     );
 }
+
+// TODO: find a prettier workaround for the `target_os = "cuda"` fields of `EventBuffer`
+struct CudaExchangeSlice<T: 'static + StackOnly + PortableBitSemantics + TypeGraphLayout>(
+    &'static mut [T],
+);
+
+impl<T: 'static + StackOnly + PortableBitSemantics + TypeGraphLayout> Deref
+    for CudaExchangeSlice<T>
+{
+    type Target = &'static mut [T];
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl<T: 'static + StackOnly + PortableBitSemantics + TypeGraphLayout> DerefMut
+    for CudaExchangeSlice<T>
+{
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
+    }
+}
+
+impl<
+        T: 'static + StackOnly + PortableBitSemantics + TypeGraphLayout,
+        const M2D: bool,
+        const M2H: bool,
+    > RustToCudaProxy<CudaExchangeSlice<CudaExchangeItem<T, M2D, M2H>>>
+    for SplitSliceOverCudaThreadsDynamicStride<CudaExchangeBuffer<T, M2D, M2H>>
+{
+    fn from_ref(_val: &CudaExchangeSlice<CudaExchangeItem<T, M2D, M2H>>) -> &Self {
+        unsafe { unreachable_cuda_event_buffer_hack() }
+    }
+
+    fn from_mut(_val: &mut CudaExchangeSlice<CudaExchangeItem<T, M2D, M2H>>) -> &mut Self {
+        unsafe { unreachable_cuda_event_buffer_hack() }
+    }
+
+    fn into(mut self) -> CudaExchangeSlice<CudaExchangeItem<T, M2D, M2H>> {
+        let slice: &mut [CudaExchangeItem<T, M2D, M2H>] = &mut self;
+
+        let slice = unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr(), slice.len()) };
+
+        CudaExchangeSlice(slice)
+    }
+}
+
+extern "C" {
+    fn unreachable_cuda_event_buffer_hack() -> !;
+}
diff --git a/necsim/impls/cuda/src/utils.rs b/necsim/impls/cuda/src/utils.rs
index 8ff8033a5..39c1c8285 100644
--- a/necsim/impls/cuda/src/utils.rs
+++ b/necsim/impls/cuda/src/utils.rs
@@ -3,7 +3,7 @@ use core::mem::MaybeUninit;
 use rust_cuda::safety::StackOnly;
 
 #[derive(TypeLayout)]
-#[repr(C)]
+#[repr(transparent)]
 #[doc(hidden)]
 pub struct MaybeSome<T: StackOnly>(MaybeUninit<T>);
 
diff --git a/necsim/impls/cuda/src/value_buffer.rs b/necsim/impls/cuda/src/value_buffer.rs
index 04d844f6f..b1dc71f1a 100644
--- a/necsim/impls/cuda/src/value_buffer.rs
+++ b/necsim/impls/cuda/src/value_buffer.rs
@@ -3,7 +3,7 @@ use core::iter::Iterator;
 
 use const_type_layout::TypeGraphLayout;
 use rust_cuda::{
-    safety::StackOnly,
+    safety::{PortableBitSemantics, SafeMutableAliasing, StackOnly},
     utils::{
         aliasing::SplitSliceOverCudaThreadsConstStride,
         exchange::buffer::{CudaExchangeBuffer, CudaExchangeItem},
@@ -11,19 +11,19 @@ use rust_cuda::{
 };
 
 #[cfg(not(target_os = "cuda"))]
-use rust_cuda::rustacuda::{
+use rust_cuda::deps::rustacuda::{
     error::CudaResult,
     function::{BlockSize, GridSize},
 };
 
 use super::utils::MaybeSome;
 
-#[derive(rust_cuda::common::LendRustToCuda)]
+#[derive(rust_cuda::lend::LendRustToCuda)]
 #[cuda(free = "T")]
 #[allow(clippy::module_name_repetitions)]
 pub struct ValueBuffer<T, const M2D: bool, const M2H: bool>
 where
-    T: StackOnly + TypeGraphLayout,
+    T: StackOnly + PortableBitSemantics + TypeGraphLayout,
 {
     #[cuda(embed)]
     mask: SplitSliceOverCudaThreadsConstStride<CudaExchangeBuffer<bool, true, true>, 1_usize>,
@@ -32,8 +32,25 @@ where
         SplitSliceOverCudaThreadsConstStride<CudaExchangeBuffer<MaybeSome<T>, M2D, M2H>, 1_usize>,
 }
 
+// Safety:
+// - no mutable aliasing occurs since all parts implement SafeMutableAliasing
+// - dropping does not trigger (de)alloc since ValueBuffer doesn't impl Drop and
+//   all parts implement SafeMutableAliasing
+// - ValueBuffer has no shallow mutable state
+unsafe impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool>
+    SafeMutableAliasing for ValueBuffer<T, M2D, M2H>
+where
+    SplitSliceOverCudaThreadsConstStride<CudaExchangeBuffer<bool, true, true>, 1_usize>:
+        SafeMutableAliasing, // "part" 1: the type of the `mask` field
+    SplitSliceOverCudaThreadsConstStride<CudaExchangeBuffer<MaybeSome<T>, M2D, M2H>, 1_usize>:
+        SafeMutableAliasing, // "part" 2: the type of the `MaybeSome<T>` slice field
+{
+}
+
 #[cfg(not(target_os = "cuda"))]
-impl<T: StackOnly + TypeGraphLayout, const M2D: bool, const M2H: bool> ValueBuffer<T, M2D, M2H> {
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool>
+    ValueBuffer<T, M2D, M2H>
+{
     /// # Errors
     /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA
     pub fn new(block_size: &BlockSize, grid_size: &GridSize) -> CudaResult<Self> {
@@ -67,7 +84,9 @@ impl<T: StackOnly + TypeGraphLayout, const M2D: bool, const M2H: bool> ValueBuff
 }
 
 #[cfg(not(target_os = "cuda"))]
-impl<T: StackOnly + TypeGraphLayout, const M2D: bool> ValueBuffer<T, M2D, true> {
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool>
+    ValueBuffer<T, M2D, true>
+{
     pub fn iter(&self) -> impl Iterator<Item = Option<&T>> {
         self.mask
             .iter()
@@ -90,7 +109,7 @@ impl<T: StackOnly + TypeGraphLayout, const M2D: bool> ValueBuffer<T, M2D, true>
 }
 
 #[cfg(target_os = "cuda")]
-impl<T: StackOnly + TypeGraphLayout> ValueBuffer<T, true, true> {
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout> ValueBuffer<T, true, true> {
     pub fn with_value_for_core<F: FnOnce(Option<T>) -> Option<T>>(&mut self, inner: F) {
         let value = if self
             .mask
@@ -117,7 +136,9 @@ impl<T: StackOnly + TypeGraphLayout> ValueBuffer<T, true, true> {
 }
 
 #[cfg(target_os = "cuda")]
-impl<T: StackOnly + TypeGraphLayout, const M2H: bool> ValueBuffer<T, true, M2H> {
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2H: bool>
+    ValueBuffer<T, true, M2H>
+{
     pub fn take_value_for_core(&mut self) -> Option<T> {
         #[allow(clippy::option_if_let_else)]
         if let Some(mask) = self.mask.get_mut(0) {
@@ -135,7 +156,9 @@ impl<T: StackOnly + TypeGraphLayout, const M2H: bool> ValueBuffer<T, true, M2H>
 }
 
 #[cfg(target_os = "cuda")]
-impl<T: StackOnly + TypeGraphLayout, const M2D: bool> ValueBuffer<T, M2D, true> {
+impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool>
+    ValueBuffer<T, M2D, true>
+{
     pub fn put_value_for_core(&mut self, value: Option<T>) {
         if let Some(mask) = self.mask.get_mut(0) {
             mask.write(value.is_some());
@@ -148,13 +171,15 @@ impl<T: StackOnly + TypeGraphLayout, const M2D: bool> ValueBuffer<T, M2D, true>
 }
 
 #[cfg(not(target_os = "cuda"))]
-pub struct ValueRefMut<'v, T: StackOnly, const M2D: bool> {
+pub struct ValueRefMut<'v, T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool> {
     mask: &'v mut CudaExchangeItem<bool, true, true>,
     value: &'v mut CudaExchangeItem<MaybeSome<T>, M2D, true>,
 }
 
 #[cfg(not(target_os = "cuda"))]
-impl<'v, T: StackOnly, const M2D: bool> ValueRefMut<'v, T, M2D> {
+impl<'v, T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool>
+    ValueRefMut<'v, T, M2D>
+{
     pub fn take(&mut self) -> Option<T> {
         if *self.mask.read() {
             self.mask.write(false);
@@ -176,7 +201,7 @@ impl<'v, T: StackOnly, const M2D: bool> ValueRefMut<'v, T, M2D> {
 }
 
 #[cfg(not(target_os = "cuda"))]
-impl<'v, T: StackOnly> ValueRefMut<'v, T, true> {
+impl<'v, T: StackOnly + PortableBitSemantics + TypeGraphLayout> ValueRefMut<'v, T, true> {
     #[must_use]
     pub fn as_mut(&mut self) -> Option<&mut T> {
         if *self.mask.read() {
diff --git a/necsim/impls/no-std/Cargo.toml b/necsim/impls/no-std/Cargo.toml
index d85745eb1..b6a6b6866 100644
--- a/necsim/impls/no-std/Cargo.toml
+++ b/necsim/impls/no-std/Cargo.toml
@@ -17,7 +17,7 @@ necsim-core-maths = { path = "../../core/maths" }
 necsim-core-bond = { path = "../../core/bond" }
 necsim-partitioning-core = { path = "../../partitioning/core" }
 
-const-type-layout = { version = "0.3.0", features = ["derive"] }
+const-type-layout = { version = "0.3.1", features = ["derive"] }
 contracts = "0.6.3"
 libm = "0.2"
 hashbrown = "0.14"
@@ -30,7 +30,7 @@ fnv = { version = "1.0", default-features = false, features = [] }
 rand_core = "0.6"
 
 [target.'cfg(target_os = "cuda")'.dependencies]
-rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "f395253", features = ["derive"], optional = true }
+rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "697dcf5", features = ["derive", "final"], optional = true }
 
 [target.'cfg(not(target_os = "cuda"))'.dependencies]
-rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "f395253", features = ["derive", "host"], optional = true }
+rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "697dcf5", features = ["derive", "final", "host"], optional = true }
diff --git a/necsim/impls/no-std/src/alias/mod.rs b/necsim/impls/no-std/src/alias/mod.rs
index 641478464..558416936 100644
--- a/necsim/impls/no-std/src/alias/mod.rs
+++ b/necsim/impls/no-std/src/alias/mod.rs
@@ -1,3 +1,5 @@
+use core::cmp::Ordering;
+
 use alloc::vec::Vec;
 
 use necsim_core::cogs::{MathsCore, RngCore};
@@ -62,11 +64,10 @@ impl<E: Copy + PartialEq> AliasMethodSampler<E> {
             };
             Ks[underfull_index] = Es[overfull_index];
 
-            #[allow(clippy::comparison_chain)]
-            if Us[overfull_index] < 1.0_f64 {
-                underfull_indices.push(overfull_index);
-            } else if Us[overfull_index] > 1.0_f64 {
-                overfull_indices.push(overfull_index);
+            match Us[overfull_index].cmp(&NonNegativeF64::one()) {
+                Ordering::Less => underfull_indices.push(overfull_index),
+                Ordering::Equal => (),
+                Ordering::Greater => overfull_indices.push(overfull_index),
             }
         }
 
diff --git a/necsim/impls/no-std/src/array2d.rs b/necsim/impls/no-std/src/array2d.rs
index 14fe7fc83..dd4552ebe 100644
--- a/necsim/impls/no-std/src/array2d.rs
+++ b/necsim/impls/no-std/src/array2d.rs
@@ -10,12 +10,12 @@ use core::ops::{Index, IndexMut};
 
 /// A fixed sized two-dimensional array.
 #[derive(Clone, Eq, PartialEq)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[cfg_attr(
     feature = "cuda",
     cuda(
         free = "T",
-        bound = "T: rust_cuda::safety::StackOnly + const_type_layout::TypeGraphLayout"
+        bound = "T: rust_cuda::safety::PortableBitSemantics + const_type_layout::TypeGraphLayout"
     )
 )]
 pub struct Array2D<T> {
diff --git a/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/event_time_sampler/const.rs b/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/event_time_sampler/const.rs
index b69bc20c0..598721483 100644
--- a/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/event_time_sampler/const.rs
+++ b/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/event_time_sampler/const.rs
@@ -8,7 +8,7 @@ use super::EventTimeSampler;
 
 #[allow(clippy::module_name_repetitions)]
 #[derive(Clone, Debug)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 pub struct ConstEventTimeSampler {
     event_time: PositiveF64,
 }
diff --git a/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/event_time_sampler/exp.rs b/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/event_time_sampler/exp.rs
index 8b6bdc9c4..9e7b1207e 100644
--- a/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/event_time_sampler/exp.rs
+++ b/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/event_time_sampler/exp.rs
@@ -11,7 +11,7 @@ const INV_PHI: u64 = 0x9e37_79b9_7f4a_7c15_u64;
 
 #[allow(clippy::module_name_repetitions)]
 #[derive(Clone, Debug)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 pub struct ExpEventTimeSampler {
     delta_t: PositiveF64,
 }
diff --git a/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/event_time_sampler/fixed.rs b/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/event_time_sampler/fixed.rs
index 5685d57fe..c6ac3227d 100644
--- a/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/event_time_sampler/fixed.rs
+++ b/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/event_time_sampler/fixed.rs
@@ -8,7 +8,7 @@ use super::EventTimeSampler;
 
 #[allow(clippy::module_name_repetitions)]
 #[derive(Clone, Debug, Default)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 pub struct FixedEventTimeSampler([u8; 0]);
 
 #[contract_trait]
diff --git a/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/event_time_sampler/geometric.rs b/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/event_time_sampler/geometric.rs
index be31a8a60..476685396 100644
--- a/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/event_time_sampler/geometric.rs
+++ b/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/event_time_sampler/geometric.rs
@@ -8,7 +8,7 @@ use super::EventTimeSampler;
 
 #[allow(clippy::module_name_repetitions)]
 #[derive(Clone, Debug)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 pub struct GeometricEventTimeSampler {
     delta_t: PositiveF64,
 }
diff --git a/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/event_time_sampler/poisson.rs b/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/event_time_sampler/poisson.rs
index fcd1355ab..db7a42683 100644
--- a/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/event_time_sampler/poisson.rs
+++ b/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/event_time_sampler/poisson.rs
@@ -11,7 +11,7 @@ const INV_PHI: u64 = 0x9e37_79b9_7f4a_7c15_u64;
 
 #[allow(clippy::module_name_repetitions)]
 #[derive(Clone, Debug)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 pub struct PoissonEventTimeSampler {
     delta_t: PositiveF64,
 }
diff --git a/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/mod.rs b/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/mod.rs
index 1aafbee33..eb5243a48 100644
--- a/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/mod.rs
+++ b/necsim/impls/no-std/src/cogs/active_lineage_sampler/independent/mod.rs
@@ -25,7 +25,7 @@ use event_time_sampler::EventTimeSampler;
 
 #[allow(clippy::module_name_repetitions)]
 #[derive(Debug)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[cfg_attr(feature = "cuda", cuda(free = "M"))]
 pub struct IndependentActiveLineageSampler<
     M: MathsCore,
@@ -37,10 +37,7 @@ pub struct IndependentActiveLineageSampler<
     N: SpeciationProbability<M, H>,
     J: EventTimeSampler<M, H, G, T>,
 > {
-    #[cfg_attr(
-        feature = "cuda",
-        cuda(embed = "Option<rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<Lineage>>")
-    )]
+    #[cfg_attr(feature = "cuda", cuda(embed))]
     active_lineage: Option<Lineage>,
     min_event_time: NonNegativeF64,
     last_event_time: NonNegativeF64,
diff --git a/necsim/impls/no-std/src/cogs/coalescence_sampler/independent.rs b/necsim/impls/no-std/src/cogs/coalescence_sampler/independent.rs
index 0e9a16f6a..f15e3f672 100644
--- a/necsim/impls/no-std/src/cogs/coalescence_sampler/independent.rs
+++ b/necsim/impls/no-std/src/cogs/coalescence_sampler/independent.rs
@@ -15,7 +15,7 @@ use crate::cogs::lineage_store::{
 
 #[allow(clippy::module_name_repetitions)]
 #[derive(Debug)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[cfg_attr(feature = "cuda", cuda(free = "M", free = "H"))]
 pub struct IndependentCoalescenceSampler<M: MathsCore, H: Habitat<M>>(PhantomData<(M, H)>);
 
diff --git a/necsim/impls/no-std/src/cogs/dispersal_sampler/almost_infinite_clark2dt.rs b/necsim/impls/no-std/src/cogs/dispersal_sampler/almost_infinite_clark2dt.rs
index decb649ce..b75075a5e 100644
--- a/necsim/impls/no-std/src/cogs/dispersal_sampler/almost_infinite_clark2dt.rs
+++ b/necsim/impls/no-std/src/cogs/dispersal_sampler/almost_infinite_clark2dt.rs
@@ -10,7 +10,7 @@ use crate::cogs::habitat::almost_infinite::AlmostInfiniteHabitat;
 
 #[allow(clippy::module_name_repetitions)]
 #[derive(Debug)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[cfg_attr(feature = "cuda", cuda(free = "M", free = "G"))]
 pub struct AlmostInfiniteClark2DtDispersalSampler<M: MathsCore, G: RngCore<M>> {
     shape_u: PositiveF64,
diff --git a/necsim/impls/no-std/src/cogs/dispersal_sampler/almost_infinite_normal.rs b/necsim/impls/no-std/src/cogs/dispersal_sampler/almost_infinite_normal.rs
index 5dc7b7cd7..36a731790 100644
--- a/necsim/impls/no-std/src/cogs/dispersal_sampler/almost_infinite_normal.rs
+++ b/necsim/impls/no-std/src/cogs/dispersal_sampler/almost_infinite_normal.rs
@@ -10,7 +10,7 @@ use crate::cogs::habitat::almost_infinite::AlmostInfiniteHabitat;
 
 #[allow(clippy::module_name_repetitions)]
 #[derive(Debug)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[cfg_attr(feature = "cuda", cuda(free = "M", free = "G"))]
 pub struct AlmostInfiniteNormalDispersalSampler<M: MathsCore, G: RngCore<M>> {
     sigma: NonNegativeF64,
diff --git a/necsim/impls/no-std/src/cogs/dispersal_sampler/in_memory/packed_alias/mod.rs b/necsim/impls/no-std/src/cogs/dispersal_sampler/in_memory/packed_alias/mod.rs
index f162c0199..f364b2f50 100644
--- a/necsim/impls/no-std/src/cogs/dispersal_sampler/in_memory/packed_alias/mod.rs
+++ b/necsim/impls/no-std/src/cogs/dispersal_sampler/in_memory/packed_alias/mod.rs
@@ -40,7 +40,7 @@ impl From<AliasSamplerRange> for Range<usize> {
 }
 
 #[allow(clippy::module_name_repetitions)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[cfg_attr(feature = "cuda", cuda(free = "M", free = "H", free = "G"))]
 pub struct InMemoryPackedAliasDispersalSampler<M: MathsCore, H: Habitat<M>, G: RngCore<M>> {
     #[cfg_attr(feature = "cuda", cuda(embed))]
diff --git a/necsim/impls/no-std/src/cogs/dispersal_sampler/non_spatial.rs b/necsim/impls/no-std/src/cogs/dispersal_sampler/non_spatial.rs
index 4b5d21861..f1186b1ec 100644
--- a/necsim/impls/no-std/src/cogs/dispersal_sampler/non_spatial.rs
+++ b/necsim/impls/no-std/src/cogs/dispersal_sampler/non_spatial.rs
@@ -10,7 +10,7 @@ use crate::cogs::habitat::non_spatial::NonSpatialHabitat;
 
 #[allow(clippy::module_name_repetitions)]
 #[derive(Debug)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[cfg_attr(feature = "cuda", cuda(free = "M", free = "G"))]
 pub struct NonSpatialDispersalSampler<M: MathsCore, G: RngCore<M>> {
     marker: PhantomData<(M, G)>,
diff --git a/necsim/impls/no-std/src/cogs/dispersal_sampler/spatially_implicit.rs b/necsim/impls/no-std/src/cogs/dispersal_sampler/spatially_implicit.rs
index 9e8d2cd21..fbd5d0dc0 100644
--- a/necsim/impls/no-std/src/cogs/dispersal_sampler/spatially_implicit.rs
+++ b/necsim/impls/no-std/src/cogs/dispersal_sampler/spatially_implicit.rs
@@ -11,7 +11,7 @@ use crate::cogs::{
 
 #[allow(clippy::module_name_repetitions)]
 #[derive(Debug)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[cfg_attr(feature = "cuda", cuda(free = "M"))]
 pub struct SpatiallyImplicitDispersalSampler<M: MathsCore, G: RngCore<M>> {
     #[cfg_attr(feature = "cuda", cuda(embed))]
diff --git a/necsim/impls/no-std/src/cogs/dispersal_sampler/trespassing/mod.rs b/necsim/impls/no-std/src/cogs/dispersal_sampler/trespassing/mod.rs
index 996dc2684..1ad63b0b7 100644
--- a/necsim/impls/no-std/src/cogs/dispersal_sampler/trespassing/mod.rs
+++ b/necsim/impls/no-std/src/cogs/dispersal_sampler/trespassing/mod.rs
@@ -28,7 +28,7 @@ pub trait AntiTrespassingDispersalSampler<M: MathsCore, H: Habitat<M>, G: RngCor
 
 #[allow(clippy::module_name_repetitions)]
 #[derive(Debug)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[cfg_attr(feature = "cuda", cuda(free = "M"))]
 pub struct TrespassingDispersalSampler<
     M: MathsCore,
diff --git a/necsim/impls/no-std/src/cogs/dispersal_sampler/trespassing/uniform.rs b/necsim/impls/no-std/src/cogs/dispersal_sampler/trespassing/uniform.rs
index 22e3216d2..26bef8225 100644
--- a/necsim/impls/no-std/src/cogs/dispersal_sampler/trespassing/uniform.rs
+++ b/necsim/impls/no-std/src/cogs/dispersal_sampler/trespassing/uniform.rs
@@ -9,7 +9,7 @@ use super::AntiTrespassingDispersalSampler;
 
 #[allow(clippy::module_name_repetitions)]
 #[derive(Debug)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[cfg_attr(feature = "cuda", cuda(free = "M", free = "H", free = "G"))]
 pub struct UniformAntiTrespassingDispersalSampler<
     M: MathsCore,
diff --git a/necsim/impls/no-std/src/cogs/dispersal_sampler/wrapping_noise.rs b/necsim/impls/no-std/src/cogs/dispersal_sampler/wrapping_noise.rs
index 5f38306db..6f3075bf4 100644
--- a/necsim/impls/no-std/src/cogs/dispersal_sampler/wrapping_noise.rs
+++ b/necsim/impls/no-std/src/cogs/dispersal_sampler/wrapping_noise.rs
@@ -14,7 +14,7 @@ use crate::cogs::{
 
 #[allow(clippy::module_name_repetitions)]
 #[derive(Debug)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[cfg_attr(feature = "cuda", cuda(free = "M"))]
 pub struct WrappingNoiseApproximateNormalDispersalSampler<M: MathsCore, G: RngCore<M>> {
     #[cfg_attr(feature = "cuda", cuda(embed))]
diff --git a/necsim/impls/no-std/src/cogs/emigration_exit/never.rs b/necsim/impls/no-std/src/cogs/emigration_exit/never.rs
index 74a68fdda..62e5320a5 100644
--- a/necsim/impls/no-std/src/cogs/emigration_exit/never.rs
+++ b/necsim/impls/no-std/src/cogs/emigration_exit/never.rs
@@ -8,7 +8,7 @@ use necsim_core_bond::{NonNegativeF64, PositiveF64};
 
 #[allow(clippy::module_name_repetitions)]
 #[derive(Debug, Default)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 pub struct NeverEmigrationExit([u8; 0]);
 
 #[contract_trait]
diff --git a/necsim/impls/no-std/src/cogs/event_sampler/independent.rs b/necsim/impls/no-std/src/cogs/event_sampler/independent.rs
index baeb01622..17ac313d0 100644
--- a/necsim/impls/no-std/src/cogs/event_sampler/independent.rs
+++ b/necsim/impls/no-std/src/cogs/event_sampler/independent.rs
@@ -21,7 +21,7 @@ use super::tracking::{MinSpeciationTrackingEventSampler, SpeciationSample};
 
 #[allow(clippy::module_name_repetitions)]
 #[derive(Debug)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[cfg_attr(
     feature = "cuda",
     cuda(
@@ -43,12 +43,7 @@ pub struct IndependentEventSampler<
     T: TurnoverRate<M, H>,
     N: SpeciationProbability<M, H>,
 > {
-    #[cfg_attr(
-        feature = "cuda",
-        cuda(
-            embed = "Option<rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<SpeciationSample>>"
-        )
-    )]
+    #[cfg_attr(feature = "cuda", cuda(embed))]
     min_spec_sample: Option<SpeciationSample>,
     marker: PhantomData<(M, H, G, X, D, T, N)>,
 }
diff --git a/necsim/impls/no-std/src/cogs/event_sampler/tracking.rs b/necsim/impls/no-std/src/cogs/event_sampler/tracking.rs
index 8b5c1cccd..6804e70ea 100644
--- a/necsim/impls/no-std/src/cogs/event_sampler/tracking.rs
+++ b/necsim/impls/no-std/src/cogs/event_sampler/tracking.rs
@@ -27,10 +27,12 @@ pub trait MinSpeciationTrackingEventSampler<
 }
 
 #[derive(Clone, Debug, TypeLayout)]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[repr(C)]
 pub struct SpeciationSample {
     speciation_sample: ClosedOpenUnitF64,
     sample_time: PositiveF64,
+    #[cfg_attr(feature = "cuda", cuda(embed))]
     sample_location: IndexedLocation,
 }
 
diff --git a/necsim/impls/no-std/src/cogs/habitat/almost_infinite.rs b/necsim/impls/no-std/src/cogs/habitat/almost_infinite.rs
index 62b06c356..b974b42ac 100644
--- a/necsim/impls/no-std/src/cogs/habitat/almost_infinite.rs
+++ b/necsim/impls/no-std/src/cogs/habitat/almost_infinite.rs
@@ -12,7 +12,7 @@ const ALMOST_INFINITE_EXTENT: LandscapeExtent =
     LandscapeExtent::new(Location::new(0, 0), OffByOneU32::max(), OffByOneU32::max());
 
 #[allow(clippy::module_name_repetitions)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[cfg_attr(feature = "cuda", cuda(free = "M"))]
 pub struct AlmostInfiniteHabitat<M: MathsCore> {
     marker: PhantomData<M>,
diff --git a/necsim/impls/no-std/src/cogs/habitat/in_memory.rs b/necsim/impls/no-std/src/cogs/habitat/in_memory.rs
index ea6b4d314..a0464df4a 100644
--- a/necsim/impls/no-std/src/cogs/habitat/in_memory.rs
+++ b/necsim/impls/no-std/src/cogs/habitat/in_memory.rs
@@ -14,13 +14,14 @@ use crate::array2d::Array2D;
 
 #[allow(clippy::module_name_repetitions)]
 #[derive(Debug)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[cfg_attr(feature = "cuda", cuda(free = "M"))]
 pub struct InMemoryHabitat<M: MathsCore> {
     #[cfg_attr(feature = "cuda", cuda(embed))]
     habitat: Final<Box<[u32]>>,
     #[cfg_attr(feature = "cuda", cuda(embed))]
     u64_injection: Final<Box<[u64]>>,
+    #[cfg_attr(feature = "cuda", cuda(embed))]
     extent: LandscapeExtent,
     marker: PhantomData<M>,
 }
diff --git a/necsim/impls/no-std/src/cogs/habitat/non_spatial.rs b/necsim/impls/no-std/src/cogs/habitat/non_spatial.rs
index 5da4667c1..947243289 100644
--- a/necsim/impls/no-std/src/cogs/habitat/non_spatial.rs
+++ b/necsim/impls/no-std/src/cogs/habitat/non_spatial.rs
@@ -11,9 +11,10 @@ use necsim_core_bond::{OffByOneU32, OffByOneU64};
 
 #[allow(clippy::module_name_repetitions)]
 #[derive(Debug)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[cfg_attr(feature = "cuda", cuda(free = "M"))]
 pub struct NonSpatialHabitat<M: MathsCore> {
+    #[cfg_attr(feature = "cuda", cuda(embed))]
     extent: LandscapeExtent,
     deme: NonZeroU32,
     marker: PhantomData<M>,
diff --git a/necsim/impls/no-std/src/cogs/habitat/spatially_implicit.rs b/necsim/impls/no-std/src/cogs/habitat/spatially_implicit.rs
index 7b13925bc..d2a434daa 100644
--- a/necsim/impls/no-std/src/cogs/habitat/spatially_implicit.rs
+++ b/necsim/impls/no-std/src/cogs/habitat/spatially_implicit.rs
@@ -13,7 +13,7 @@ const SPATIALLY_IMPLICIT_EXTENT: LandscapeExtent =
 
 #[allow(clippy::module_name_repetitions)]
 #[derive(Debug)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[cfg_attr(feature = "cuda", cuda(free = "M"))]
 pub struct SpatiallyImplicitHabitat<M: MathsCore> {
     #[cfg_attr(feature = "cuda", cuda(embed))]
diff --git a/necsim/impls/no-std/src/cogs/habitat/wrapping_noise/mod.rs b/necsim/impls/no-std/src/cogs/habitat/wrapping_noise/mod.rs
index e6482e557..892c02f11 100644
--- a/necsim/impls/no-std/src/cogs/habitat/wrapping_noise/mod.rs
+++ b/necsim/impls/no-std/src/cogs/habitat/wrapping_noise/mod.rs
@@ -18,7 +18,7 @@ use crate::cogs::{
 };
 
 #[allow(clippy::module_name_repetitions)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[cfg_attr(feature = "cuda", cuda(free = "M"))]
 pub struct WrappingNoiseHabitat<M: MathsCore> {
     #[cfg_attr(feature = "cuda", cuda(embed))]
diff --git a/necsim/impls/no-std/src/cogs/immigration_entry/never.rs b/necsim/impls/no-std/src/cogs/immigration_entry/never.rs
index fc148b60e..9c4df3ac8 100644
--- a/necsim/impls/no-std/src/cogs/immigration_entry/never.rs
+++ b/necsim/impls/no-std/src/cogs/immigration_entry/never.rs
@@ -5,7 +5,7 @@ use necsim_core::{
 
 #[allow(clippy::module_name_repetitions)]
 #[derive(Debug, Default)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 pub struct NeverImmigrationEntry([u8; 0]);
 
 #[contract_trait]
diff --git a/necsim/impls/no-std/src/cogs/lineage_store/independent.rs b/necsim/impls/no-std/src/cogs/lineage_store/independent.rs
index d20b0dbd1..606be853e 100644
--- a/necsim/impls/no-std/src/cogs/lineage_store/independent.rs
+++ b/necsim/impls/no-std/src/cogs/lineage_store/independent.rs
@@ -7,7 +7,7 @@ use necsim_core::{
 
 #[allow(clippy::module_name_repetitions)]
 #[derive(Debug)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[cfg_attr(feature = "cuda", cuda(free = "M", free = "H"))]
 pub struct IndependentLineageStore<M: MathsCore, H: Habitat<M>> {
     marker: PhantomData<(M, H)>,
diff --git a/necsim/impls/no-std/src/cogs/maths/intrinsics.rs b/necsim/impls/no-std/src/cogs/maths/intrinsics.rs
index 7375c9fc8..46801aac8 100644
--- a/necsim/impls/no-std/src/cogs/maths/intrinsics.rs
+++ b/necsim/impls/no-std/src/cogs/maths/intrinsics.rs
@@ -1,4 +1,2 @@
-#![allow(clippy::useless_attribute)]
-
 #[allow(clippy::module_name_repetitions)]
 pub use necsim_core_maths::IntrinsicsMathsCore;
diff --git a/necsim/impls/no-std/src/cogs/rng/seahash.rs b/necsim/impls/no-std/src/cogs/rng/seahash.rs
index 93cc87ecd..bbfc0df7b 100644
--- a/necsim/impls/no-std/src/cogs/rng/seahash.rs
+++ b/necsim/impls/no-std/src/cogs/rng/seahash.rs
@@ -4,7 +4,7 @@ use necsim_core::cogs::{Backup, MathsCore, PrimeableRng, RngCore};
 
 use serde::{Deserialize, Serialize};
 
-#[allow(clippy::module_name_repetitions, clippy::unsafe_derive_deserialize)]
+#[allow(clippy::module_name_repetitions)]
 #[derive(Clone, Debug, Serialize, Deserialize, TypeLayout)]
 #[serde(deny_unknown_fields)]
 #[layout(free = "M")]
diff --git a/necsim/impls/no-std/src/cogs/rng/wyhash.rs b/necsim/impls/no-std/src/cogs/rng/wyhash.rs
index c4fdeed68..dfa2d4d3e 100644
--- a/necsim/impls/no-std/src/cogs/rng/wyhash.rs
+++ b/necsim/impls/no-std/src/cogs/rng/wyhash.rs
@@ -11,7 +11,7 @@ const P1: u64 = 0xe703_7ed1_a0b4_28db;
 const P2: u64 = 0x8ebc_6af0_9c88_c6e3;
 const P5: u64 = 0xeb44_acca_b455_d165;
 
-#[allow(clippy::module_name_repetitions, clippy::unsafe_derive_deserialize)]
+#[allow(clippy::module_name_repetitions)]
 #[derive(Clone, Debug, Serialize, Deserialize, TypeLayout)]
 #[layout(free = "M")]
 #[serde(deny_unknown_fields)]
diff --git a/necsim/impls/no-std/src/cogs/speciation_probability/spatially_implicit.rs b/necsim/impls/no-std/src/cogs/speciation_probability/spatially_implicit.rs
index d50e77707..a542e24b1 100644
--- a/necsim/impls/no-std/src/cogs/speciation_probability/spatially_implicit.rs
+++ b/necsim/impls/no-std/src/cogs/speciation_probability/spatially_implicit.rs
@@ -7,7 +7,7 @@ use necsim_core_bond::{ClosedUnitF64, OpenClosedUnitF64 as PositiveUnitF64};
 use crate::cogs::habitat::spatially_implicit::SpatiallyImplicitHabitat;
 
 #[derive(Debug)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[allow(clippy::module_name_repetitions)]
 pub struct SpatiallyImplicitSpeciationProbability {
     meta_speciation_probability: PositiveUnitF64,
diff --git a/necsim/impls/no-std/src/cogs/speciation_probability/uniform.rs b/necsim/impls/no-std/src/cogs/speciation_probability/uniform.rs
index dd8d2dfae..82ceeeba7 100644
--- a/necsim/impls/no-std/src/cogs/speciation_probability/uniform.rs
+++ b/necsim/impls/no-std/src/cogs/speciation_probability/uniform.rs
@@ -5,7 +5,7 @@ use necsim_core::{
 use necsim_core_bond::ClosedUnitF64;
 
 #[derive(Debug)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[allow(clippy::module_name_repetitions)]
 pub struct UniformSpeciationProbability {
     speciation_probability: ClosedUnitF64,
diff --git a/necsim/impls/no-std/src/cogs/turnover_rate/in_memory.rs b/necsim/impls/no-std/src/cogs/turnover_rate/in_memory.rs
index dc884bc0d..6f6f52b20 100644
--- a/necsim/impls/no-std/src/cogs/turnover_rate/in_memory.rs
+++ b/necsim/impls/no-std/src/cogs/turnover_rate/in_memory.rs
@@ -14,7 +14,7 @@ use crate::{array2d::Array2D, cogs::habitat::in_memory::InMemoryHabitat};
 
 #[allow(clippy::module_name_repetitions)]
 #[derive(Debug)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 pub struct InMemoryTurnoverRate {
     #[cfg_attr(feature = "cuda", cuda(embed))]
     turnover_rate: Final<Box<[NonNegativeF64]>>,
diff --git a/necsim/impls/no-std/src/cogs/turnover_rate/uniform.rs b/necsim/impls/no-std/src/cogs/turnover_rate/uniform.rs
index 99411a19a..5255625bf 100644
--- a/necsim/impls/no-std/src/cogs/turnover_rate/uniform.rs
+++ b/necsim/impls/no-std/src/cogs/turnover_rate/uniform.rs
@@ -5,7 +5,7 @@ use necsim_core::{
 use necsim_core_bond::{NonNegativeF64, PositiveF64};
 
 #[derive(Debug)]
-#[cfg_attr(feature = "cuda", derive(rust_cuda::common::LendRustToCuda))]
+#[cfg_attr(feature = "cuda", derive(rust_cuda::lend::LendRustToCuda))]
 #[allow(clippy::module_name_repetitions)]
 pub struct UniformTurnoverRate {
     turnover_rate: PositiveF64,
diff --git a/necsim/impls/no-std/src/parallelisation/independent/individuals.rs b/necsim/impls/no-std/src/parallelisation/independent/individuals.rs
index 93fbe37f0..c6355adf4 100644
--- a/necsim/impls/no-std/src/parallelisation/independent/individuals.rs
+++ b/necsim/impls/no-std/src/parallelisation/independent/individuals.rs
@@ -125,7 +125,7 @@ pub fn simulate<
         //  detected at the next shared duplicate event
 
         let (new_time, new_steps) = simulation.simulate_incremental_early_stop(
-            |_, steps, _| {
+            |_, steps, _, _| {
                 if steps >= step_slice.get() {
                     ControlFlow::Break(())
                 } else {
diff --git a/necsim/impls/no-std/src/parallelisation/independent/landscape.rs b/necsim/impls/no-std/src/parallelisation/independent/landscape.rs
index 75c83085d..0177ec941 100644
--- a/necsim/impls/no-std/src/parallelisation/independent/landscape.rs
+++ b/necsim/impls/no-std/src/parallelisation/independent/landscape.rs
@@ -137,7 +137,7 @@ pub fn simulate<
         //  detected at the next shared duplicate event
 
         let (new_time, new_steps) = simulation.simulate_incremental_early_stop(
-            |_, steps, _| {
+            |_, steps, _, _| {
                 if steps >= step_slice.get() {
                     ControlFlow::Break(())
                 } else {
diff --git a/necsim/impls/no-std/src/parallelisation/independent/monolithic/mod.rs b/necsim/impls/no-std/src/parallelisation/independent/monolithic/mod.rs
index b3ac9a64c..faecd44c9 100644
--- a/necsim/impls/no-std/src/parallelisation/independent/monolithic/mod.rs
+++ b/necsim/impls/no-std/src/parallelisation/independent/monolithic/mod.rs
@@ -226,7 +226,7 @@ pub fn simulate<
             previous_next_event_time = None;
 
             let (new_time, new_steps) = simulation.simulate_incremental_early_stop(
-                |_, steps, next_event_time| {
+                |_, steps, next_event_time, _| {
                     previous_next_event_time = Some(next_event_time);
 
                     if steps >= step_slice.get() || next_event_time >= level_time {
diff --git a/necsim/impls/no-std/src/parallelisation/monolithic/averaging.rs b/necsim/impls/no-std/src/parallelisation/monolithic/averaging.rs
index 187ee4038..143b61156 100644
--- a/necsim/impls/no-std/src/parallelisation/monolithic/averaging.rs
+++ b/necsim/impls/no-std/src/parallelisation/monolithic/averaging.rs
@@ -78,7 +78,7 @@ pub fn simulate<
         let next_safe_time = global_safe_time + independent_time_slice;
 
         let (_, new_steps) = simulation.simulate_incremental_early_stop(
-            |_, _, next_event_time| {
+            |_, _, next_event_time, _| {
                 if next_event_time >= next_safe_time {
                     ControlFlow::Break(())
                 } else {
diff --git a/necsim/impls/no-std/src/parallelisation/monolithic/lockstep.rs b/necsim/impls/no-std/src/parallelisation/monolithic/lockstep.rs
index 174c2c358..2b29631aa 100644
--- a/necsim/impls/no-std/src/parallelisation/monolithic/lockstep.rs
+++ b/necsim/impls/no-std/src/parallelisation/monolithic/lockstep.rs
@@ -77,7 +77,7 @@ pub fn simulate<
         // Simulate for zero-steps (immediate early stop) without side effects
         //  to peek the next local event time
         simulation.simulate_incremental_early_stop(
-            |_, _, next_event_time| {
+            |_, _, next_event_time, _| {
                 next_local_time = Some(next_event_time);
 
                 ControlFlow::Break(())
@@ -102,7 +102,7 @@ pub fn simulate<
         // The partition with the next event gets to simulate just the next step
         if let Ok(next_global_time) = local_partition.reduce_vote_min_time(next_local_time) {
             let (_, new_steps) = simulation.simulate_incremental_early_stop(
-                |_, _, next_event_time| {
+                |_, _, next_event_time, _| {
                     if next_event_time > next_global_time {
                         ControlFlow::Break(())
                     } else {
diff --git a/necsim/impls/no-std/src/parallelisation/monolithic/monolithic.rs b/necsim/impls/no-std/src/parallelisation/monolithic/monolithic.rs
index 895344836..246e582df 100644
--- a/necsim/impls/no-std/src/parallelisation/monolithic/monolithic.rs
+++ b/necsim/impls/no-std/src/parallelisation/monolithic/monolithic.rs
@@ -69,7 +69,7 @@ pub fn simulate<
     //  ically later time
 
     let (time, steps) = simulation.simulate_incremental_early_stop(
-        |_, _, next_event_time| {
+        |_, _, next_event_time, _| {
             pause_before.map_or(ControlFlow::Continue(()), |pause_before| {
                 if next_event_time >= pause_before {
                     ControlFlow::Break(())
diff --git a/necsim/impls/no-std/src/parallelisation/monolithic/optimistic.rs b/necsim/impls/no-std/src/parallelisation/monolithic/optimistic.rs
index b12afb7d7..349c74164 100644
--- a/necsim/impls/no-std/src/parallelisation/monolithic/optimistic.rs
+++ b/necsim/impls/no-std/src/parallelisation/monolithic/optimistic.rs
@@ -109,7 +109,7 @@ pub fn simulate<
             // e.g. (1->2)|(2->3)|(3->1) => (1->2)|(3->1)
 
             let (_, new_steps) = simulation.simulate_incremental_early_stop(
-                |_, _, next_event_time| {
+                |_, _, next_event_time, _| {
                     if next_event_time >= next_safe_time {
                         ControlFlow::Break(())
                     } else {
diff --git a/necsim/impls/no-std/src/parallelisation/monolithic/optimistic_lockstep.rs b/necsim/impls/no-std/src/parallelisation/monolithic/optimistic_lockstep.rs
index 767e5cbb9..def28db5e 100644
--- a/necsim/impls/no-std/src/parallelisation/monolithic/optimistic_lockstep.rs
+++ b/necsim/impls/no-std/src/parallelisation/monolithic/optimistic_lockstep.rs
@@ -78,7 +78,7 @@ pub fn simulate<
         //  (we already know at least one partition has some next event time)
         let next_local_emigration_time = {
             let (_, new_steps) = simulation.simulate_incremental_early_stop(
-                |simulation, _, _| {
+                |simulation, _, _, _| {
                     if simulation.emigration_exit().is_empty() {
                         ControlFlow::Continue(())
                     } else {
@@ -115,7 +115,7 @@ pub fn simulate<
             //  that event
             Ok(next_global_time) => {
                 let (_, new_steps) = simulation.simulate_incremental_early_stop(
-                    |_, _, next_event_time| {
+                    |_, _, next_event_time, _| {
                         if next_event_time > next_global_time {
                             ControlFlow::Break(())
                         } else {
@@ -139,7 +139,7 @@ pub fn simulate<
             // All other partitions get to simulate until just before this next migration event
             Err(next_global_time) => {
                 let (_, new_steps) = simulation.simulate_incremental_early_stop(
-                    |_, _, next_event_time| {
+                    |_, _, next_event_time, _| {
                         if next_event_time >= next_global_time {
                             ControlFlow::Break(())
                         } else {
diff --git a/necsim/impls/std/src/event_log/replay/sorted_segments.rs b/necsim/impls/std/src/event_log/replay/sorted_segments.rs
index 2c209cd95..57c18b6e9 100644
--- a/necsim/impls/std/src/event_log/replay/sorted_segments.rs
+++ b/necsim/impls/std/src/event_log/replay/sorted_segments.rs
@@ -101,6 +101,7 @@ impl PartialOrd for SortedSortedSegments {
 }
 
 impl PartialEq for SortedSortedSegments {
+    #[allow(clippy::unconditional_recursion)]
     fn eq(&self, other: &Self) -> bool {
         self.next.eq(&other.next)
     }
diff --git a/necsim/partitioning/mpi/src/partition/mod.rs b/necsim/partitioning/mpi/src/partition/mod.rs
index 90055f711..d05940d3d 100644
--- a/necsim/partitioning/mpi/src/partition/mod.rs
+++ b/necsim/partitioning/mpi/src/partition/mod.rs
@@ -13,9 +13,9 @@ mod parallel;
 mod root;
 mod utils;
 
-#[allow(clippy::useless_attribute, clippy::module_name_repetitions)]
+#[allow(clippy::module_name_repetitions)]
 pub use parallel::MpiParallelPartition;
-#[allow(clippy::useless_attribute, clippy::module_name_repetitions)]
+#[allow(clippy::module_name_repetitions)]
 pub use root::MpiRootPartition;
 
 #[allow(clippy::module_name_repetitions)]
diff --git a/necsim/plugins/core/src/import/combinator.rs b/necsim/plugins/core/src/import/combinator.rs
index d948c5e3e..a99fb5784 100644
--- a/necsim/plugins/core/src/import/combinator.rs
+++ b/necsim/plugins/core/src/import/combinator.rs
@@ -3,7 +3,6 @@ use std::{
     iter::{FromIterator, IntoIterator},
     marker::PhantomData,
     path::Path,
-    rc::Rc,
 };
 
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
@@ -68,7 +67,8 @@ impl<ReportSpeciation: Boolean, ReportDispersal: Boolean, ReportProgress: Boolea
         let libraries = self
             .plugins
             .iter()
-            .map(|plugin| Rc::clone(&plugin.library))
+            .map(|plugin| &plugin.library)
+            .cloned()
             .collect::<Vec<_>>();
 
         let result = inner(self);
diff --git a/rust-toolchain b/rust-toolchain
index 73b9f40a2..218c6dd39 100644
--- a/rust-toolchain
+++ b/rust-toolchain
@@ -1,5 +1,5 @@
 [toolchain]
 # Pin to final 1.79.0 nightly
 channel = "nightly-2024-04-28"
-components = [ "cargo", "rustfmt", "clippy", "rust-src" ]
+components = [ "cargo", "rustfmt", "clippy", "rust-src", "llvm-bitcode-linker", "llvm-tools" ]
 targets = [ "x86_64-unknown-linux-gnu", "nvptx64-nvidia-cuda" ]
diff --git a/rustcoalescence/algorithms/cuda/Cargo.toml b/rustcoalescence/algorithms/cuda/Cargo.toml
index be7135a5c..1eeeba998 100644
--- a/rustcoalescence/algorithms/cuda/Cargo.toml
+++ b/rustcoalescence/algorithms/cuda/Cargo.toml
@@ -32,4 +32,4 @@ thiserror = "1.0"
 serde = { version = "1.0", features = ["derive"] }
 serde_state = "0.4"
 serde_derive_state = "0.4"
-rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "f395253", features = ["host"] }
+rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "697dcf5", features = ["host"] }
diff --git a/rustcoalescence/algorithms/cuda/cpu-kernel/Cargo.toml b/rustcoalescence/algorithms/cuda/cpu-kernel/Cargo.toml
index bab5e740e..e5735deed 100644
--- a/rustcoalescence/algorithms/cuda/cpu-kernel/Cargo.toml
+++ b/rustcoalescence/algorithms/cuda/cpu-kernel/Cargo.toml
@@ -23,4 +23,4 @@ necsim-impls-no-std = { path = "../../../../necsim/impls/no-std", features = ["c
 necsim-impls-cuda = { path = "../../../../necsim/impls/cuda" }
 rustcoalescence-algorithms-cuda-gpu-kernel = { path = "../gpu-kernel" }
 
-rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "f395253", features = ["host"] }
+rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "697dcf5", features = ["host"] }
diff --git a/rustcoalescence/algorithms/cuda/cpu-kernel/src/lib.rs b/rustcoalescence/algorithms/cuda/cpu-kernel/src/lib.rs
index 5c908339e..8f206ab43 100644
--- a/rustcoalescence/algorithms/cuda/cpu-kernel/src/lib.rs
+++ b/rustcoalescence/algorithms/cuda/cpu-kernel/src/lib.rs
@@ -1,6 +1,5 @@
 #![deny(clippy::pedantic)]
-#![allow(incomplete_features)]
-#![feature(specialization)]
+#![allow(long_running_const_eval)]
 #![recursion_limit = "1024"]
 
 use necsim_core::{
@@ -16,145 +15,29 @@ use necsim_impls_no_std::cogs::{
     event_sampler::tracking::MinSpeciationTrackingEventSampler,
 };
 
-use rust_cuda::{
-    common::RustToCuda,
-    host::{CudaDropWrapper, LaunchConfig, LaunchPackage, Launcher, TypedKernel},
-    rustacuda::{
-        error::CudaResult,
-        function::{BlockSize, Function, GridSize},
-        stream::Stream,
-    },
-};
-
-use rustcoalescence_algorithms_cuda_gpu_kernel::SimulatableKernel;
+use rust_cuda::lend::RustToCuda;
 
 mod link;
 mod patch;
 
-pub type KernelCompilationCallback = dyn FnMut(&Function) -> CudaResult<()>;
-
-#[allow(clippy::module_name_repetitions)]
-pub struct SimulationKernel<
-    M: MathsCore,
-    H: Habitat<M> + RustToCuda,
-    G: PrimeableRng<M> + RustToCuda,
-    S: LineageStore<M, H> + RustToCuda,
-    X: EmigrationExit<M, H, G, S> + RustToCuda,
-    D: DispersalSampler<M, H, G> + RustToCuda,
-    C: CoalescenceSampler<M, H, S> + RustToCuda,
-    T: TurnoverRate<M, H> + RustToCuda,
-    N: SpeciationProbability<M, H> + RustToCuda,
-    E: MinSpeciationTrackingEventSampler<M, H, G, S, X, D, C, T, N> + RustToCuda,
-    I: ImmigrationEntry<M> + RustToCuda,
-    A: SingularActiveLineageSampler<M, H, G, S, X, D, C, T, N, E, I> + RustToCuda,
+#[allow(clippy::type_complexity)]
+pub struct SimulationKernelPtx<
+    M: MathsCore + Sync,
+    H: Habitat<M> + RustToCuda + Sync,
+    G: PrimeableRng<M> + RustToCuda + Sync,
+    S: LineageStore<M, H> + RustToCuda + Sync,
+    X: EmigrationExit<M, H, G, S> + RustToCuda + Sync,
+    D: DispersalSampler<M, H, G> + RustToCuda + Sync,
+    C: CoalescenceSampler<M, H, S> + RustToCuda + Sync,
+    T: TurnoverRate<M, H> + RustToCuda + Sync,
+    N: SpeciationProbability<M, H> + RustToCuda + Sync,
+    E: MinSpeciationTrackingEventSampler<M, H, G, S, X, D, C, T, N> + RustToCuda + Sync,
+    I: ImmigrationEntry<M> + RustToCuda + Sync,
+    A: SingularActiveLineageSampler<M, H, G, S, X, D, C, T, N, E, I> + RustToCuda + Sync,
     ReportSpeciation: Boolean,
     ReportDispersal: Boolean,
-> {
-    #[allow(clippy::type_complexity)]
-    kernel: TypedKernel<
-        dyn SimulatableKernel<
-            M,
-            H,
-            G,
-            S,
-            X,
-            D,
-            C,
-            T,
-            N,
-            E,
-            I,
-            A,
-            ReportSpeciation,
-            ReportDispersal,
-        >,
-    >,
-    stream: CudaDropWrapper<Stream>,
-    grid: GridSize,
-    block: BlockSize,
-    ptx_jit: bool,
-    watcher: Box<KernelCompilationCallback>,
-}
-
-impl<
-        M: MathsCore,
-        H: Habitat<M> + RustToCuda,
-        G: PrimeableRng<M> + RustToCuda,
-        S: LineageStore<M, H> + RustToCuda,
-        X: EmigrationExit<M, H, G, S> + RustToCuda,
-        D: DispersalSampler<M, H, G> + RustToCuda,
-        C: CoalescenceSampler<M, H, S> + RustToCuda,
-        T: TurnoverRate<M, H> + RustToCuda,
-        N: SpeciationProbability<M, H> + RustToCuda,
-        E: MinSpeciationTrackingEventSampler<M, H, G, S, X, D, C, T, N> + RustToCuda,
-        I: ImmigrationEntry<M> + RustToCuda,
-        A: SingularActiveLineageSampler<M, H, G, S, X, D, C, T, N, E, I> + RustToCuda,
-        ReportSpeciation: Boolean,
-        ReportDispersal: Boolean,
-    > SimulationKernel<M, H, G, S, X, D, C, T, N, E, I, A, ReportSpeciation, ReportDispersal>
-{
-    /// # Errors
-    ///
-    /// Returns a `CudaError` if loading the CUDA kernel failed.
-    pub fn try_new(
-        stream: Stream,
-        grid: GridSize,
-        block: BlockSize,
-        ptx_jit: bool,
-        on_compile: Box<KernelCompilationCallback>,
-    ) -> CudaResult<Self>
-    where
-        Self: SimulatableKernel<
-            M,
-            H,
-            G,
-            S,
-            X,
-            D,
-            C,
-            T,
-            N,
-            E,
-            I,
-            A,
-            ReportSpeciation,
-            ReportDispersal,
-        >,
-    {
-        let stream = CudaDropWrapper::from(stream);
-        let kernel = Self::new_kernel()?;
-
-        Ok(Self {
-            kernel,
-            stream,
-            grid,
-            block,
-            ptx_jit,
-            watcher: on_compile,
-        })
-    }
-}
-
-impl<
-        M: MathsCore,
-        H: Habitat<M> + RustToCuda,
-        G: PrimeableRng<M> + RustToCuda,
-        S: LineageStore<M, H> + RustToCuda,
-        X: EmigrationExit<M, H, G, S> + RustToCuda,
-        D: DispersalSampler<M, H, G> + RustToCuda,
-        C: CoalescenceSampler<M, H, S> + RustToCuda,
-        T: TurnoverRate<M, H> + RustToCuda,
-        N: SpeciationProbability<M, H> + RustToCuda,
-        E: MinSpeciationTrackingEventSampler<M, H, G, S, X, D, C, T, N> + RustToCuda,
-        I: ImmigrationEntry<M> + RustToCuda,
-        A: SingularActiveLineageSampler<M, H, G, S, X, D, C, T, N, E, I> + RustToCuda,
-        ReportSpeciation: Boolean,
-        ReportDispersal: Boolean,
-    > Launcher
-    for SimulationKernel<M, H, G, S, X, D, C, T, N, E, I, A, ReportSpeciation, ReportDispersal>
-{
-    type CompilationWatcher = Box<KernelCompilationCallback>;
-    type KernelTraitObject = dyn SimulatableKernel<
+>(
+    std::marker::PhantomData<(
         M,
         H,
         G,
@@ -169,25 +52,5 @@ impl<
         A,
         ReportSpeciation,
         ReportDispersal,
-    >;
-
-    fn get_launch_package(&mut self) -> LaunchPackage<Self> {
-        LaunchPackage {
-            config: LaunchConfig {
-                grid: self.grid.clone(),
-                block: self.block.clone(),
-                shared_memory_size: 0_u32,
-                ptx_jit: self.ptx_jit,
-            },
-
-            kernel: &mut self.kernel,
-            stream: &mut self.stream,
-
-            watcher: &mut self.watcher,
-        }
-    }
-
-    fn on_compile(kernel: &Function, watcher: &mut Self::CompilationWatcher) -> CudaResult<()> {
-        (watcher)(kernel)
-    }
-}
+    )>,
+);
diff --git a/rustcoalescence/algorithms/cuda/cpu-kernel/src/link.rs b/rustcoalescence/algorithms/cuda/cpu-kernel/src/link.rs
index ee77a5d11..c4a99ac6f 100644
--- a/rustcoalescence/algorithms/cuda/cpu-kernel/src/link.rs
+++ b/rustcoalescence/algorithms/cuda/cpu-kernel/src/link.rs
@@ -11,34 +11,26 @@ use necsim_impls_no_std::cogs::{
     event_sampler::tracking::MinSpeciationTrackingEventSampler,
 };
 
-use rust_cuda::{
-    common::RustToCuda,
-    host::{LaunchConfig, LaunchPackage, Launcher},
-    rustacuda::{error::CudaResult, function::Function},
-};
-
-#[allow(unused_imports)]
-use rustcoalescence_algorithms_cuda_gpu_kernel::{SimulatableKernel, SimulationKernelArgs};
+use rust_cuda::lend::RustToCuda;
 
-#[repr(transparent)]
-pub struct SimulationKernel<
-    M: MathsCore,
-    H: Habitat<M> + RustToCuda,
-    G: PrimeableRng<M> + RustToCuda,
-    S: LineageStore<M, H> + RustToCuda,
-    X: EmigrationExit<M, H, G, S> + RustToCuda,
-    D: DispersalSampler<M, H, G> + RustToCuda,
-    C: CoalescenceSampler<M, H, S> + RustToCuda,
-    T: TurnoverRate<M, H> + RustToCuda,
-    N: SpeciationProbability<M, H> + RustToCuda,
-    E: MinSpeciationTrackingEventSampler<M, H, G, S, X, D, C, T, N> + RustToCuda,
-    I: ImmigrationEntry<M> + RustToCuda,
-    A: SingularActiveLineageSampler<M, H, G, S, X, D, C, T, N, E, I> + RustToCuda,
+#[allow(clippy::type_complexity)]
+pub struct SimulationKernelPtx<
+    M: MathsCore + Sync,
+    H: Habitat<M> + RustToCuda + Sync,
+    G: PrimeableRng<M> + RustToCuda + Sync,
+    S: LineageStore<M, H> + RustToCuda + Sync,
+    X: EmigrationExit<M, H, G, S> + RustToCuda + Sync,
+    D: DispersalSampler<M, H, G> + RustToCuda + Sync,
+    C: CoalescenceSampler<M, H, S> + RustToCuda + Sync,
+    T: TurnoverRate<M, H> + RustToCuda + Sync,
+    N: SpeciationProbability<M, H> + RustToCuda + Sync,
+    E: MinSpeciationTrackingEventSampler<M, H, G, S, X, D, C, T, N> + RustToCuda + Sync,
+    I: ImmigrationEntry<M> + RustToCuda + Sync,
+    A: SingularActiveLineageSampler<M, H, G, S, X, D, C, T, N, E, I> + RustToCuda + Sync,
     ReportSpeciation: Boolean,
     ReportDispersal: Boolean,
 >(
-    #[allow(clippy::type_complexity)]
-    pub(crate)  crate::SimulationKernel<
+    std::marker::PhantomData<(
         M,
         H,
         G,
@@ -53,66 +45,9 @@ pub struct SimulationKernel<
         A,
         ReportSpeciation,
         ReportDispersal,
-    >,
+    )>,
 );
 
-impl<
-        M: MathsCore,
-        H: Habitat<M> + RustToCuda,
-        G: PrimeableRng<M> + RustToCuda,
-        S: LineageStore<M, H> + RustToCuda,
-        X: EmigrationExit<M, H, G, S> + RustToCuda,
-        D: DispersalSampler<M, H, G> + RustToCuda,
-        C: CoalescenceSampler<M, H, S> + RustToCuda,
-        T: TurnoverRate<M, H> + RustToCuda,
-        N: SpeciationProbability<M, H> + RustToCuda,
-        E: MinSpeciationTrackingEventSampler<M, H, G, S, X, D, C, T, N> + RustToCuda,
-        I: ImmigrationEntry<M> + RustToCuda,
-        A: SingularActiveLineageSampler<M, H, G, S, X, D, C, T, N, E, I> + RustToCuda,
-        ReportSpeciation: Boolean,
-        ReportDispersal: Boolean,
-    > Launcher
-    for SimulationKernel<M, H, G, S, X, D, C, T, N, E, I, A, ReportSpeciation, ReportDispersal>
-{
-    type CompilationWatcher = Box<crate::KernelCompilationCallback>;
-    type KernelTraitObject = dyn SimulatableKernel<
-        M,
-        H,
-        G,
-        S,
-        X,
-        D,
-        C,
-        T,
-        N,
-        E,
-        I,
-        A,
-        ReportSpeciation,
-        ReportDispersal,
-    >;
-
-    fn get_launch_package(&mut self) -> LaunchPackage<Self> {
-        LaunchPackage {
-            config: LaunchConfig {
-                grid: self.0.grid.clone(),
-                block: self.0.block.clone(),
-                shared_memory_size: 0_u32,
-                ptx_jit: self.0.ptx_jit,
-            },
-
-            kernel: &mut self.0.kernel,
-            stream: &mut self.0.stream,
-
-            watcher: &mut self.0.watcher,
-        }
-    }
-
-    fn on_compile(kernel: &Function, watcher: &mut Self::CompilationWatcher) -> CudaResult<()> {
-        (watcher)(kernel)
-    }
-}
-
 #[allow(unused_macros)]
 macro_rules! link_kernel {
     ($habitat:ty, $dispersal:ty, $turnover:ty, $speciation:ty) => {
@@ -141,7 +76,7 @@ macro_rules! link_kernel {
         $habitat:ty, $dispersal:ty, $turnover:ty, $speciation:ty,
         $report_speciation:ty, $report_dispersal:ty
     ) => {
-        rustcoalescence_algorithms_cuda_gpu_kernel::link_kernel!(
+        rustcoalescence_algorithms_cuda_gpu_kernel::link! { impl simulate<
             necsim_impls_cuda::cogs::maths::NvptxMathsCore,
             $habitat,
             necsim_impls_cuda::cogs::rng::CudaRng<
@@ -194,9 +129,9 @@ macro_rules! link_kernel {
             >,
             $report_speciation,
             $report_dispersal,
-        );
+        > for SimulationKernelPtx }
 
-        rustcoalescence_algorithms_cuda_gpu_kernel::link_kernel!(
+        rustcoalescence_algorithms_cuda_gpu_kernel::link! { impl simulate<
             necsim_impls_cuda::cogs::maths::NvptxMathsCore,
             $habitat,
             necsim_impls_cuda::cogs::rng::CudaRng<
@@ -309,7 +244,7 @@ macro_rules! link_kernel {
             >,
             $report_speciation,
             $report_dispersal,
-        );
+        > for SimulationKernelPtx }
     };
 }
 
diff --git a/rustcoalescence/algorithms/cuda/cpu-kernel/src/patch.rs b/rustcoalescence/algorithms/cuda/cpu-kernel/src/patch.rs
index e9bb646b6..04404ad9b 100644
--- a/rustcoalescence/algorithms/cuda/cpu-kernel/src/patch.rs
+++ b/rustcoalescence/algorithms/cuda/cpu-kernel/src/patch.rs
@@ -1,239 +1,61 @@
-use std::sync::atomic::AtomicU64;
+use std::ffi::CStr;
 
 use necsim_core::{
     cogs::{
         CoalescenceSampler, DispersalSampler, EmigrationExit, Habitat, ImmigrationEntry,
         LineageStore, MathsCore, PrimeableRng, SpeciationProbability, TurnoverRate,
     },
-    lineage::Lineage,
     reporter::boolean::{Boolean, False, True},
-    simulation::Simulation,
 };
-use necsim_core_bond::{NonNegativeF64, PositiveF64};
-use necsim_impls_cuda::{event_buffer::EventBuffer, value_buffer::ValueBuffer};
 use necsim_impls_no_std::cogs::{
     active_lineage_sampler::singular::SingularActiveLineageSampler,
-    event_sampler::tracking::{MinSpeciationTrackingEventSampler, SpeciationSample},
+    event_sampler::tracking::MinSpeciationTrackingEventSampler,
 };
 
-use rust_cuda::{
-    common::{DeviceAccessible, RustToCuda},
-    host::{HostAndDeviceConstRef, HostAndDeviceMutRef, TypedKernel},
-    rustacuda::error::CudaResult,
-    utils::device_copy::SafeDeviceCopyWrapper,
-};
+use rust_cuda::{kernel::CompiledKernelPtx, lend::RustToCuda};
+
+use rustcoalescence_algorithms_cuda_gpu_kernel::simulate;
 
-use rustcoalescence_algorithms_cuda_gpu_kernel::SimulatableKernel;
+use crate::SimulationKernelPtx;
 
-use crate::SimulationKernel;
+// If `Kernel` is implemented for `ReportSpeciation` x `ReportDispersal`, i.e.
+//  for {`False`, `True`} x {`False`, `True`} then it is implemented for all
+//  `Boolean`s. However, Rust does not recognise that `Boolean` is closed over
+//  {`False`, `True`}. This explicit impl provides the necessary coercion.
 
-#[allow(clippy::missing_transmute_annotations, clippy::too_many_lines)]
 unsafe impl<
-        M: MathsCore,
-        H: Habitat<M> + RustToCuda,
-        G: PrimeableRng<M> + RustToCuda,
-        S: LineageStore<M, H> + RustToCuda,
-        X: EmigrationExit<M, H, G, S> + RustToCuda,
-        D: DispersalSampler<M, H, G> + RustToCuda,
-        C: CoalescenceSampler<M, H, S> + RustToCuda,
-        T: TurnoverRate<M, H> + RustToCuda,
-        N: SpeciationProbability<M, H> + RustToCuda,
-        E: MinSpeciationTrackingEventSampler<M, H, G, S, X, D, C, T, N> + RustToCuda,
-        I: ImmigrationEntry<M> + RustToCuda,
-        A: SingularActiveLineageSampler<M, H, G, S, X, D, C, T, N, E, I> + RustToCuda,
+        M: MathsCore + Sync,
+        H: Habitat<M> + RustToCuda + Sync,
+        G: PrimeableRng<M> + RustToCuda + Sync,
+        S: LineageStore<M, H> + RustToCuda + Sync,
+        X: EmigrationExit<M, H, G, S> + RustToCuda + Sync,
+        D: DispersalSampler<M, H, G> + RustToCuda + Sync,
+        C: CoalescenceSampler<M, H, S> + RustToCuda + Sync,
+        T: TurnoverRate<M, H> + RustToCuda + Sync,
+        N: SpeciationProbability<M, H> + RustToCuda + Sync,
+        E: MinSpeciationTrackingEventSampler<M, H, G, S, X, D, C, T, N> + RustToCuda + Sync,
+        I: ImmigrationEntry<M> + RustToCuda + Sync,
+        A: SingularActiveLineageSampler<M, H, G, S, X, D, C, T, N, E, I> + RustToCuda + Sync,
         ReportSpeciation: Boolean,
         ReportDispersal: Boolean,
-    > SimulatableKernel<M, H, G, S, X, D, C, T, N, E, I, A, ReportSpeciation, ReportDispersal>
-    for SimulationKernel<M, H, G, S, X, D, C, T, N, E, I, A, ReportSpeciation, ReportDispersal>
+    >
+    CompiledKernelPtx<
+        simulate<M, H, G, S, X, D, C, T, N, E, I, A, ReportSpeciation, ReportDispersal>,
+    > for SimulationKernelPtx<M, H, G, S, X, D, C, T, N, E, I, A, ReportSpeciation, ReportDispersal>
 where
-    crate::link::SimulationKernel<M, H, G, S, X, D, C, T, N, E, I, A, False, False>:
-        SimulatableKernel<M, H, G, S, X, D, C, T, N, E, I, A, False, False>,
-    crate::link::SimulationKernel<M, H, G, S, X, D, C, T, N, E, I, A, False, True>:
-        SimulatableKernel<M, H, G, S, X, D, C, T, N, E, I, A, False, True>,
-    crate::link::SimulationKernel<M, H, G, S, X, D, C, T, N, E, I, A, True, False>:
-        SimulatableKernel<M, H, G, S, X, D, C, T, N, E, I, A, True, False>,
-    crate::link::SimulationKernel<M, H, G, S, X, D, C, T, N, E, I, A, True, True>:
-        SimulatableKernel<M, H, G, S, X, D, C, T, N, E, I, A, True, True>,
+    crate::link::SimulationKernelPtx<M, H, G, S, X, D, C, T, N, E, I, A, False, False>:
+        CompiledKernelPtx<simulate<M, H, G, S, X, D, C, T, N, E, I, A, False, False>>,
+    crate::link::SimulationKernelPtx<M, H, G, S, X, D, C, T, N, E, I, A, False, True>:
+        CompiledKernelPtx<simulate<M, H, G, S, X, D, C, T, N, E, I, A, False, True>>,
+    crate::link::SimulationKernelPtx<M, H, G, S, X, D, C, T, N, E, I, A, True, False>:
+        CompiledKernelPtx<simulate<M, H, G, S, X, D, C, T, N, E, I, A, True, False>>,
+    crate::link::SimulationKernelPtx<M, H, G, S, X, D, C, T, N, E, I, A, True, True>:
+        CompiledKernelPtx<simulate<M, H, G, S, X, D, C, T, N, E, I, A, True, True>>,
 {
-    fn get_ptx_str() -> &'static str {
-        match (ReportSpeciation::VALUE, ReportDispersal::VALUE) {
-            (false, false) => crate::link::SimulationKernel::<
-                M,
-                H,
-                G,
-                S,
-                X,
-                D,
-                C,
-                T,
-                N,
-                E,
-                I,
-                A,
-                False,
-                False,
-            >::get_ptx_str(),
-            (false, true) => crate::link::SimulationKernel::<
-                M,
-                H,
-                G,
-                S,
-                X,
-                D,
-                C,
-                T,
-                N,
-                E,
-                I,
-                A,
-                False,
-                True,
-            >::get_ptx_str(),
-            (true, false) => crate::link::SimulationKernel::<
-                M,
-                H,
-                G,
-                S,
-                X,
-                D,
-                C,
-                T,
-                N,
-                E,
-                I,
-                A,
-                True,
-                False,
-            >::get_ptx_str(),
-            (true, true) => crate::link::SimulationKernel::<
-                M,
-                H,
-                G,
-                S,
-                X,
-                D,
-                C,
-                T,
-                N,
-                E,
-                I,
-                A,
-                True,
-                True,
-            >::get_ptx_str(),
-        }
-    }
-
-    fn new_kernel() -> CudaResult<
-        TypedKernel<
-            dyn SimulatableKernel<
-                M,
-                H,
-                G,
-                S,
-                X,
-                D,
-                C,
-                T,
-                N,
-                E,
-                I,
-                A,
-                ReportSpeciation,
-                ReportDispersal,
-            >,
-        >,
-    > {
-        match (ReportSpeciation::VALUE, ReportDispersal::VALUE) {
-            (false, false) => unsafe {
-                std::mem::transmute(crate::link::SimulationKernel::<
-                    M,
-                    H,
-                    G,
-                    S,
-                    X,
-                    D,
-                    C,
-                    T,
-                    N,
-                    E,
-                    I,
-                    A,
-                    False,
-                    False,
-                >::new_kernel())
-            },
-            (false, true) => unsafe {
-                std::mem::transmute(crate::link::SimulationKernel::<
-                    M,
-                    H,
-                    G,
-                    S,
-                    X,
-                    D,
-                    C,
-                    T,
-                    N,
-                    E,
-                    I,
-                    A,
-                    False,
-                    True,
-                >::new_kernel())
-            },
-            (true, false) => unsafe {
-                std::mem::transmute(crate::link::SimulationKernel::<
-                    M,
-                    H,
-                    G,
-                    S,
-                    X,
-                    D,
-                    C,
-                    T,
-                    N,
-                    E,
-                    I,
-                    A,
-                    True,
-                    False,
-                >::new_kernel())
-            },
-            (true, true) => unsafe {
-                std::mem::transmute(crate::link::SimulationKernel::<
-                    M,
-                    H,
-                    G,
-                    S,
-                    X,
-                    D,
-                    C,
-                    T,
-                    N,
-                    E,
-                    I,
-                    A,
-                    True,
-                    True,
-                >::new_kernel())
-            },
-        }
-    }
-
-    fn simulate(
-        &mut self,
-        simulation: &mut Simulation<M, H, G, S, X, D, C, T, N, E, I, A>,
-        task_list: &mut ValueBuffer<Lineage, true, true>,
-        event_buffer_reporter: &mut EventBuffer<ReportSpeciation, ReportDispersal>,
-        min_spec_sample_buffer: &mut ValueBuffer<SpeciationSample, false, true>,
-        next_event_time_buffer: &mut ValueBuffer<PositiveF64, false, true>,
-        total_time_max: &AtomicU64,
-        total_steps_sum: &AtomicU64,
-        max_steps: u64,
-        max_next_event_time: NonNegativeF64,
-    ) -> CudaResult<()> {
+    #[inline]
+    fn get_ptx() -> &'static CStr {
         match (ReportSpeciation::VALUE, ReportDispersal::VALUE) {
-            (false, false) => crate::link::SimulationKernel::<
+            (false, false) => crate::link::SimulationKernelPtx::<
                 M,
                 H,
                 G,
@@ -248,19 +70,8 @@ where
                 A,
                 False,
                 False,
-            >::simulate(
-                unsafe { &mut *std::ptr::from_mut(self).cast() },
-                simulation,
-                task_list,
-                unsafe { &mut *std::ptr::from_mut(event_buffer_reporter).cast() },
-                min_spec_sample_buffer,
-                next_event_time_buffer,
-                total_time_max,
-                total_steps_sum,
-                max_steps,
-                max_next_event_time,
-            ),
-            (false, true) => crate::link::SimulationKernel::<
+            >::get_ptx(),
+            (false, true) => crate::link::SimulationKernelPtx::<
                 M,
                 H,
                 G,
@@ -275,19 +86,8 @@ where
                 A,
                 False,
                 True,
-            >::simulate(
-                unsafe { &mut *std::ptr::from_mut(self).cast() },
-                simulation,
-                task_list,
-                unsafe { &mut *std::ptr::from_mut(event_buffer_reporter).cast() },
-                min_spec_sample_buffer,
-                next_event_time_buffer,
-                total_time_max,
-                total_steps_sum,
-                max_steps,
-                max_next_event_time,
-            ),
-            (true, false) => crate::link::SimulationKernel::<
+            >::get_ptx(),
+            (true, false) => crate::link::SimulationKernelPtx::<
                 M,
                 H,
                 G,
@@ -302,19 +102,8 @@ where
                 A,
                 True,
                 False,
-            >::simulate(
-                unsafe { &mut *std::ptr::from_mut(self).cast() },
-                simulation,
-                task_list,
-                unsafe { &mut *std::ptr::from_mut(event_buffer_reporter).cast() },
-                min_spec_sample_buffer,
-                next_event_time_buffer,
-                total_time_max,
-                total_steps_sum,
-                max_steps,
-                max_next_event_time,
-            ),
-            (true, true) => crate::link::SimulationKernel::<
+            >::get_ptx(),
+            (true, true) => crate::link::SimulationKernelPtx::<
                 M,
                 H,
                 G,
@@ -329,53 +118,14 @@ where
                 A,
                 True,
                 True,
-            >::simulate(
-                unsafe { &mut *std::ptr::from_mut(self).cast() },
-                simulation,
-                task_list,
-                unsafe { &mut *std::ptr::from_mut(event_buffer_reporter).cast() },
-                min_spec_sample_buffer,
-                next_event_time_buffer,
-                total_time_max,
-                total_steps_sum,
-                max_steps,
-                max_next_event_time,
-            ),
+            >::get_ptx(),
         }
     }
 
-    fn simulate_raw(
-        &mut self,
-        simulation: HostAndDeviceMutRef<
-            DeviceAccessible<
-                <Simulation<M, H, G, S, X, D, C, T, N, E, I, A> as RustToCuda>::CudaRepresentation,
-            >,
-        >,
-        task_list: HostAndDeviceMutRef<
-            DeviceAccessible<<ValueBuffer<Lineage, true, true> as RustToCuda>::CudaRepresentation>,
-        >,
-        event_buffer_reporter: HostAndDeviceMutRef<
-            DeviceAccessible<
-                <EventBuffer<ReportSpeciation, ReportDispersal> as RustToCuda>::CudaRepresentation,
-            >,
-        >,
-        min_spec_sample_buffer: HostAndDeviceMutRef<
-            DeviceAccessible<
-                <ValueBuffer<SpeciationSample, false, true> as RustToCuda>::CudaRepresentation,
-            >,
-        >,
-        next_event_time_buffer: HostAndDeviceMutRef<
-            DeviceAccessible<
-                <ValueBuffer<PositiveF64, false, true> as RustToCuda>::CudaRepresentation,
-            >,
-        >,
-        total_time_max: HostAndDeviceConstRef<SafeDeviceCopyWrapper<AtomicU64>>,
-        total_steps_sum: HostAndDeviceConstRef<SafeDeviceCopyWrapper<AtomicU64>>,
-        max_steps: SafeDeviceCopyWrapper<u64>,
-        max_next_event_time: SafeDeviceCopyWrapper<NonNegativeF64>,
-    ) -> CudaResult<()> {
+    #[inline]
+    fn get_entry_point() -> &'static CStr {
         match (ReportSpeciation::VALUE, ReportDispersal::VALUE) {
-            (false, false) => crate::link::SimulationKernel::<
+            (false, false) => crate::link::SimulationKernelPtx::<
                 M,
                 H,
                 G,
@@ -390,19 +140,8 @@ where
                 A,
                 False,
                 False,
-            >::simulate_raw(
-                unsafe { &mut *std::ptr::from_mut(self).cast() },
-                simulation,
-                task_list,
-                unsafe { std::mem::transmute(event_buffer_reporter) },
-                min_spec_sample_buffer,
-                next_event_time_buffer,
-                total_time_max,
-                total_steps_sum,
-                max_steps,
-                max_next_event_time,
-            ),
-            (false, true) => crate::link::SimulationKernel::<
+            >::get_entry_point(),
+            (false, true) => crate::link::SimulationKernelPtx::<
                 M,
                 H,
                 G,
@@ -417,19 +156,8 @@ where
                 A,
                 False,
                 True,
-            >::simulate_raw(
-                unsafe { &mut *std::ptr::from_mut(self).cast() },
-                simulation,
-                task_list,
-                unsafe { std::mem::transmute(event_buffer_reporter) },
-                min_spec_sample_buffer,
-                next_event_time_buffer,
-                total_time_max,
-                total_steps_sum,
-                max_steps,
-                max_next_event_time,
-            ),
-            (true, false) => crate::link::SimulationKernel::<
+            >::get_entry_point(),
+            (true, false) => crate::link::SimulationKernelPtx::<
                 M,
                 H,
                 G,
@@ -444,19 +172,8 @@ where
                 A,
                 True,
                 False,
-            >::simulate_raw(
-                unsafe { &mut *std::ptr::from_mut(self).cast() },
-                simulation,
-                task_list,
-                unsafe { std::mem::transmute(event_buffer_reporter) },
-                min_spec_sample_buffer,
-                next_event_time_buffer,
-                total_time_max,
-                total_steps_sum,
-                max_steps,
-                max_next_event_time,
-            ),
-            (true, true) => crate::link::SimulationKernel::<
+            >::get_entry_point(),
+            (true, true) => crate::link::SimulationKernelPtx::<
                 M,
                 H,
                 G,
@@ -471,18 +188,7 @@ where
                 A,
                 True,
                 True,
-            >::simulate_raw(
-                unsafe { &mut *std::ptr::from_mut(self).cast() },
-                simulation,
-                task_list,
-                unsafe { std::mem::transmute(event_buffer_reporter) },
-                min_spec_sample_buffer,
-                next_event_time_buffer,
-                total_time_max,
-                total_steps_sum,
-                max_steps,
-                max_next_event_time,
-            ),
+            >::get_entry_point(),
         }
     }
 }
diff --git a/rustcoalescence/algorithms/cuda/gpu-kernel/.cargo/config.toml b/rustcoalescence/algorithms/cuda/gpu-kernel/.cargo/config.toml
index e310e544a..20ad0d47d 100644
--- a/rustcoalescence/algorithms/cuda/gpu-kernel/.cargo/config.toml
+++ b/rustcoalescence/algorithms/cuda/gpu-kernel/.cargo/config.toml
@@ -2,7 +2,7 @@
 pipelining = false
 
 [target.nvptx64-nvidia-cuda]
-rustflags = ["-Clink-args=--arch=sm_35", "-Cpanic=abort", "-Clinker-plugin-lto", "-Ccodegen-units=1", "-Clink-arg=-O3", "-Clink-arg=--lto"]
+rustflags = ["-Zunstable-options", "-Clinker-flavor=llbc", "-Ctarget-cpu=sm_35", "-Cpanic=abort", "-Ccodegen-units=1", "-Clink-arg=-O3"]
 
 [unstable]
 build-std = ["core", "alloc"]
diff --git a/rustcoalescence/algorithms/cuda/gpu-kernel/Cargo.toml b/rustcoalescence/algorithms/cuda/gpu-kernel/Cargo.toml
index a45a7c862..dce06b130 100644
--- a/rustcoalescence/algorithms/cuda/gpu-kernel/Cargo.toml
+++ b/rustcoalescence/algorithms/cuda/gpu-kernel/Cargo.toml
@@ -16,4 +16,8 @@ necsim-core-bond = { path = "../../../../necsim/core/bond" }
 necsim-impls-no-std = { path = "../../../../necsim/impls/no-std", features = ["cuda"] }
 necsim-impls-cuda = { path = "../../../../necsim/impls/cuda" }
 
-rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "f395253", features = ["derive"] }
+[target.'cfg(target_os = "cuda")'.dependencies]
+rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "697dcf5", features = ["derive", "device", "kernel"] }
+
+[target.'cfg(not(target_os = "cuda"))'.dependencies]
+rust-cuda = { git = "https://github.com/juntyr/rust-cuda", rev = "697dcf5", features = ["derive", "kernel"] }
diff --git a/rustcoalescence/algorithms/cuda/gpu-kernel/src/lib.rs b/rustcoalescence/algorithms/cuda/gpu-kernel/src/lib.rs
index 3365963fd..afb17efe7 100644
--- a/rustcoalescence/algorithms/cuda/gpu-kernel/src/lib.rs
+++ b/rustcoalescence/algorithms/cuda/gpu-kernel/src/lib.rs
@@ -1,12 +1,10 @@
 #![deny(clippy::pedantic)]
 #![no_std]
+#![feature(type_alias_impl_trait)]
+#![feature(decl_macro)]
 #![cfg_attr(target_os = "cuda", feature(abi_ptx))]
-#![cfg_attr(target_os = "cuda", feature(alloc_error_handler))]
-#![cfg_attr(target_os = "cuda", feature(panic_info_message))]
-#![cfg_attr(target_os = "cuda", feature(atomic_from_mut))]
 #![cfg_attr(target_os = "cuda", feature(asm_experimental_arch))]
-#![cfg_attr(target_os = "cuda", feature(stdarch_nvptx))]
-#![cfg_attr(target_os = "cuda", feature(control_flow_enum))]
+#![cfg_attr(target_os = "cuda", feature(alloc_error_handler))]
 #![allow(long_running_const_eval)]
 #![recursion_limit = "1024"]
 
@@ -14,81 +12,71 @@ extern crate alloc;
 
 #[cfg(target_os = "cuda")]
 use core::ops::ControlFlow;
+use core::sync::atomic::AtomicU64;
 
 use necsim_core::{
     cogs::{
         CoalescenceSampler, DispersalSampler, EmigrationExit, Habitat, ImmigrationEntry,
         LineageStore, MathsCore, PrimeableRng, SpeciationProbability, TurnoverRate,
     },
+    lineage::Lineage,
     reporter::boolean::Boolean,
+    simulation::Simulation,
 };
+use necsim_core_bond::{NonNegativeF64, PositiveF64};
 
+use necsim_impls_cuda::{event_buffer::EventBuffer, value_buffer::ValueBuffer};
 use necsim_impls_no_std::cogs::{
     active_lineage_sampler::singular::SingularActiveLineageSampler,
     event_sampler::tracking::{MinSpeciationTrackingEventSampler, SpeciationSample},
 };
 
-use rust_cuda::common::RustToCuda;
+use rust_cuda::{
+    kernel::param::{DeepPerThreadBorrow, PerThreadShallowCopy, PtxJit, ShallowInteriorMutable},
+    lend::RustToCuda,
+};
 
-#[rust_cuda::common::kernel(
-    pub use link_kernel! as impl SimulatableKernel<SimulationKernelArgs> for SimulationKernel
+#[rust_cuda::kernel::kernel(pub use link! for impl)]
+#[kernel(
+    allow(ptx::double_precision_use),
+    allow(ptx::local_memory_use), // FIXME
+    forbid(ptx::register_spills),
 )]
 #[allow(clippy::too_many_arguments)]
 #[allow(clippy::type_complexity)]
 pub fn simulate<
-    M: MathsCore,
-    H: Habitat<M> + RustToCuda,
-    G: PrimeableRng<M> + RustToCuda,
-    S: LineageStore<M, H> + RustToCuda,
-    X: EmigrationExit<M, H, G, S> + RustToCuda,
-    D: DispersalSampler<M, H, G> + RustToCuda,
-    C: CoalescenceSampler<M, H, S> + RustToCuda,
-    T: TurnoverRate<M, H> + RustToCuda,
-    N: SpeciationProbability<M, H> + RustToCuda,
-    E: MinSpeciationTrackingEventSampler<M, H, G, S, X, D, C, T, N> + RustToCuda,
-    I: ImmigrationEntry<M> + RustToCuda,
-    A: SingularActiveLineageSampler<M, H, G, S, X, D, C, T, N, E, I> + RustToCuda,
+    M: MathsCore + Sync,
+    H: Habitat<M> + RustToCuda + Sync,
+    G: PrimeableRng<M> + RustToCuda + Sync,
+    S: LineageStore<M, H> + RustToCuda + Sync,
+    X: EmigrationExit<M, H, G, S> + RustToCuda + Sync,
+    D: DispersalSampler<M, H, G> + RustToCuda + Sync,
+    C: CoalescenceSampler<M, H, S> + RustToCuda + Sync,
+    T: TurnoverRate<M, H> + RustToCuda + Sync,
+    N: SpeciationProbability<M, H> + RustToCuda + Sync,
+    E: MinSpeciationTrackingEventSampler<M, H, G, S, X, D, C, T, N> + RustToCuda + Sync,
+    I: ImmigrationEntry<M> + RustToCuda + Sync,
+    A: SingularActiveLineageSampler<M, H, G, S, X, D, C, T, N, E, I> + RustToCuda + Sync,
     ReportSpeciation: Boolean,
     ReportDispersal: Boolean,
 >(
-    #[rustfmt::skip]
-    #[kernel(pass = LendRustToCuda, jit)]
-    simulation: &mut ShallowCopy<
-        necsim_core::simulation::Simulation<M, H, G, S, X, D, C, T, N, E, I, A>,
-    >,
-    #[rustfmt::skip]
-    #[kernel(pass = LendRustToCuda, jit)]
-    task_list: &mut ShallowCopy<
-        necsim_impls_cuda::value_buffer::ValueBuffer<necsim_core::lineage::Lineage, true, true>,
-    >,
-    #[rustfmt::skip]
-    #[kernel(pass = LendRustToCuda, jit)]
-    event_buffer_reporter: &mut ShallowCopy<
-        necsim_impls_cuda::event_buffer::EventBuffer<ReportSpeciation, ReportDispersal>,
+    simulation: &PtxJit<DeepPerThreadBorrow<Simulation<M, H, G, S, X, D, C, T, N, E, I, A>>>,
+    task_list: &mut PtxJit<DeepPerThreadBorrow<ValueBuffer<Lineage, true, true>>>,
+    event_buffer_reporter: &mut PtxJit<
+        DeepPerThreadBorrow<EventBuffer<ReportSpeciation, ReportDispersal>>,
     >,
-    #[rustfmt::skip]
-    #[kernel(pass = LendRustToCuda, jit)]
-    min_spec_sample_buffer: &mut ShallowCopy<
-        necsim_impls_cuda::value_buffer::ValueBuffer<SpeciationSample, false, true>,
+    min_spec_sample_buffer: &mut PtxJit<
+        DeepPerThreadBorrow<ValueBuffer<SpeciationSample, false, true>>,
     >,
-    #[rustfmt::skip]
-    #[kernel(pass = LendRustToCuda, jit)]
-    next_event_time_buffer: &mut ShallowCopy<
-        necsim_impls_cuda::value_buffer::ValueBuffer<necsim_core_bond::PositiveF64, false, true>,
-    >,
-    #[rustfmt::skip]
-    #[kernel(pass = SafeDeviceCopy)]
-    total_time_max: &core::sync::atomic::AtomicU64,
-    #[rustfmt::skip]
-    #[kernel(pass = SafeDeviceCopy)]
-    total_steps_sum: &core::sync::atomic::AtomicU64,
-    #[rustfmt::skip]
-    #[kernel(pass = SafeDeviceCopy)]
-    max_steps: u64,
-    #[rustfmt::skip]
-    #[kernel(pass = SafeDeviceCopy)]
-    max_next_event_time: necsim_core_bond::NonNegativeF64,
+    next_event_time_buffer: &mut PtxJit<DeepPerThreadBorrow<ValueBuffer<PositiveF64, false, true>>>,
+    total_time_max: &ShallowInteriorMutable<AtomicU64>,
+    total_steps_sum: &ShallowInteriorMutable<AtomicU64>,
+    max_steps: PerThreadShallowCopy<u64>,
+    max_next_event_time: PerThreadShallowCopy<NonNegativeF64>,
 ) {
+    // TODO: use simulation with non-allocating clone
+    let mut simulation = unsafe { core::mem::ManuallyDrop::new(core::ptr::read(simulation)) };
+
     task_list.with_value_for_core(|task| {
         // Discard the prior task (the simulation is just a temporary local copy)
         core::mem::drop(
@@ -103,13 +91,16 @@ pub fn simulate<
         let mut final_next_event_time = None;
 
         let (time, steps) = simulation.simulate_incremental_early_stop(
-            |_, steps, next_event_time| {
+            |_, steps, next_event_time, reporter| {
                 final_next_event_time = Some(next_event_time);
 
-                if steps >= max_steps || next_event_time >= max_next_event_time {
-                    ControlFlow::Break(())
-                } else {
+                if steps < max_steps
+                    && next_event_time < max_next_event_time
+                    && reporter.can_buffer_next_event()
+                {
                     ControlFlow::Continue(())
+                } else {
+                    ControlFlow::Break(())
                 }
             },
             event_buffer_reporter,
@@ -133,37 +124,34 @@ pub fn simulate<
 
 #[cfg(target_os = "cuda")]
 mod cuda_prelude {
-    use core::arch::nvptx;
-
-    use rust_cuda::device::utils;
+    use rust_cuda::device::alloc::PTXAllocator;
 
     #[global_allocator]
-    static _GLOBAL_ALLOCATOR: utils::PTXAllocator = utils::PTXAllocator;
+    static _GLOBAL_ALLOCATOR: PTXAllocator = PTXAllocator;
 
     #[cfg(not(debug_assertions))]
     #[panic_handler]
     fn panic(_panic_info: &::core::panic::PanicInfo) -> ! {
-        unsafe { nvptx::trap() }
+        rust_cuda::device::utils::abort()
     }
 
     #[cfg(debug_assertions)]
     #[panic_handler]
-    fn panic(panic_info: &::core::panic::PanicInfo) -> ! {
-        use rust_cuda::println;
-
-        println!(
-            "Panic occurred at {:?}: {:?}!",
-            panic_info.location(),
-            panic_info
-                .message()
-                .unwrap_or(&format_args!("unknown reason"))
-        );
-
-        unsafe { nvptx::trap() }
+    fn panic(info: &::core::panic::PanicInfo) -> ! {
+        rust_cuda::device::utils::pretty_print_panic_info(info, true, true);
+        rust_cuda::device::utils::abort()
     }
 
+    #[cfg(not(debug_assertions))]
     #[alloc_error_handler]
     fn alloc_error_handler(_: core::alloc::Layout) -> ! {
-        unsafe { nvptx::trap() }
+        rust_cuda::device::utils::abort()
+    }
+
+    #[cfg(debug_assertions)]
+    #[alloc_error_handler]
+    fn alloc_error_handler(layout: core::alloc::Layout) -> ! {
+        rust_cuda::device::utils::pretty_print_alloc_error(layout);
+        rust_cuda::device::utils::abort()
     }
 }
diff --git a/rustcoalescence/algorithms/cuda/src/cuda.rs b/rustcoalescence/algorithms/cuda/src/cuda.rs
index c523bf2d2..d8222ebb1 100644
--- a/rustcoalescence/algorithms/cuda/src/cuda.rs
+++ b/rustcoalescence/algorithms/cuda/src/cuda.rs
@@ -1,4 +1,4 @@
-use rust_cuda::rustacuda::{
+use rust_cuda::deps::rustacuda::{
     context::{Context, CurrentContext, ResourceLimit},
     prelude::*,
 };
@@ -13,7 +13,7 @@ pub fn with_initialised_cuda<O, E: Into<CudaError>, F: FnOnce() -> Result<O, E>>
     inner: F,
 ) -> Result<O, CudaError> {
     // Initialize the CUDA API
-    rust_cuda::rustacuda::init(CudaFlags::empty())?;
+    rust_cuda::deps::rustacuda::init(CudaFlags::empty())?;
 
     // Get the first device
     let device = Device::get_device(device)?;
diff --git a/rustcoalescence/algorithms/cuda/src/error.rs b/rustcoalescence/algorithms/cuda/src/error.rs
index e69898247..f81a9e3c1 100644
--- a/rustcoalescence/algorithms/cuda/src/error.rs
+++ b/rustcoalescence/algorithms/cuda/src/error.rs
@@ -1,4 +1,4 @@
-use rust_cuda::rustacuda::error::CudaError as RustaCudaError;
+use rust_cuda::deps::rustacuda::error::CudaError as RustaCudaError;
 use serde::{Deserialize, Serialize};
 
 #[derive(thiserror::Error, Debug, Clone, Serialize, Deserialize)]
diff --git a/rustcoalescence/algorithms/cuda/src/info.rs b/rustcoalescence/algorithms/cuda/src/info.rs
index 1abf4ec07..78a5452ea 100644
--- a/rustcoalescence/algorithms/cuda/src/info.rs
+++ b/rustcoalescence/algorithms/cuda/src/info.rs
@@ -1,4 +1,4 @@
-use rust_cuda::rustacuda::{
+use rust_cuda::deps::rustacuda::{
     context::{CurrentContext, ResourceLimit},
     function::{Function, FunctionAttribute},
 };
diff --git a/rustcoalescence/algorithms/cuda/src/initialiser/fixup.rs b/rustcoalescence/algorithms/cuda/src/initialiser/fixup.rs
index 06401c685..6c8dee90a 100644
--- a/rustcoalescence/algorithms/cuda/src/initialiser/fixup.rs
+++ b/rustcoalescence/algorithms/cuda/src/initialiser/fixup.rs
@@ -28,7 +28,7 @@ use rustcoalescence_algorithms::{
 };
 use rustcoalescence_scenarios::Scenario;
 
-use rust_cuda::common::RustToCuda;
+use rust_cuda::lend::RustToCuda;
 
 use crate::CudaError;
 
@@ -42,19 +42,21 @@ pub struct FixUpInitialiser<L: ExactSizeIterator<Item = Lineage>> {
 
 impl<
         L: ExactSizeIterator<Item = Lineage>,
-        M: MathsCore,
-        G: PrimeableRng<M> + RustToCuda,
+        M: MathsCore + Sync,
+        G: PrimeableRng<M> + RustToCuda + Sync,
         O: Scenario<M, G>,
     > CudaLineageStoreSampleInitialiser<M, G, O, ResumeError<CudaError>> for FixUpInitialiser<L>
 where
-    O::Habitat: RustToCuda,
-    O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>: RustToCuda,
-    O::TurnoverRate: RustToCuda,
-    O::SpeciationProbability: RustToCuda,
+    O::Habitat: RustToCuda + Sync,
+    O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>: RustToCuda + Sync,
+    O::TurnoverRate: RustToCuda + Sync,
+    O::SpeciationProbability: RustToCuda + Sync,
 {
     type ActiveLineageSampler<
-        X: EmigrationExit<M, O::Habitat, G, IndependentLineageStore<M, O::Habitat>> + RustToCuda,
-        J: EventTimeSampler<M, O::Habitat, G, O::TurnoverRate> + RustToCuda,
+        X: EmigrationExit<M, O::Habitat, G, IndependentLineageStore<M, O::Habitat>>
+            + RustToCuda
+            + Sync,
+        J: EventTimeSampler<M, O::Habitat, G, O::TurnoverRate> + RustToCuda + Sync,
     > = IndependentActiveLineageSampler<
         M,
         O::Habitat,
@@ -76,8 +78,10 @@ where
     fn init<
         'h,
         T: TrustedOriginSampler<'h, M, Habitat = O::Habitat>,
-        J: EventTimeSampler<M, O::Habitat, G, O::TurnoverRate> + RustToCuda,
-        X: EmigrationExit<M, O::Habitat, G, IndependentLineageStore<M, O::Habitat>> + RustToCuda,
+        J: EventTimeSampler<M, O::Habitat, G, O::TurnoverRate> + RustToCuda + Sync,
+        X: EmigrationExit<M, O::Habitat, G, IndependentLineageStore<M, O::Habitat>>
+            + RustToCuda
+            + Sync,
     >(
         self,
         origin_sampler: T,
diff --git a/rustcoalescence/algorithms/cuda/src/initialiser/genesis.rs b/rustcoalescence/algorithms/cuda/src/initialiser/genesis.rs
index 5f851c286..72b836902 100644
--- a/rustcoalescence/algorithms/cuda/src/initialiser/genesis.rs
+++ b/rustcoalescence/algorithms/cuda/src/initialiser/genesis.rs
@@ -14,7 +14,7 @@ use necsim_impls_no_std::cogs::{
 
 use rustcoalescence_scenarios::Scenario;
 
-use rust_cuda::common::RustToCuda;
+use rust_cuda::lend::RustToCuda;
 
 use crate::CudaError;
 
@@ -23,17 +23,19 @@ use super::CudaLineageStoreSampleInitialiser;
 #[allow(clippy::module_name_repetitions)]
 pub struct GenesisInitialiser;
 
-impl<M: MathsCore, G: PrimeableRng<M> + RustToCuda, O: Scenario<M, G>>
+impl<M: MathsCore + Sync, G: PrimeableRng<M> + RustToCuda + Sync, O: Scenario<M, G>>
     CudaLineageStoreSampleInitialiser<M, G, O, CudaError> for GenesisInitialiser
 where
-    O::Habitat: RustToCuda,
-    O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>: RustToCuda,
-    O::TurnoverRate: RustToCuda,
-    O::SpeciationProbability: RustToCuda,
+    O::Habitat: RustToCuda + Sync,
+    O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>: RustToCuda + Sync,
+    O::TurnoverRate: RustToCuda + Sync,
+    O::SpeciationProbability: RustToCuda + Sync,
 {
     type ActiveLineageSampler<
-        X: EmigrationExit<M, O::Habitat, G, IndependentLineageStore<M, O::Habitat>> + RustToCuda,
-        J: EventTimeSampler<M, O::Habitat, G, O::TurnoverRate> + RustToCuda,
+        X: EmigrationExit<M, O::Habitat, G, IndependentLineageStore<M, O::Habitat>>
+            + RustToCuda
+            + Sync,
+        J: EventTimeSampler<M, O::Habitat, G, O::TurnoverRate> + RustToCuda + Sync,
     > = IndependentActiveLineageSampler<
         M,
         O::Habitat,
@@ -50,8 +52,10 @@ where
     fn init<
         'h,
         T: TrustedOriginSampler<'h, M, Habitat = O::Habitat>,
-        J: EventTimeSampler<M, O::Habitat, G, O::TurnoverRate> + RustToCuda,
-        X: EmigrationExit<M, O::Habitat, G, IndependentLineageStore<M, O::Habitat>> + RustToCuda,
+        J: EventTimeSampler<M, O::Habitat, G, O::TurnoverRate> + RustToCuda + Sync,
+        X: EmigrationExit<M, O::Habitat, G, IndependentLineageStore<M, O::Habitat>>
+            + RustToCuda
+            + Sync,
     >(
         self,
         origin_sampler: T,
diff --git a/rustcoalescence/algorithms/cuda/src/initialiser/mod.rs b/rustcoalescence/algorithms/cuda/src/initialiser/mod.rs
index a1a39e87e..8a0d9a27c 100644
--- a/rustcoalescence/algorithms/cuda/src/initialiser/mod.rs
+++ b/rustcoalescence/algorithms/cuda/src/initialiser/mod.rs
@@ -17,7 +17,7 @@ use necsim_impls_no_std::cogs::{
 
 use rustcoalescence_scenarios::Scenario;
 
-use rust_cuda::common::RustToCuda;
+use rust_cuda::lend::RustToCuda;
 
 use crate::CudaError;
 
@@ -28,38 +28,40 @@ pub mod resume;
 #[allow(clippy::module_name_repetitions)]
 pub trait CudaLineageStoreSampleInitialiser<
     M: MathsCore,
-    G: PrimeableRng<M> + RustToCuda,
+    G: PrimeableRng<M> + RustToCuda + Sync,
     O: Scenario<M, G>,
     Error: From<CudaError>,
 > where
-    O::Habitat: RustToCuda,
-    O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>: RustToCuda,
-    O::TurnoverRate: RustToCuda,
-    O::SpeciationProbability: RustToCuda,
+    O::Habitat: RustToCuda + Sync,
+    O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>: RustToCuda + Sync,
+    O::TurnoverRate: RustToCuda + Sync,
+    O::SpeciationProbability: RustToCuda + Sync,
 {
-    type DispersalSampler: DispersalSampler<M, O::Habitat, G> + RustToCuda;
+    type DispersalSampler: DispersalSampler<M, O::Habitat, G> + RustToCuda + Sync;
     type ActiveLineageSampler<
         X: EmigrationExit<
             M,
             O::Habitat,
             G,
             IndependentLineageStore<M, O::Habitat>,
-        > + RustToCuda,
-        J: EventTimeSampler<M, O::Habitat, G, O::TurnoverRate> + RustToCuda,
+        > + RustToCuda + Sync,
+        J: EventTimeSampler<M, O::Habitat, G, O::TurnoverRate> + RustToCuda + Sync,
     >: SingularActiveLineageSampler<
         M, O::Habitat, G, IndependentLineageStore<M, O::Habitat>,
         X, Self::DispersalSampler, IndependentCoalescenceSampler<M, O::Habitat>, O::TurnoverRate,
         O::SpeciationProbability, IndependentEventSampler<
             M, O::Habitat, G, X, Self::DispersalSampler, O::TurnoverRate, O::SpeciationProbability
         >, NeverImmigrationEntry,
-    > + RustToCuda;
+    > + RustToCuda + Sync;
 
     #[allow(clippy::type_complexity)]
     fn init<
         'h,
         T: TrustedOriginSampler<'h, M, Habitat = O::Habitat>,
-        J: EventTimeSampler<M, O::Habitat, G, O::TurnoverRate> + RustToCuda,
-        X: EmigrationExit<M, O::Habitat, G, IndependentLineageStore<M, O::Habitat>> + RustToCuda,
+        J: EventTimeSampler<M, O::Habitat, G, O::TurnoverRate> + RustToCuda + Sync,
+        X: EmigrationExit<M, O::Habitat, G, IndependentLineageStore<M, O::Habitat>>
+            + RustToCuda
+            + Sync,
     >(
         self,
         origin_sampler: T,
diff --git a/rustcoalescence/algorithms/cuda/src/initialiser/resume.rs b/rustcoalescence/algorithms/cuda/src/initialiser/resume.rs
index 2cba7640b..478690d96 100644
--- a/rustcoalescence/algorithms/cuda/src/initialiser/resume.rs
+++ b/rustcoalescence/algorithms/cuda/src/initialiser/resume.rs
@@ -17,7 +17,7 @@ use necsim_impls_no_std::cogs::{
 use rustcoalescence_algorithms::result::ResumeError;
 use rustcoalescence_scenarios::Scenario;
 
-use rust_cuda::common::RustToCuda;
+use rust_cuda::lend::RustToCuda;
 
 use crate::CudaError;
 
@@ -31,19 +31,21 @@ pub struct ResumeInitialiser<L: ExactSizeIterator<Item = Lineage>> {
 
 impl<
         L: ExactSizeIterator<Item = Lineage>,
-        M: MathsCore,
-        G: PrimeableRng<M> + RustToCuda,
+        M: MathsCore + Sync,
+        G: PrimeableRng<M> + RustToCuda + Sync,
         O: Scenario<M, G>,
     > CudaLineageStoreSampleInitialiser<M, G, O, ResumeError<CudaError>> for ResumeInitialiser<L>
 where
-    O::Habitat: RustToCuda,
-    O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>: RustToCuda,
-    O::TurnoverRate: RustToCuda,
-    O::SpeciationProbability: RustToCuda,
+    O::Habitat: RustToCuda + Sync,
+    O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>: RustToCuda + Sync,
+    O::TurnoverRate: RustToCuda + Sync,
+    O::SpeciationProbability: RustToCuda + Sync,
 {
     type ActiveLineageSampler<
-        X: EmigrationExit<M, O::Habitat, G, IndependentLineageStore<M, O::Habitat>> + RustToCuda,
-        J: EventTimeSampler<M, O::Habitat, G, O::TurnoverRate> + RustToCuda,
+        X: EmigrationExit<M, O::Habitat, G, IndependentLineageStore<M, O::Habitat>>
+            + RustToCuda
+            + Sync,
+        J: EventTimeSampler<M, O::Habitat, G, O::TurnoverRate> + RustToCuda + Sync,
     > = IndependentActiveLineageSampler<
         M,
         O::Habitat,
@@ -60,8 +62,10 @@ where
     fn init<
         'h,
         T: TrustedOriginSampler<'h, M, Habitat = O::Habitat>,
-        J: EventTimeSampler<M, O::Habitat, G, O::TurnoverRate> + RustToCuda,
-        X: EmigrationExit<M, O::Habitat, G, IndependentLineageStore<M, O::Habitat>> + RustToCuda,
+        J: EventTimeSampler<M, O::Habitat, G, O::TurnoverRate> + RustToCuda + Sync,
+        X: EmigrationExit<M, O::Habitat, G, IndependentLineageStore<M, O::Habitat>>
+            + RustToCuda
+            + Sync,
     >(
         self,
         origin_sampler: T,
diff --git a/rustcoalescence/algorithms/cuda/src/launch.rs b/rustcoalescence/algorithms/cuda/src/launch.rs
index 12589699a..44e0e66f6 100644
--- a/rustcoalescence/algorithms/cuda/src/launch.rs
+++ b/rustcoalescence/algorithms/cuda/src/launch.rs
@@ -1,9 +1,12 @@
 use std::marker::PhantomData;
 
-use necsim_core::{cogs::MathsCore, reporter::Reporter, simulation::SimulationBuilder};
+use necsim_core::{
+    cogs::{MathsCore, PrimeableRng},
+    reporter::Reporter,
+    simulation::SimulationBuilder,
+};
 use necsim_core_bond::NonNegativeF64;
 
-use necsim_impls_cuda::cogs::rng::CudaRng;
 use necsim_impls_no_std::{
     cogs::{
         active_lineage_sampler::independent::event_time_sampler::exp::ExpEventTimeSampler,
@@ -16,7 +19,6 @@ use necsim_impls_no_std::{
         origin_sampler::{
             decomposition::DecompositionOriginSampler, pre_sampler::OriginPreSampler,
         },
-        rng::wyhash::WyHash,
     },
     parallelisation::Status,
 };
@@ -25,15 +27,16 @@ use necsim_partitioning_core::LocalPartition;
 use rustcoalescence_algorithms::result::SimulationOutcome;
 use rustcoalescence_scenarios::Scenario;
 
-use rustcoalescence_algorithms_cuda_cpu_kernel::SimulationKernel;
-use rustcoalescence_algorithms_cuda_gpu_kernel::SimulatableKernel;
+use rustcoalescence_algorithms_cuda_gpu_kernel::simulate;
 
 use rust_cuda::{
-    common::RustToCuda,
-    rustacuda::{
+    deps::rustacuda::{
         function::{BlockSize, GridSize},
         prelude::{Stream, StreamFlags},
     },
+    host::CudaDropWrapper,
+    kernel::{CompiledKernelPtx, LaunchConfig, Launcher, TypedPtxKernel},
+    lend::RustToCuda,
 };
 
 use crate::{
@@ -49,75 +52,54 @@ use crate::{
 #[allow(clippy::too_many_lines)]
 pub fn initialise_and_simulate<
     'p,
-    M: MathsCore,
-    O: Scenario<M, CudaRng<M, WyHash<M>>>,
+    M: MathsCore + Sync,
+    G: PrimeableRng<M> + RustToCuda + Sync,
+    O: Scenario<M, G>,
     R: Reporter,
     P: LocalPartition<'p, R>,
     I: Iterator<Item = u64>,
-    L: CudaLineageStoreSampleInitialiser<M, CudaRng<M, WyHash<M>>, O, Error>,
+    L: CudaLineageStoreSampleInitialiser<M, G, O, Error>,
     Error: From<CudaError>,
->(
-    args: &CudaArguments,
-    rng: CudaRng<M, WyHash<M>>,
-    scenario: O,
-    pre_sampler: OriginPreSampler<M, I>,
-    pause_before: Option<NonNegativeF64>,
-    local_partition: &mut P,
-    lineage_store_sampler_initialiser: L,
-) -> Result<SimulationOutcome<M, CudaRng<M, WyHash<M>>>, Error>
-where
-    O::Habitat: RustToCuda,
-    O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, CudaRng<M, WyHash<M>>>>:
-        RustToCuda,
-    O::TurnoverRate: RustToCuda,
-    O::SpeciationProbability: RustToCuda,
-    SimulationKernel<
-        M,
-        O::Habitat,
-        CudaRng<M, WyHash<M>>,
-        IndependentLineageStore<M, O::Habitat>,
-        NeverEmigrationExit,
-        L::DispersalSampler,
-        IndependentCoalescenceSampler<M, O::Habitat>,
-        O::TurnoverRate,
-        O::SpeciationProbability,
-        IndependentEventSampler<
-            M,
-            O::Habitat,
-            CudaRng<M, WyHash<M>>,
-            NeverEmigrationExit,
-            L::DispersalSampler,
-            O::TurnoverRate,
-            O::SpeciationProbability,
-        >,
-        NeverImmigrationEntry,
-        L::ActiveLineageSampler<NeverEmigrationExit, ExpEventTimeSampler>,
-        R::ReportSpeciation,
-        R::ReportDispersal,
-    >: SimulatableKernel<
-        M,
-        O::Habitat,
-        CudaRng<M, WyHash<M>>,
-        IndependentLineageStore<M, O::Habitat>,
-        NeverEmigrationExit,
-        L::DispersalSampler,
-        IndependentCoalescenceSampler<M, O::Habitat>,
-        O::TurnoverRate,
-        O::SpeciationProbability,
-        IndependentEventSampler<
+    Ptx: CompiledKernelPtx<
+        simulate<
             M,
             O::Habitat,
-            CudaRng<M, WyHash<M>>,
+            G,
+            IndependentLineageStore<M, O::Habitat>,
             NeverEmigrationExit,
             L::DispersalSampler,
+            IndependentCoalescenceSampler<M, O::Habitat>,
             O::TurnoverRate,
             O::SpeciationProbability,
+            IndependentEventSampler<
+                M,
+                O::Habitat,
+                G,
+                NeverEmigrationExit,
+                L::DispersalSampler,
+                O::TurnoverRate,
+                O::SpeciationProbability,
+            >,
+            NeverImmigrationEntry,
+            L::ActiveLineageSampler<NeverEmigrationExit, ExpEventTimeSampler>,
+            R::ReportSpeciation,
+            R::ReportDispersal,
         >,
-        NeverImmigrationEntry,
-        L::ActiveLineageSampler<NeverEmigrationExit, ExpEventTimeSampler>,
-        R::ReportSpeciation,
-        R::ReportDispersal,
     >,
+>(
+    args: &CudaArguments,
+    rng: G,
+    scenario: O,
+    pre_sampler: OriginPreSampler<M, I>,
+    pause_before: Option<NonNegativeF64>,
+    local_partition: &mut P,
+    lineage_store_sampler_initialiser: L,
+) -> Result<SimulationOutcome<M, G>, Error>
+where
+    O::Habitat: RustToCuda + Sync,
+    O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>: RustToCuda + Sync,
+    O::TurnoverRate: RustToCuda + Sync,
+    O::SpeciationProbability: RustToCuda + Sync,
 {
     let (
         habitat,
@@ -126,8 +108,7 @@ where
         speciation_probability,
         origin_sampler_auxiliary,
         decomposition_auxiliary,
-    ) = scenario
-        .build::<InMemoryPackedAliasDispersalSampler<M, O::Habitat, CudaRng<M, WyHash<M>>>>();
+    ) = scenario.build::<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>();
     let coalescence_sampler = IndependentCoalescenceSampler::default();
     let event_sampler = IndependentEventSampler::default();
 
@@ -196,26 +177,36 @@ where
     };
 
     let (mut status, time, steps, lineages) = with_initialised_cuda(args.device, || {
-        let kernel = SimulationKernel::try_new(
-            Stream::new(StreamFlags::NON_BLOCKING, None)?,
-            grid_size.clone(),
-            block_size.clone(),
-            args.ptx_jit,
-            Box::new(|kernel| {
-                crate::info::print_kernel_function_attributes("simulate", kernel);
-                Ok(())
-            }),
-        )?;
-
-        parallelisation::monolithic::simulate(
-            &mut simulation,
-            kernel,
-            (grid_size, block_size, args.dedup_cache, args.step_slice),
-            lineages,
-            event_slice,
-            pause_before,
-            local_partition,
-        )
+        let mut stream = CudaDropWrapper::from(Stream::new(StreamFlags::NON_BLOCKING, None)?);
+
+        let mut kernel = TypedPtxKernel::new::<Ptx>(Some(Box::new(|kernel| {
+            crate::info::print_kernel_function_attributes("simulate", kernel);
+            Ok(())
+        })));
+
+        let config = LaunchConfig {
+            grid: grid_size,
+            block: block_size,
+            ptx_jit: args.ptx_jit,
+        };
+
+        rust_cuda::host::Stream::with(&mut stream, |stream| {
+            let launcher = Launcher {
+                stream,
+                kernel: &mut kernel,
+                config,
+            };
+
+            parallelisation::monolithic::simulate(
+                &mut simulation,
+                launcher,
+                (args.dedup_cache, args.step_slice),
+                lineages,
+                event_slice,
+                pause_before,
+                local_partition,
+            )
+        })
     })
     .map_err(CudaError::from)?;
 
diff --git a/rustcoalescence/algorithms/cuda/src/lib.rs b/rustcoalescence/algorithms/cuda/src/lib.rs
index e2c221dca..8aa09353f 100644
--- a/rustcoalescence/algorithms/cuda/src/lib.rs
+++ b/rustcoalescence/algorithms/cuda/src/lib.rs
@@ -5,7 +5,12 @@
 #[macro_use]
 extern crate serde_derive_state;
 
-use necsim_core::{cogs::MathsCore, lineage::Lineage, reporter::Reporter};
+use initialiser::CudaLineageStoreSampleInitialiser;
+use necsim_core::{
+    cogs::{MathsCore, PrimeableRng},
+    lineage::Lineage,
+    reporter::Reporter,
+};
 use necsim_core_bond::{NonNegativeF64, PositiveF64};
 
 use necsim_impls_cuda::cogs::{maths::NvptxMathsCore, rng::CudaRng};
@@ -37,10 +42,10 @@ use rustcoalescence_algorithms::{
 };
 use rustcoalescence_scenarios::Scenario;
 
-use rustcoalescence_algorithms_cuda_cpu_kernel::SimulationKernel;
-use rustcoalescence_algorithms_cuda_gpu_kernel::SimulatableKernel;
+use rustcoalescence_algorithms_cuda_cpu_kernel::SimulationKernelPtx;
+use rustcoalescence_algorithms_cuda_gpu_kernel::simulate;
 
-use rust_cuda::common::RustToCuda;
+use rust_cuda::{kernel::CompiledKernelPtx, lend::RustToCuda};
 
 mod arguments;
 mod cuda;
@@ -68,42 +73,38 @@ impl AlgorithmParamters for CudaAlgorithm {
 
 impl AlgorithmDefaults for CudaAlgorithm {
     type MathsCore = NvptxMathsCore;
+    type Rng<M: MathsCore> = CudaRng<M, WyHash<M>>;
 }
 
-#[allow(clippy::trait_duplication_in_bounds)]
 impl<
         'p,
-        M: MathsCore,
-        O: Scenario<M, CudaRng<M, WyHash<M>>>,
+        M: MathsCore + Sync,
+        G: PrimeableRng<M> + RustToCuda + Sync,
+        O: Scenario<M, G>,
         R: Reporter,
         P: LocalPartition<'p, R>,
-    > Algorithm<'p, M, O, R, P> for CudaAlgorithm
+    > Algorithm<'p, M, G, O, R, P> for CudaAlgorithm
 where
-    O::Habitat: RustToCuda,
-    O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, CudaRng<M, WyHash<M>>>>:
-        RustToCuda,
-    O::TurnoverRate: RustToCuda,
-    O::SpeciationProbability: RustToCuda,
-    SimulationKernel<
+    O::Habitat: RustToCuda + Sync,
+    O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>: RustToCuda + Sync,
+    O::TurnoverRate: RustToCuda + Sync,
+    O::SpeciationProbability: RustToCuda + Sync,
+    SimulationKernelPtx<
         M,
         O::Habitat,
-        CudaRng<M, WyHash<M>>,
+        G,
         IndependentLineageStore<M, O::Habitat>,
         NeverEmigrationExit,
-        O::DispersalSampler<
-            InMemoryPackedAliasDispersalSampler<M, O::Habitat, CudaRng<M, WyHash<M>>>,
-        >,
+        O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>,
         IndependentCoalescenceSampler<M, O::Habitat>,
         O::TurnoverRate,
         O::SpeciationProbability,
         IndependentEventSampler<
             M,
             O::Habitat,
-            CudaRng<M, WyHash<M>>,
+            G,
             NeverEmigrationExit,
-            O::DispersalSampler<
-                InMemoryPackedAliasDispersalSampler<M, O::Habitat, CudaRng<M, WyHash<M>>>,
-            >,
+            O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>,
             O::TurnoverRate,
             O::SpeciationProbability,
         >,
@@ -111,70 +112,62 @@ where
         IndependentActiveLineageSampler<
             M,
             O::Habitat,
-            CudaRng<M, WyHash<M>>,
+            G,
             NeverEmigrationExit,
-            O::DispersalSampler<
-                InMemoryPackedAliasDispersalSampler<M, O::Habitat, CudaRng<M, WyHash<M>>>,
-            >,
+            O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>,
             O::TurnoverRate,
             O::SpeciationProbability,
             ExpEventTimeSampler,
         >,
         R::ReportSpeciation,
         R::ReportDispersal,
-    >: SimulatableKernel<
-        M,
-        O::Habitat,
-        CudaRng<M, WyHash<M>>,
-        IndependentLineageStore<M, O::Habitat>,
-        NeverEmigrationExit,
-        O::DispersalSampler<
-            InMemoryPackedAliasDispersalSampler<M, O::Habitat, CudaRng<M, WyHash<M>>>,
-        >,
-        IndependentCoalescenceSampler<M, O::Habitat>,
-        O::TurnoverRate,
-        O::SpeciationProbability,
-        IndependentEventSampler<
+    >: CompiledKernelPtx<
+        simulate<
             M,
             O::Habitat,
-            CudaRng<M, WyHash<M>>,
+            G,
+            IndependentLineageStore<M, O::Habitat>,
             NeverEmigrationExit,
-            O::DispersalSampler<
-                InMemoryPackedAliasDispersalSampler<M, O::Habitat, CudaRng<M, WyHash<M>>>,
-            >,
+            O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>,
+            IndependentCoalescenceSampler<M, O::Habitat>,
             O::TurnoverRate,
             O::SpeciationProbability,
-        >,
-        NeverImmigrationEntry,
-        IndependentActiveLineageSampler<
-            M,
-            O::Habitat,
-            CudaRng<M, WyHash<M>>,
-            NeverEmigrationExit,
-            O::DispersalSampler<
-                InMemoryPackedAliasDispersalSampler<M, O::Habitat, CudaRng<M, WyHash<M>>>,
+            IndependentEventSampler<
+                M,
+                O::Habitat,
+                G,
+                NeverEmigrationExit,
+                O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>,
+                O::TurnoverRate,
+                O::SpeciationProbability,
             >,
-            O::TurnoverRate,
-            O::SpeciationProbability,
-            ExpEventTimeSampler,
+            NeverImmigrationEntry,
+            IndependentActiveLineageSampler<
+                M,
+                O::Habitat,
+                G,
+                NeverEmigrationExit,
+                O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>,
+                O::TurnoverRate,
+                O::SpeciationProbability,
+                ExpEventTimeSampler,
+            >,
+            R::ReportSpeciation,
+            R::ReportDispersal,
         >,
-        R::ReportSpeciation,
-        R::ReportDispersal,
     >,
-    SimulationKernel<
+    SimulationKernelPtx<
         M,
         O::Habitat,
-        CudaRng<M, WyHash<M>>,
+        G,
         IndependentLineageStore<M, O::Habitat>,
         NeverEmigrationExit,
         TrespassingDispersalSampler<
             M,
             O::Habitat,
-            CudaRng<M, WyHash<M>>,
-            O::DispersalSampler<
-                InMemoryPackedAliasDispersalSampler<M, O::Habitat, CudaRng<M, WyHash<M>>>,
-            >,
-            UniformAntiTrespassingDispersalSampler<M, O::Habitat, CudaRng<M, WyHash<M>>>,
+            G,
+            O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>,
+            UniformAntiTrespassingDispersalSampler<M, O::Habitat, G>,
         >,
         IndependentCoalescenceSampler<M, O::Habitat>,
         O::TurnoverRate,
@@ -182,16 +175,14 @@ where
         IndependentEventSampler<
             M,
             O::Habitat,
-            CudaRng<M, WyHash<M>>,
+            G,
             NeverEmigrationExit,
             TrespassingDispersalSampler<
                 M,
                 O::Habitat,
-                CudaRng<M, WyHash<M>>,
-                O::DispersalSampler<
-                    InMemoryPackedAliasDispersalSampler<M, O::Habitat, CudaRng<M, WyHash<M>>>,
-                >,
-                UniformAntiTrespassingDispersalSampler<M, O::Habitat, CudaRng<M, WyHash<M>>>,
+                G,
+                O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>,
+                UniformAntiTrespassingDispersalSampler<M, O::Habitat, G>,
             >,
             O::TurnoverRate,
             O::SpeciationProbability,
@@ -200,16 +191,14 @@ where
         IndependentActiveLineageSampler<
             M,
             O::Habitat,
-            CudaRng<M, WyHash<M>>,
+            G,
             NeverEmigrationExit,
             TrespassingDispersalSampler<
                 M,
                 O::Habitat,
-                CudaRng<M, WyHash<M>>,
-                O::DispersalSampler<
-                    InMemoryPackedAliasDispersalSampler<M, O::Habitat, CudaRng<M, WyHash<M>>>,
-                >,
-                UniformAntiTrespassingDispersalSampler<M, O::Habitat, CudaRng<M, WyHash<M>>>,
+                G,
+                O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>,
+                UniformAntiTrespassingDispersalSampler<M, O::Habitat, G>,
             >,
             O::TurnoverRate,
             O::SpeciationProbability,
@@ -217,66 +206,61 @@ where
         >,
         R::ReportSpeciation,
         R::ReportDispersal,
-    >: SimulatableKernel<
-        M,
-        O::Habitat,
-        CudaRng<M, WyHash<M>>,
-        IndependentLineageStore<M, O::Habitat>,
-        NeverEmigrationExit,
-        TrespassingDispersalSampler<
-            M,
-            O::Habitat,
-            CudaRng<M, WyHash<M>>,
-            O::DispersalSampler<
-                InMemoryPackedAliasDispersalSampler<M, O::Habitat, CudaRng<M, WyHash<M>>>,
-            >,
-            UniformAntiTrespassingDispersalSampler<M, O::Habitat, CudaRng<M, WyHash<M>>>,
-        >,
-        IndependentCoalescenceSampler<M, O::Habitat>,
-        O::TurnoverRate,
-        O::SpeciationProbability,
-        IndependentEventSampler<
+    >: CompiledKernelPtx<
+        simulate<
             M,
             O::Habitat,
-            CudaRng<M, WyHash<M>>,
+            G,
+            IndependentLineageStore<M, O::Habitat>,
             NeverEmigrationExit,
             TrespassingDispersalSampler<
                 M,
                 O::Habitat,
-                CudaRng<M, WyHash<M>>,
-                O::DispersalSampler<
-                    InMemoryPackedAliasDispersalSampler<M, O::Habitat, CudaRng<M, WyHash<M>>>,
-                >,
-                UniformAntiTrespassingDispersalSampler<M, O::Habitat, CudaRng<M, WyHash<M>>>,
+                G,
+                O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>,
+                UniformAntiTrespassingDispersalSampler<M, O::Habitat, G>,
             >,
+            IndependentCoalescenceSampler<M, O::Habitat>,
             O::TurnoverRate,
             O::SpeciationProbability,
-        >,
-        NeverImmigrationEntry,
-        IndependentActiveLineageSampler<
-            M,
-            O::Habitat,
-            CudaRng<M, WyHash<M>>,
-            NeverEmigrationExit,
-            TrespassingDispersalSampler<
+            IndependentEventSampler<
                 M,
                 O::Habitat,
-                CudaRng<M, WyHash<M>>,
-                O::DispersalSampler<
-                    InMemoryPackedAliasDispersalSampler<M, O::Habitat, CudaRng<M, WyHash<M>>>,
+                G,
+                NeverEmigrationExit,
+                TrespassingDispersalSampler<
+                    M,
+                    O::Habitat,
+                    G,
+                    O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>,
+                    UniformAntiTrespassingDispersalSampler<M, O::Habitat, G>,
                 >,
-                UniformAntiTrespassingDispersalSampler<M, O::Habitat, CudaRng<M, WyHash<M>>>,
+                O::TurnoverRate,
+                O::SpeciationProbability,
             >,
-            O::TurnoverRate,
-            O::SpeciationProbability,
-            ConstEventTimeSampler,
+            NeverImmigrationEntry,
+            IndependentActiveLineageSampler<
+                M,
+                O::Habitat,
+                G,
+                NeverEmigrationExit,
+                TrespassingDispersalSampler<
+                    M,
+                    O::Habitat,
+                    G,
+                    O::DispersalSampler<InMemoryPackedAliasDispersalSampler<M, O::Habitat, G>>,
+                    UniformAntiTrespassingDispersalSampler<M, O::Habitat, G>,
+                >,
+                O::TurnoverRate,
+                O::SpeciationProbability,
+                ConstEventTimeSampler,
+            >,
+            R::ReportSpeciation,
+            R::ReportDispersal,
         >,
-        R::ReportSpeciation,
-        R::ReportDispersal,
     >,
 {
     type LineageStore = IndependentLineageStore<M, O::Habitat>;
-    type Rng = CudaRng<M, WyHash<M>>;
 
     fn get_logical_partition(args: &Self::Arguments, _local_partition: &P) -> Partition {
         match &args.parallelism_mode {
@@ -290,13 +274,28 @@ where
 
     fn initialise_and_simulate<I: Iterator<Item = u64>>(
         args: Self::Arguments,
-        rng: Self::Rng,
+        rng: G,
         scenario: O,
         pre_sampler: OriginPreSampler<M, I>,
         pause_before: Option<NonNegativeF64>,
         local_partition: &mut P,
-    ) -> Result<SimulationOutcome<M, Self::Rng>, Self::Error> {
-        launch::initialise_and_simulate(
+    ) -> Result<SimulationOutcome<M, G>, Self::Error> {
+        launch::initialise_and_simulate::<_, _, _, _, _, _, _, _, SimulationKernelPtx<
+            _,
+            _,
+            _,
+            _,
+            _,
+            <GenesisInitialiser as CudaLineageStoreSampleInitialiser<_, _, O, _>>::DispersalSampler,
+            _,
+            _,
+            _,
+            _,
+            _,
+            <GenesisInitialiser as CudaLineageStoreSampleInitialiser<_, _, O, _>>::ActiveLineageSampler<_, _>,
+            _,
+            _,
+        >>(
             &args,
             rng,
             scenario,
@@ -311,18 +310,32 @@ where
     ///
     /// Returns a `ContinueError::Sample` if initialising the resuming
     ///  simulation failed
-    #[allow(clippy::too_many_lines)]
     fn resume_and_simulate<I: Iterator<Item = u64>, L: ExactSizeIterator<Item = Lineage>>(
         args: Self::Arguments,
-        rng: Self::Rng,
+        rng: G,
         scenario: O,
         pre_sampler: OriginPreSampler<M, I>,
         lineages: L,
         resume_after: Option<NonNegativeF64>,
         pause_before: Option<NonNegativeF64>,
         local_partition: &mut P,
-    ) -> Result<SimulationOutcome<M, Self::Rng>, ResumeError<Self::Error>> {
-        launch::initialise_and_simulate(
+    ) -> Result<SimulationOutcome<M, G>, ResumeError<Self::Error>> {
+        launch::initialise_and_simulate::<_, _, _, _, _, _, _, _, SimulationKernelPtx<
+            _,
+            _,
+            _,
+            _,
+            _,
+            <ResumeInitialiser<L> as CudaLineageStoreSampleInitialiser<_, _, O, _>>::DispersalSampler,
+            _,
+            _,
+            _,
+            _,
+            _,
+            <ResumeInitialiser<L> as CudaLineageStoreSampleInitialiser<_, _, O, _>>::ActiveLineageSampler<_, _>,
+            _,
+            _,
+        >>(
             &args,
             rng,
             scenario,
@@ -340,24 +353,38 @@ where
     ///
     /// Returns a `ContinueError<Self::Error>` if fixing up the restarting
     ///  simulation (incl. running the algorithm) failed
-    #[allow(clippy::too_many_lines)]
     fn fixup_for_restart<I: Iterator<Item = u64>, L: ExactSizeIterator<Item = Lineage>>(
         args: Self::Arguments,
-        rng: Self::Rng,
+        rng: G,
         scenario: O,
         pre_sampler: OriginPreSampler<M, I>,
         lineages: L,
         restart_at: PositiveF64,
         fixup_strategy: RestartFixUpStrategy,
         local_partition: &mut P,
-    ) -> Result<SimulationOutcome<M, Self::Rng>, ResumeError<Self::Error>> {
-        launch::initialise_and_simulate(
+    ) -> Result<SimulationOutcome<M, G>, ResumeError<Self::Error>> {
+        launch::initialise_and_simulate::<_, _, _, _, _, _, _, _, SimulationKernelPtx<
+            _,
+            _,
+            _,
+            _,
+            _,
+            <FixUpInitialiser<L> as CudaLineageStoreSampleInitialiser<_, _, O, _>>::DispersalSampler,
+            _,
+            _,
+            _,
+            _,
+            _,
+            <FixUpInitialiser<L> as CudaLineageStoreSampleInitialiser<_, _, O, _>>::ActiveLineageSampler<_, ConstEventTimeSampler>,
+            _,
+            _,
+        >>(
             &args,
             rng,
             scenario,
             pre_sampler,
-            Some(PositiveF64::max_after(restart_at.into(), restart_at.into()).into()),
-            local_partition,
+            Some(PositiveF64::max_after(restart_at.into(), restart_at.into()).into()),
+            local_partition,
             FixUpInitialiser {
                 lineages,
                 restart_at,
diff --git a/rustcoalescence/algorithms/cuda/src/parallelisation/monolithic.rs b/rustcoalescence/algorithms/cuda/src/parallelisation/monolithic.rs
index 66e1ff479..213f6aa11 100644
--- a/rustcoalescence/algorithms/cuda/src/parallelisation/monolithic.rs
+++ b/rustcoalescence/algorithms/cuda/src/parallelisation/monolithic.rs
@@ -1,9 +1,9 @@
 use std::{collections::VecDeque, convert::TryInto, num::NonZeroU64, sync::atomic::AtomicU64};
 
 use rust_cuda::{
-    common::RustToCuda,
-    host::{HostAndDeviceMutRef, LendToCuda},
-    rustacuda::function::{BlockSize, GridSize},
+    host::HostAndDeviceMutRef,
+    kernel::Launcher,
+    lend::{LendToCuda, RustToCuda},
     utils::exchange::wrapper::ExchangeWrapperOnHost,
 };
 
@@ -37,8 +37,7 @@ use necsim_partitioning_core::LocalPartition;
 
 use necsim_impls_cuda::{event_buffer::EventBuffer, value_buffer::ValueBuffer};
 
-use rustcoalescence_algorithms_cuda_cpu_kernel::SimulationKernel;
-use rustcoalescence_algorithms_cuda_gpu_kernel::SimulatableKernel;
+use rustcoalescence_algorithms_cuda_gpu_kernel::simulate;
 
 use crate::error::CudaError;
 
@@ -48,25 +47,24 @@ type Result<T, E = CudaError> = std::result::Result<T, E>;
 pub fn simulate<
     'l,
     'p,
-    M: MathsCore,
-    H: Habitat<M> + RustToCuda,
-    G: PrimeableRng<M> + RustToCuda,
-    S: LineageStore<M, H> + RustToCuda,
-    X: EmigrationExit<M, H, G, S> + RustToCuda,
-    D: DispersalSampler<M, H, G> + RustToCuda,
-    C: CoalescenceSampler<M, H, S> + RustToCuda,
-    T: TurnoverRate<M, H> + RustToCuda,
-    N: SpeciationProbability<M, H> + RustToCuda,
-    E: MinSpeciationTrackingEventSampler<M, H, G, S, X, D, C, T, N> + RustToCuda,
-    I: ImmigrationEntry<M> + RustToCuda,
-    A: SingularActiveLineageSampler<M, H, G, S, X, D, C, T, N, E, I>
-        + RustToCuda,
+    M: MathsCore + Sync,
+    H: Habitat<M> + RustToCuda + Sync,
+    G: PrimeableRng<M> + RustToCuda + Sync,
+    S: LineageStore<M, H> + RustToCuda + Sync,
+    X: EmigrationExit<M, H, G, S> + RustToCuda + Sync,
+    D: DispersalSampler<M, H, G> + RustToCuda + Sync,
+    C: CoalescenceSampler<M, H, S> + RustToCuda + Sync,
+    T: TurnoverRate<M, H> + RustToCuda + Sync,
+    N: SpeciationProbability<M, H> + RustToCuda + Sync,
+    E: MinSpeciationTrackingEventSampler<M, H, G, S, X, D, C, T, N> + RustToCuda + Sync,
+    I: ImmigrationEntry<M> + RustToCuda + Sync,
+    A: SingularActiveLineageSampler<M, H, G, S, X, D, C, T, N, E, I> + RustToCuda + Sync,
     P: Reporter,
     L: LocalPartition<'p, P>,
     LI: IntoIterator<Item = Lineage>,
 >(
     simulation: &mut Simulation<M, H, G, S, X, D, C, T, N, E, I, A>,
-    mut kernel: SimulationKernel<
+    mut launcher: Launcher<simulate<
         M,
         H,
         G,
@@ -81,45 +79,18 @@ pub fn simulate<
         A,
         <<WaterLevelReporterStrategy as WaterLevelReporterConstructor<L::IsLive, P, L>>::WaterLevelReporter as Reporter>::ReportSpeciation,
         <<WaterLevelReporterStrategy as WaterLevelReporterConstructor<L::IsLive, P, L>>::WaterLevelReporter as Reporter>::ReportDispersal,
-    >,
-    config: (GridSize, BlockSize, DedupCache, NonZeroU64),
+    >>,
+    config: (DedupCache, NonZeroU64),
     lineages: LI,
     event_slice: EventSlice,
     pause_before: Option<NonNegativeF64>,
     local_partition: &'l mut L,
-) -> Result<(Status, NonNegativeF64, u64, impl IntoIterator<Item = Lineage>)>
-    where SimulationKernel<
-        M,
-        H,
-        G,
-        S,
-        X,
-        D,
-        C,
-        T,
-        N,
-        E,
-        I,
-        A,
-        <<WaterLevelReporterStrategy as WaterLevelReporterConstructor<'l, 'p, L::IsLive, P, L>>::WaterLevelReporter as Reporter>::ReportSpeciation,
-        <<WaterLevelReporterStrategy as WaterLevelReporterConstructor<'l, 'p, L::IsLive, P, L>>::WaterLevelReporter as Reporter>::ReportDispersal,
-    >: SimulatableKernel<
-        M,
-        H,
-        G,
-        S,
-        X,
-        D,
-        C,
-        T,
-        N,
-        E,
-        I,
-        A,
-        <<WaterLevelReporterStrategy as WaterLevelReporterConstructor<'l, 'p, L::IsLive, P, L>>::WaterLevelReporter as Reporter>::ReportSpeciation,
-        <<WaterLevelReporterStrategy as WaterLevelReporterConstructor<'l, 'p, L::IsLive, P, L>>::WaterLevelReporter as Reporter>::ReportDispersal,
-    >,
-{
+) -> Result<(
+    Status,
+    NonNegativeF64,
+    u64,
+    impl IntoIterator<Item = Lineage>,
+)> {
     let mut slow_lineages = lineages
         .into_iter()
         .map(|lineage| {
@@ -143,7 +114,7 @@ pub fn simulate<
         L,
     >>::WaterLevelReporter::new(event_slice.get(), local_partition);
 
-    let (grid_size, block_size, dedup_cache, step_slice) = config;
+    let (dedup_cache, step_slice) = config;
 
     #[allow(clippy::or_fun_call)]
     let intial_max_time = slow_lineages
@@ -153,10 +124,13 @@ pub fn simulate<
         .unwrap_or(NonNegativeF64::zero());
 
     // Initialise the total_time_max and total_steps_sum atomics
-    let mut total_time_max = AtomicU64::new(intial_max_time.get().to_bits()).into();
-    let mut total_steps_sum = AtomicU64::new(0_u64).into();
+    let mut total_time_max = AtomicU64::new(intial_max_time.get().to_bits());
+    let mut total_steps_sum = AtomicU64::new(0_u64);
 
-    let mut task_list = ExchangeWrapperOnHost::new(ValueBuffer::new(&block_size, &grid_size)?)?;
+    let mut task_list = ExchangeWrapperOnHost::new(ValueBuffer::new(
+        &launcher.config.block,
+        &launcher.config.grid,
+    )?)?;
     let mut event_buffer: ExchangeWrapperOnHost<
         EventBuffer<
             <<WaterLevelReporterStrategy as WaterLevelReporterConstructor<
@@ -171,14 +145,17 @@ pub fn simulate<
             >>::WaterLevelReporter as Reporter>::ReportDispersal,
         >,
     > = ExchangeWrapperOnHost::new(EventBuffer::new(
-        &block_size,
-        &grid_size,
+        &launcher.config.block, &launcher.config.grid,
         step_slice.get().try_into().unwrap_or(usize::MAX),
     )?)?;
-    let mut min_spec_sample_buffer =
-        ExchangeWrapperOnHost::new(ValueBuffer::new(&block_size, &grid_size)?)?;
-    let mut next_event_time_buffer =
-        ExchangeWrapperOnHost::new(ValueBuffer::new(&block_size, &grid_size)?)?;
+    let mut min_spec_sample_buffer = ExchangeWrapperOnHost::new(ValueBuffer::new(
+        &launcher.config.block,
+        &launcher.config.grid,
+    )?)?;
+    let mut next_event_time_buffer = ExchangeWrapperOnHost::new(ValueBuffer::new(
+        &launcher.config.block,
+        &launcher.config.grid,
+    )?)?;
 
     let mut min_spec_samples = dedup_cache.construct(slow_lineages.len());
 
@@ -195,8 +172,7 @@ pub fn simulate<
 
     HostAndDeviceMutRef::with_new(&mut total_time_max, |total_time_max| -> Result<()> {
         HostAndDeviceMutRef::with_new(&mut total_steps_sum, |total_steps_sum| -> Result<()> {
-            // TODO: Pipeline async launches and callbacks of simulation/event analysis
-            simulation.lend_to_cuda_mut(|mut simulation_cuda_repr| -> Result<()> {
+            simulation.lend_to_cuda(|simulation_cuda_repr| -> Result<()> {
                 while !slow_lineages.is_empty()
                     && pause_before.map_or(true, |pause_before| level_time < pause_before)
                 {
@@ -242,8 +218,16 @@ pub fn simulate<
                     proxy.advance_water_level(level_time);
 
                     // Simulate all slow lineages until they have finished or exceeded the
-                    // new water  level
+                    //  new water level
                     while !slow_lineages.is_empty() {
+                        // Move event, min speciation sample and next event time buffers to CUDA
+                        let mut event_buffer_cuda_async =
+                            event_buffer.move_to_device_async(launcher.stream)?;
+                        let mut min_spec_sample_buffer_cuda_async =
+                            min_spec_sample_buffer.move_to_device_async(launcher.stream)?;
+                        let mut next_event_time_buffer_cuda_async =
+                            next_event_time_buffer.move_to_device_async(launcher.stream)?;
+
                         // Upload the new tasks from the front of the task queue
                         for mut task in task_list.iter_mut() {
                             let next_slow_lineage = loop {
@@ -261,31 +245,44 @@ pub fn simulate<
                             task.replace(next_slow_lineage);
                         }
 
-                        // Move the task list, event buffer and min speciation sample buffer
-                        // to CUDA
-                        let mut event_buffer_cuda = event_buffer.move_to_device()?;
-                        let mut min_spec_sample_buffer_cuda =
-                            min_spec_sample_buffer.move_to_device()?;
-                        let mut next_event_time_buffer_cuda =
-                            next_event_time_buffer.move_to_device()?;
-                        let mut task_list_cuda = task_list.move_to_device()?;
-
-                        kernel.simulate_raw(
-                            simulation_cuda_repr.as_mut(),
-                            task_list_cuda.as_mut(),
-                            event_buffer_cuda.as_mut(),
-                            min_spec_sample_buffer_cuda.as_mut(),
-                            next_event_time_buffer_cuda.as_mut(),
-                            total_time_max.as_ref(),
-                            total_steps_sum.as_ref(),
-                            step_slice.get().into(),
-                            level_time.into(),
+                        // Move the task list to CUDA
+                        let mut task_list_cuda_async =
+                            task_list.move_to_device_async(launcher.stream)?;
+
+                        let launch = launcher.launch9_async(
+                            simulation_cuda_repr.as_async(launcher.stream).extract_ref(),
+                            task_list_cuda_async.as_mut_async(),
+                            event_buffer_cuda_async.as_mut_async(),
+                            min_spec_sample_buffer_cuda_async.as_mut_async(),
+                            next_event_time_buffer_cuda_async.as_mut_async(),
+                            total_time_max
+                                .as_ref()
+                                .as_async(launcher.stream)
+                                .extract_ref(),
+                            total_steps_sum
+                                .as_ref()
+                                .as_async(launcher.stream)
+                                .extract_ref(),
+                            step_slice.get(),
+                            level_time,
                         )?;
 
-                        min_spec_sample_buffer = min_spec_sample_buffer_cuda.move_to_host()?;
-                        next_event_time_buffer = next_event_time_buffer_cuda.move_to_host()?;
-                        task_list = task_list_cuda.move_to_host()?;
-                        event_buffer = event_buffer_cuda.move_to_host()?;
+                        let min_spec_sample_buffer_host_async =
+                            min_spec_sample_buffer_cuda_async
+                                .move_to_host_async(launcher.stream)?;
+                        let next_event_time_buffer_host_async =
+                            next_event_time_buffer_cuda_async
+                                .move_to_host_async(launcher.stream)?;
+                        let task_list_host_async =
+                            task_list_cuda_async.move_to_host_async(launcher.stream)?;
+                        let event_buffer_host_async =
+                            event_buffer_cuda_async.move_to_host_async(launcher.stream)?;
+
+                        task_list = task_list_host_async.synchronize()?;
+                        next_event_time_buffer = next_event_time_buffer_host_async.synchronize()?;
+                        min_spec_sample_buffer = min_spec_sample_buffer_host_async.synchronize()?;
+
+                        launch.synchronize()?;
 
                         // Fetch the completion of the tasks
                         for ((mut spec_sample, mut next_event_time), mut task) in
@@ -303,8 +300,7 @@ pub fn simulate<
                             {
                                 if !duplicate_individual {
                                     // Reclassify lineages as either slow (still below
-                                    // water) or
-                                    // fast
+                                    //  the metaphorical water level) or fast
                                     if next_event_time < level_time {
                                         slow_lineages.push_back((task, next_event_time.into()));
                                     } else {
@@ -314,6 +310,8 @@ pub fn simulate<
                             }
                         }
 
+                        event_buffer = event_buffer_host_async.synchronize()?;
+                        // TODO: explore partial sorting on the GPU
                         event_buffer.report_events_unordered(&mut proxy);
 
                         proxy.local_partition().get_reporter().report_progress(
@@ -336,10 +334,9 @@ pub fn simulate<
     })?;
 
     // Safety: Max of NonNegativeF64 values from the GPU
-    let total_time_max = unsafe {
-        NonNegativeF64::new_unchecked(f64::from_bits(total_time_max.into_inner().into_inner()))
-    };
-    let total_steps_sum = total_steps_sum.into_inner().into_inner();
+    let total_time_max =
+        unsafe { NonNegativeF64::new_unchecked(f64::from_bits(total_time_max.into_inner())) };
+    let total_steps_sum = total_steps_sum.into_inner();
 
     local_partition.report_progress_sync(slow_lineages.len() as u64);
 
@@ -348,5 +345,9 @@ pub fn simulate<
         local_partition.reduce_global_time_steps(total_time_max, total_steps_sum);
     let lineages = slow_lineages.into_iter().map(|(lineage, _)| lineage);
 
+    // Note: The simulation requires no mutation, since all components are
+    //       either immutable or have singular swap states, and the list
+    //       of all lineages (which does change) is returned separately
+
     Ok((status, global_time, global_steps, lineages))
 }
diff --git a/rustcoalescence/algorithms/gillespie/src/event_skipping/mod.rs b/rustcoalescence/algorithms/gillespie/src/event_skipping/mod.rs
index 4dc2bfa4e..1c37f0db9 100644
--- a/rustcoalescence/algorithms/gillespie/src/event_skipping/mod.rs
+++ b/rustcoalescence/algorithms/gillespie/src/event_skipping/mod.rs
@@ -1,5 +1,5 @@
 use necsim_core::{
-    cogs::{GloballyCoherentLineageStore, MathsCore, SeparableDispersalSampler},
+    cogs::{GloballyCoherentLineageStore, MathsCore, SeparableDispersalSampler, SplittableRng},
     lineage::Lineage,
     reporter::Reporter,
 };
@@ -39,18 +39,24 @@ impl AlgorithmParamters for EventSkippingAlgorithm {
 
 impl AlgorithmDefaults for EventSkippingAlgorithm {
     type MathsCore = IntrinsicsMathsCore;
+    type Rng<M: MathsCore> = Pcg<M>;
 }
 
-impl<'p, O: Scenario<M, Pcg<M>>, R: Reporter, P: LocalPartition<'p, R>, M: MathsCore>
-    Algorithm<'p, M, O, R, P> for EventSkippingAlgorithm
+impl<
+        'p,
+        O: Scenario<M, G>,
+        R: Reporter,
+        P: LocalPartition<'p, R>,
+        M: MathsCore,
+        G: SplittableRng<M>,
+    > Algorithm<'p, M, G, O, R, P> for EventSkippingAlgorithm
 where
     O::LineageStore<GillespieLineageStore<M, O::Habitat>>:
         GloballyCoherentLineageStore<M, O::Habitat>,
-    O::DispersalSampler<InMemorySeparableAliasDispersalSampler<M, O::Habitat, Pcg<M>>>:
-        SeparableDispersalSampler<M, O::Habitat, Pcg<M>>,
+    O::DispersalSampler<InMemorySeparableAliasDispersalSampler<M, O::Habitat, G>>:
+        SeparableDispersalSampler<M, O::Habitat, G>,
 {
     type LineageStore = O::LineageStore<GillespieLineageStore<M, O::Habitat>>;
-    type Rng = Pcg<M>;
 
     fn get_logical_partition(args: &Self::Arguments, local_partition: &P) -> Partition {
         get_gillespie_logical_partition(args, local_partition)
@@ -58,12 +64,12 @@ where
 
     fn initialise_and_simulate<I: Iterator<Item = u64>>(
         args: Self::Arguments,
-        rng: Self::Rng,
+        rng: G,
         scenario: O,
         pre_sampler: OriginPreSampler<M, I>,
         pause_before: Option<NonNegativeF64>,
         local_partition: &mut P,
-    ) -> Result<SimulationOutcome<M, Self::Rng>, Self::Error> {
+    ) -> Result<SimulationOutcome<M, G>, Self::Error> {
         launch::initialise_and_simulate(
             args,
             rng,
@@ -81,14 +87,14 @@ where
     ///  simulation failed
     fn resume_and_simulate<I: Iterator<Item = u64>, L: ExactSizeIterator<Item = Lineage>>(
         args: Self::Arguments,
-        rng: Self::Rng,
+        rng: G,
         scenario: O,
         pre_sampler: OriginPreSampler<M, I>,
         lineages: L,
         resume_after: Option<NonNegativeF64>,
         pause_before: Option<NonNegativeF64>,
         local_partition: &mut P,
-    ) -> Result<SimulationOutcome<M, Self::Rng>, ResumeError<Self::Error>> {
+    ) -> Result<SimulationOutcome<M, G>, ResumeError<Self::Error>> {
         launch::initialise_and_simulate(
             args,
             rng,
@@ -109,14 +115,14 @@ where
     ///  simulation (incl. running the algorithm) failed
     fn fixup_for_restart<I: Iterator<Item = u64>, L: ExactSizeIterator<Item = Lineage>>(
         args: Self::Arguments,
-        rng: Self::Rng,
+        rng: G,
         scenario: O,
         pre_sampler: OriginPreSampler<M, I>,
         lineages: L,
         restart_at: PositiveF64,
         fixup_strategy: RestartFixUpStrategy,
         local_partition: &mut P,
-    ) -> Result<SimulationOutcome<M, Self::Rng>, ResumeError<Self::Error>> {
+    ) -> Result<SimulationOutcome<M, G>, ResumeError<Self::Error>> {
         launch::initialise_and_simulate(
             args,
             rng,
diff --git a/rustcoalescence/algorithms/gillespie/src/gillespie/classical/mod.rs b/rustcoalescence/algorithms/gillespie/src/gillespie/classical/mod.rs
index 06114bca7..892b7e285 100644
--- a/rustcoalescence/algorithms/gillespie/src/gillespie/classical/mod.rs
+++ b/rustcoalescence/algorithms/gillespie/src/gillespie/classical/mod.rs
@@ -1,5 +1,5 @@
 use necsim_core::{
-    cogs::{LocallyCoherentLineageStore, MathsCore},
+    cogs::{LocallyCoherentLineageStore, MathsCore, SplittableRng},
     lineage::Lineage,
     reporter::Reporter,
 };
@@ -9,7 +9,6 @@ use necsim_impls_no_std::cogs::{
     lineage_store::coherent::locally::classical::ClassicalLineageStore,
     origin_sampler::pre_sampler::OriginPreSampler, turnover_rate::uniform::UniformTurnoverRate,
 };
-use necsim_impls_std::cogs::rng::pcg::Pcg;
 use necsim_partitioning_core::LocalPartition;
 
 use rustcoalescence_algorithms::{
@@ -31,24 +30,24 @@ use initialiser::{
 // Optimised 'Classical' implementation for the `UniformTurnoverSampler`
 impl<
         'p,
-        O: Scenario<M, Pcg<M>, TurnoverRate = UniformTurnoverRate>,
+        O: Scenario<M, G, TurnoverRate = UniformTurnoverRate>,
         R: Reporter,
         P: LocalPartition<'p, R>,
         M: MathsCore,
-    > Algorithm<'p, M, O, R, P> for GillespieAlgorithm
+        G: SplittableRng<M>,
+    > Algorithm<'p, M, G, O, R, P> for GillespieAlgorithm
 where
     O::LineageStore<ClassicalLineageStore<M, O::Habitat>>:
         LocallyCoherentLineageStore<M, O::Habitat>,
 {
-    #[allow(clippy::too_many_lines)]
     fn initialise_and_simulate<I: Iterator<Item = u64>>(
         args: Self::Arguments,
-        rng: Self::Rng,
+        rng: G,
         scenario: O,
         pre_sampler: OriginPreSampler<M, I>,
         pause_before: Option<NonNegativeF64>,
         local_partition: &mut P,
-    ) -> Result<SimulationOutcome<M, Self::Rng>, Self::Error> {
+    ) -> Result<SimulationOutcome<M, G>, Self::Error> {
         launch::initialise_and_simulate(
             args,
             rng,
@@ -66,14 +65,14 @@ where
     ///  simulation failed
     fn resume_and_simulate<I: Iterator<Item = u64>, L: ExactSizeIterator<Item = Lineage>>(
         args: Self::Arguments,
-        rng: Self::Rng,
+        rng: G,
         scenario: O,
         pre_sampler: OriginPreSampler<M, I>,
         lineages: L,
         resume_after: Option<NonNegativeF64>,
         pause_before: Option<NonNegativeF64>,
         local_partition: &mut P,
-    ) -> Result<SimulationOutcome<M, Self::Rng>, ResumeError<Self::Error>> {
+    ) -> Result<SimulationOutcome<M, G>, ResumeError<Self::Error>> {
         launch::initialise_and_simulate(
             args,
             rng,
@@ -92,17 +91,16 @@ where
     ///
     /// Returns a `ContinueError<Self::Error>` if fixing up the restarting
     ///  simulation (incl. running the algorithm) failed
-    #[allow(clippy::too_many_lines)]
     fn fixup_for_restart<I: Iterator<Item = u64>, L: ExactSizeIterator<Item = Lineage>>(
         args: Self::Arguments,
-        rng: Self::Rng,
+        rng: G,
         scenario: O,
         pre_sampler: OriginPreSampler<M, I>,
         lineages: L,
         restart_at: PositiveF64,
         fixup_strategy: RestartFixUpStrategy,
         local_partition: &mut P,
-    ) -> Result<SimulationOutcome<M, Self::Rng>, ResumeError<Self::Error>> {
+    ) -> Result<SimulationOutcome<M, G>, ResumeError<Self::Error>> {
         launch::initialise_and_simulate(
             args,
             rng,
diff --git a/rustcoalescence/algorithms/gillespie/src/gillespie/mod.rs b/rustcoalescence/algorithms/gillespie/src/gillespie/mod.rs
index f485eb6a6..c1f775555 100644
--- a/rustcoalescence/algorithms/gillespie/src/gillespie/mod.rs
+++ b/rustcoalescence/algorithms/gillespie/src/gillespie/mod.rs
@@ -1,4 +1,6 @@
+use necsim_core::cogs::MathsCore;
 use necsim_impls_no_std::cogs::maths::intrinsics::IntrinsicsMathsCore;
+use necsim_impls_std::cogs::rng::pcg::Pcg;
 
 use rustcoalescence_algorithms::{AlgorithmDefaults, AlgorithmParamters};
 
@@ -17,4 +19,5 @@ impl AlgorithmParamters for GillespieAlgorithm {
 
 impl AlgorithmDefaults for GillespieAlgorithm {
     type MathsCore = IntrinsicsMathsCore;
+    type Rng<M: MathsCore> = Pcg<M>;
 }
diff --git a/rustcoalescence/algorithms/gillespie/src/gillespie/turnover/mod.rs b/rustcoalescence/algorithms/gillespie/src/gillespie/turnover/mod.rs
index 7f44e6280..a08985da7 100644
--- a/rustcoalescence/algorithms/gillespie/src/gillespie/turnover/mod.rs
+++ b/rustcoalescence/algorithms/gillespie/src/gillespie/turnover/mod.rs
@@ -1,5 +1,5 @@
 use necsim_core::{
-    cogs::{LocallyCoherentLineageStore, MathsCore},
+    cogs::{LocallyCoherentLineageStore, MathsCore, SplittableRng},
     lineage::Lineage,
     reporter::Reporter,
 };
@@ -9,7 +9,6 @@ use necsim_impls_no_std::cogs::{
     lineage_store::coherent::locally::classical::ClassicalLineageStore,
     origin_sampler::pre_sampler::OriginPreSampler,
 };
-use necsim_impls_std::cogs::rng::pcg::Pcg;
 use necsim_partitioning_core::{partition::Partition, LocalPartition};
 
 use rustcoalescence_algorithms::{
@@ -31,28 +30,32 @@ use initialiser::{
 };
 
 // Default 'Gillespie' implementation for any turnover sampler
-impl<'p, O: Scenario<M, Pcg<M>>, R: Reporter, P: LocalPartition<'p, R>, M: MathsCore>
-    Algorithm<'p, M, O, R, P> for GillespieAlgorithm
+impl<
+        'p,
+        O: Scenario<M, G>,
+        R: Reporter,
+        P: LocalPartition<'p, R>,
+        M: MathsCore,
+        G: SplittableRng<M>,
+    > Algorithm<'p, M, G, O, R, P> for GillespieAlgorithm
 where
     O::LineageStore<ClassicalLineageStore<M, O::Habitat>>:
         LocallyCoherentLineageStore<M, O::Habitat>,
 {
     type LineageStore = O::LineageStore<ClassicalLineageStore<M, O::Habitat>>;
-    type Rng = Pcg<M>;
 
     default fn get_logical_partition(args: &Self::Arguments, local_partition: &P) -> Partition {
         get_gillespie_logical_partition(args, local_partition)
     }
 
-    #[allow(clippy::shadow_unrelated, clippy::too_many_lines)]
     default fn initialise_and_simulate<I: Iterator<Item = u64>>(
         args: Self::Arguments,
-        rng: Self::Rng,
+        rng: G,
         scenario: O,
         pre_sampler: OriginPreSampler<M, I>,
         pause_before: Option<NonNegativeF64>,
         local_partition: &mut P,
-    ) -> Result<SimulationOutcome<M, Self::Rng>, Self::Error> {
+    ) -> Result<SimulationOutcome<M, G>, Self::Error> {
         launch::initialise_and_simulate(
             args,
             rng,
@@ -68,20 +71,19 @@ where
     ///
     /// Returns a `ContinueError::Sample` if initialising the resuming
     ///  simulation failed
-    #[allow(clippy::too_many_lines)]
     default fn resume_and_simulate<
         I: Iterator<Item = u64>,
         L: ExactSizeIterator<Item = Lineage>,
     >(
         args: Self::Arguments,
-        rng: Self::Rng,
+        rng: G,
         scenario: O,
         pre_sampler: OriginPreSampler<M, I>,
         lineages: L,
         resume_after: Option<NonNegativeF64>,
         pause_before: Option<NonNegativeF64>,
         local_partition: &mut P,
-    ) -> Result<SimulationOutcome<M, Self::Rng>, ResumeError<Self::Error>> {
+    ) -> Result<SimulationOutcome<M, G>, ResumeError<Self::Error>> {
         launch::initialise_and_simulate(
             args,
             rng,
@@ -100,17 +102,16 @@ where
     ///
     /// Returns a `ContinueError<Self::Error>` if fixing up the restarting
     ///  simulation (incl. running the algorithm) failed
-    #[allow(clippy::too_many_lines)]
     default fn fixup_for_restart<I: Iterator<Item = u64>, L: ExactSizeIterator<Item = Lineage>>(
         args: Self::Arguments,
-        rng: Self::Rng,
+        rng: G,
         scenario: O,
         pre_sampler: OriginPreSampler<M, I>,
         lineages: L,
         restart_at: PositiveF64,
         fixup_strategy: RestartFixUpStrategy,
         local_partition: &mut P,
-    ) -> Result<SimulationOutcome<M, Self::Rng>, ResumeError<Self::Error>> {
+    ) -> Result<SimulationOutcome<M, G>, ResumeError<Self::Error>> {
         launch::initialise_and_simulate(
             args,
             rng,
diff --git a/rustcoalescence/algorithms/independent/src/lib.rs b/rustcoalescence/algorithms/independent/src/lib.rs
index 8a7d0473d..7550642c9 100644
--- a/rustcoalescence/algorithms/independent/src/lib.rs
+++ b/rustcoalescence/algorithms/independent/src/lib.rs
@@ -4,7 +4,11 @@
 #[macro_use]
 extern crate serde_derive_state;
 
-use necsim_core::{cogs::MathsCore, lineage::Lineage, reporter::Reporter};
+use necsim_core::{
+    cogs::{MathsCore, PrimeableRng},
+    lineage::Lineage,
+    reporter::Reporter,
+};
 use necsim_core_bond::{NonNegativeF64, PositiveF64};
 
 use necsim_impls_no_std::cogs::{
@@ -39,13 +43,19 @@ impl AlgorithmParamters for IndependentAlgorithm {
 
 impl AlgorithmDefaults for IndependentAlgorithm {
     type MathsCore = IntrinsicsMathsCore;
+    type Rng<M: MathsCore> = WyHash<M>;
 }
 
-impl<'p, O: Scenario<M, WyHash<M>>, R: Reporter, P: LocalPartition<'p, R>, M: MathsCore>
-    Algorithm<'p, M, O, R, P> for IndependentAlgorithm
+impl<
+        'p,
+        O: Scenario<M, G>,
+        R: Reporter,
+        P: LocalPartition<'p, R>,
+        M: MathsCore,
+        G: PrimeableRng<M>,
+    > Algorithm<'p, M, G, O, R, P> for IndependentAlgorithm
 {
     type LineageStore = IndependentLineageStore<M, O::Habitat>;
-    type Rng = WyHash<M>;
 
     fn get_logical_partition(args: &Self::Arguments, local_partition: &P) -> Partition {
         match &args.parallelism_mode {
@@ -62,12 +72,12 @@ impl<'p, O: Scenario<M, WyHash<M>>, R: Reporter, P: LocalPartition<'p, R>, M: Ma
 
     fn initialise_and_simulate<I: Iterator<Item = u64>>(
         args: Self::Arguments,
-        rng: Self::Rng,
+        rng: G,
         scenario: O,
         pre_sampler: OriginPreSampler<M, I>,
         pause_before: Option<NonNegativeF64>,
         local_partition: &mut P,
-    ) -> Result<SimulationOutcome<M, Self::Rng>, Self::Error> {
+    ) -> Result<SimulationOutcome<M, G>, Self::Error> {
         launch::initialise_and_simulate(
             &args,
             rng,
@@ -85,14 +95,14 @@ impl<'p, O: Scenario<M, WyHash<M>>, R: Reporter, P: LocalPartition<'p, R>, M: Ma
     ///  simulation failed
     fn resume_and_simulate<I: Iterator<Item = u64>, L: ExactSizeIterator<Item = Lineage>>(
         args: Self::Arguments,
-        rng: Self::Rng,
+        rng: G,
         scenario: O,
         pre_sampler: OriginPreSampler<M, I>,
         lineages: L,
         resume_after: Option<NonNegativeF64>,
         pause_before: Option<NonNegativeF64>,
         local_partition: &mut P,
-    ) -> Result<SimulationOutcome<M, Self::Rng>, ResumeError<Self::Error>> {
+    ) -> Result<SimulationOutcome<M, G>, ResumeError<Self::Error>> {
         launch::initialise_and_simulate(
             &args,
             rng,
@@ -111,17 +121,16 @@ impl<'p, O: Scenario<M, WyHash<M>>, R: Reporter, P: LocalPartition<'p, R>, M: Ma
     ///
     /// Returns a `ContinueError<Self::Error>` if fixing up the restarting
     ///  simulation (incl. running the algorithm) failed
-    #[allow(clippy::too_many_lines)]
     fn fixup_for_restart<I: Iterator<Item = u64>, L: ExactSizeIterator<Item = Lineage>>(
         args: Self::Arguments,
-        rng: Self::Rng,
+        rng: G,
         scenario: O,
         pre_sampler: OriginPreSampler<M, I>,
         lineages: L,
         restart_at: PositiveF64,
         fixup_strategy: RestartFixUpStrategy,
         local_partition: &mut P,
-    ) -> Result<SimulationOutcome<M, Self::Rng>, ResumeError<Self::Error>> {
+    ) -> Result<SimulationOutcome<M, G>, ResumeError<Self::Error>> {
         launch::initialise_and_simulate(
             &args,
             rng,
diff --git a/rustcoalescence/algorithms/src/lib.rs b/rustcoalescence/algorithms/src/lib.rs
index 7ed7ce88a..da7ada445 100644
--- a/rustcoalescence/algorithms/src/lib.rs
+++ b/rustcoalescence/algorithms/src/lib.rs
@@ -27,17 +27,18 @@ pub trait AlgorithmParamters {
 
 pub trait AlgorithmDefaults {
     type MathsCore: MathsCore;
+    type Rng<M: MathsCore>: RngCore<M>;
 }
 
 pub trait Algorithm<
     'p,
     M: MathsCore,
-    O: Scenario<M, Self::Rng>,
+    G: RngCore<M>,
+    O: Scenario<M, G>,
     R: Reporter,
     P: LocalPartition<'p, R>,
 >: Sized + AlgorithmParamters + AlgorithmDefaults
 {
-    type Rng: RngCore<M>;
     type LineageStore: LineageStore<M, O::Habitat>;
 
     fn get_logical_partition(args: &Self::Arguments, local_partition: &P) -> Partition;
@@ -48,12 +49,12 @@ pub trait Algorithm<
     ///  the algorithm failed
     fn initialise_and_simulate<I: Iterator<Item = u64>>(
         args: Self::Arguments,
-        rng: Self::Rng,
+        rng: G,
         scenario: O,
         pre_sampler: OriginPreSampler<M, I>,
         pause_before: Option<NonNegativeF64>,
         local_partition: &mut P,
-    ) -> Result<SimulationOutcome<M, Self::Rng>, Self::Error>;
+    ) -> Result<SimulationOutcome<M, G>, Self::Error>;
 
     /// # Errors
     ///
@@ -62,14 +63,14 @@ pub trait Algorithm<
     #[allow(clippy::type_complexity, clippy::too_many_arguments)]
     fn resume_and_simulate<I: Iterator<Item = u64>, L: ExactSizeIterator<Item = Lineage>>(
         args: Self::Arguments,
-        rng: Self::Rng,
+        rng: G,
         scenario: O,
         pre_sampler: OriginPreSampler<M, I>,
         lineages: L,
         resume_after: Option<NonNegativeF64>,
         pause_before: Option<NonNegativeF64>,
         local_partition: &mut P,
-    ) -> Result<SimulationOutcome<M, Self::Rng>, ResumeError<Self::Error>>;
+    ) -> Result<SimulationOutcome<M, G>, ResumeError<Self::Error>>;
 
     /// # Errors
     ///
@@ -78,12 +79,12 @@ pub trait Algorithm<
     #[allow(clippy::type_complexity, clippy::too_many_arguments)]
     fn fixup_for_restart<I: Iterator<Item = u64>, L: ExactSizeIterator<Item = Lineage>>(
         args: Self::Arguments,
-        rng: Self::Rng,
+        rng: G,
         scenario: O,
         pre_sampler: OriginPreSampler<M, I>,
         lineages: L,
         restart_at: PositiveF64,
         fixup_strategy: RestartFixUpStrategy,
         local_partition: &mut P,
-    ) -> Result<SimulationOutcome<M, Self::Rng>, ResumeError<Self::Error>>;
+    ) -> Result<SimulationOutcome<M, G>, ResumeError<Self::Error>>;
 }
diff --git a/rustcoalescence/src/cli/simulate/dispatch/valid/algorithm_scenario.rs b/rustcoalescence/src/cli/simulate/dispatch/valid/algorithm_scenario.rs
index 22d818fd6..33d35657e 100644
--- a/rustcoalescence/src/cli/simulate/dispatch/valid/algorithm_scenario.rs
+++ b/rustcoalescence/src/cli/simulate/dispatch/valid/algorithm_scenario.rs
@@ -41,41 +41,42 @@ use super::{super::super::BufferingSimulateArgsBuilder, rng};
 
 macro_rules! match_scenario_algorithm {
     (
-        ($algorithm:expr, $scenario:expr => $algscen:ident) {
+        ($algorithm:expr, $scenario:expr => $algscen:ident : $algscenty:ident) {
             $($(#[$algmeta:meta])* $algpat:pat => $algcode:block),*
             <=>
-            $($(#[$scenmeta:meta])* $scenpat:pat => $scencode:block),*
+            $($(#[$scenmeta:meta])* $scenpat:pat => $scencode:block => $scenty:ident),*
         }
     ) => {
         match_scenario_algorithm! {
-            impl ($algorithm, $scenario => $algscen) {
+            impl ($algorithm, $scenario => $algscen : $algscenty) {
                 $($(#[$algmeta])* $algpat => $algcode),*
                 <=>
-                $($(#[$scenmeta])* $scenpat => $scencode),*
+                $($(#[$scenmeta])* $scenpat => $scencode => $scenty),*
                 <=>
             }
         }
     };
     (
-        impl ($algorithm:expr, $scenario:expr => $algscen:ident) {
+        impl ($algorithm:expr, $scenario:expr => $algscen:ident : $algscenty:ident) {
             $(#[$algmeta:meta])* $algpat:pat => $algcode:block,
             $($(#[$algmetarem:meta])* $algpatrem:pat => $algcoderem:block),+
             <=>
-            $($(#[$scenmeta:meta])* $scenpat:pat => $scencode:block),*
+            $($(#[$scenmeta:meta])* $scenpat:pat => $scencode:block => $scenty:ident),*
             <=>
             $($tail:tt)*
         }
     ) => {
         match_scenario_algorithm! {
-            impl ($algorithm, $scenario => $algscen) {
+            impl ($algorithm, $scenario => $algscen : $algscenty) {
                 $($(#[$algmetarem])* $algpatrem => $algcoderem),+
                 <=>
-                $($(#[$scenmeta])* $scenpat => $scencode),*
+                $($(#[$scenmeta])* $scenpat => $scencode => $scenty),*
                 <=>
                 $($tail)*
                 $(#[$algmeta])* $algpat => {
                     match $scenario {
                         $($(#[$scenmeta])* $scenpat => {
+                            type $algscenty<M, G> = $scenty<M, G>;
                             let $algscen = $scencode;
                             $algcode
                         }),*
@@ -85,10 +86,10 @@ macro_rules! match_scenario_algorithm {
         }
     };
     (
-        impl ($algorithm:expr, $scenario:expr => $algscen:ident) {
+        impl ($algorithm:expr, $scenario:expr => $algscen:ident : $algscenty:ident) {
             $(#[$algmeta:meta])* $algpat:pat => $algcode:block
             <=>
-            $($(#[$scenmeta:meta])* $scenpat:pat => $scencode:block),*
+            $($(#[$scenmeta:meta])* $scenpat:pat => $scencode:block => $scenty:ident),*
             <=>
             $($tail:tt)*
         }
@@ -98,6 +99,7 @@ macro_rules! match_scenario_algorithm {
             $(#[$algmeta])* $algpat => {
                 match $scenario {
                     $($(#[$scenmeta])* $scenpat => {
+                        type $algscenty<M, G> = $scenty<M, G>;
                         let $algscen = $scencode;
                         $algcode
                     }),*
@@ -107,7 +109,7 @@ macro_rules! match_scenario_algorithm {
     };
 }
 
-#[allow(clippy::too_many_arguments)]
+#[allow(clippy::too_many_arguments, clippy::too_many_lines)]
 pub(super) fn dispatch<'p, R: Reporter, P: LocalPartition<'p, R>>(
     local_partition: P,
 
@@ -121,13 +123,14 @@ pub(super) fn dispatch<'p, R: Reporter, P: LocalPartition<'p, R>>(
     normalised_args: &mut BufferingSimulateArgsBuilder,
 ) -> anyhow::Result<SimulationOutcome> {
     match_scenario_algorithm!(
-        (algorithm, scenario => scenario)
+        (algorithm, scenario => scenario: ScenarioTy)
     {
         #[cfg(feature = "gillespie-algorithms")]
         AlgorithmArgs::Gillespie(algorithm_args) => {
             rng::dispatch::<
                 <GillespieAlgorithm as AlgorithmDefaults>::MathsCore,
-                GillespieAlgorithm, _, R, P,
+                <GillespieAlgorithm as AlgorithmDefaults>::Rng<_>,
+                GillespieAlgorithm, ScenarioTy<_, _>, R, P,
             >(
                 local_partition, sample, algorithm_args, scenario,
                 pause_before, ron_args, normalised_args,
@@ -137,7 +140,8 @@ pub(super) fn dispatch<'p, R: Reporter, P: LocalPartition<'p, R>>(
         AlgorithmArgs::EventSkipping(algorithm_args) => {
             rng::dispatch::<
                 <EventSkippingAlgorithm as AlgorithmDefaults>::MathsCore,
-                EventSkippingAlgorithm, _, R, P,
+                <EventSkippingAlgorithm as AlgorithmDefaults>::Rng<_>,
+                EventSkippingAlgorithm, ScenarioTy<_, _>, R, P,
             >(
                 local_partition, sample, algorithm_args, scenario,
                 pause_before, ron_args, normalised_args,
@@ -147,7 +151,8 @@ pub(super) fn dispatch<'p, R: Reporter, P: LocalPartition<'p, R>>(
         AlgorithmArgs::Independent(algorithm_args) => {
             rng::dispatch::<
                 <IndependentAlgorithm as AlgorithmDefaults>::MathsCore,
-                IndependentAlgorithm, _, R, P,
+                <IndependentAlgorithm as AlgorithmDefaults>::Rng<_>,
+                IndependentAlgorithm, ScenarioTy<_, _>, R, P,
             >(
                 local_partition, sample, algorithm_args, scenario,
                 pause_before, ron_args, normalised_args,
@@ -157,7 +162,8 @@ pub(super) fn dispatch<'p, R: Reporter, P: LocalPartition<'p, R>>(
         AlgorithmArgs::Cuda(algorithm_args) => {
             rng::dispatch::<
                 <CudaAlgorithm as AlgorithmDefaults>::MathsCore,
-                CudaAlgorithm, _, R, P,
+                <CudaAlgorithm as AlgorithmDefaults>::Rng<_>,
+                CudaAlgorithm, ScenarioTy<_, _>, R, P,
             >(
                 local_partition, sample, algorithm_args, scenario,
                 pause_before, ron_args, normalised_args,
@@ -170,14 +176,14 @@ pub(super) fn dispatch<'p, R: Reporter, P: LocalPartition<'p, R>>(
                 scenario_args,
                 speciation_probability_per_generation,
             )?
-        },
+        } => SpatiallyExplicitUniformTurnoverScenario,
         #[cfg(feature = "spatially-explicit-turnover-map-scenario")]
         ScenarioArgs::SpatiallyExplicitTurnoverMap(scenario_args) => {
             SpatiallyExplicitTurnoverMapScenario::initialise(
                 scenario_args,
                 speciation_probability_per_generation,
             )?
-        },
+        } => SpatiallyExplicitTurnoverMapScenario,
         #[cfg(feature = "non-spatial-scenario")]
         ScenarioArgs::NonSpatial(scenario_args) => {
             NonSpatialScenario::initialise(
@@ -185,7 +191,7 @@ pub(super) fn dispatch<'p, R: Reporter, P: LocalPartition<'p, R>>(
                 speciation_probability_per_generation,
             )
             .into_ok()
-        },
+        } => NonSpatialScenario,
         #[cfg(feature = "almost-infinite-normal-dispersal-scenario")]
         ScenarioArgs::AlmostInfiniteNormalDispersal(scenario_args) => {
             AlmostInfiniteNormalDispersalScenario::initialise(
@@ -193,7 +199,7 @@ pub(super) fn dispatch<'p, R: Reporter, P: LocalPartition<'p, R>>(
                 speciation_probability_per_generation,
             )
             .into_ok()
-        },
+        } => AlmostInfiniteNormalDispersalScenario,
         #[cfg(feature = "almost-infinite-clark2dt-dispersal-scenario")]
         ScenarioArgs::AlmostInfiniteClark2DtDispersal(scenario_args) => {
             AlmostInfiniteClark2DtDispersalScenario::initialise(
@@ -201,7 +207,7 @@ pub(super) fn dispatch<'p, R: Reporter, P: LocalPartition<'p, R>>(
                 speciation_probability_per_generation,
             )
             .into_ok()
-        },
+        } => AlmostInfiniteClark2DtDispersalScenario,
         #[cfg(feature = "spatially-implicit-scenario")]
         ScenarioArgs::SpatiallyImplicit(scenario_args) => {
             SpatiallyImplicitScenario::initialise(
@@ -209,7 +215,7 @@ pub(super) fn dispatch<'p, R: Reporter, P: LocalPartition<'p, R>>(
                 speciation_probability_per_generation,
             )
             .into_ok()
-        },
+        } => SpatiallyImplicitScenario,
         #[cfg(feature = "wrapping-noise-scenario")]
         ScenarioArgs::WrappingNoise(scenario_args) => {
             WrappingNoiseScenario::initialise(
@@ -217,6 +223,6 @@ pub(super) fn dispatch<'p, R: Reporter, P: LocalPartition<'p, R>>(
                 speciation_probability_per_generation,
             )
             .into_ok()
-        }
+        } => WrappingNoiseScenario
     })
 }
diff --git a/rustcoalescence/src/cli/simulate/dispatch/valid/info.rs b/rustcoalescence/src/cli/simulate/dispatch/valid/info.rs
index aac4223c9..1b390136c 100644
--- a/rustcoalescence/src/cli/simulate/dispatch/valid/info.rs
+++ b/rustcoalescence/src/cli/simulate/dispatch/valid/info.rs
@@ -5,7 +5,7 @@ use anyhow::{Context, Result};
 use rustcoalescence_algorithms::{result::SimulationOutcome, Algorithm};
 
 use necsim_core::{
-    cogs::MathsCore,
+    cogs::{MathsCore, RngCore},
     reporter::{boolean::Boolean, Reporter},
 };
 use necsim_core_bond::NonNegativeF64;
@@ -25,23 +25,23 @@ use super::{super::super::BufferingSimulateArgsBuilder, launch};
 pub(super) fn dispatch<
     'p,
     M: MathsCore,
-    A: Algorithm<'p, M, O, R, P>,
-    O: Scenario<M, A::Rng>,
+    G: RngCore<M>,
+    A: Algorithm<'p, M, G, O, R, P>,
+    O: Scenario<M, G>,
     R: Reporter,
     P: LocalPartition<'p, R>,
 >(
     algorithm_args: A::Arguments,
-    rng: A::Rng,
+    rng: G,
     scenario: O,
     sample: Sample,
     pause_before: Option<NonNegativeF64>,
     mut local_partition: P,
 
     normalised_args: &BufferingSimulateArgsBuilder,
-) -> anyhow::Result<SimulationOutcome<M, A::Rng>>
+) -> anyhow::Result<SimulationOutcome<M, G>>
 where
-    Result<SimulationOutcome<M, A::Rng>, A::Error>:
-        anyhow::Context<SimulationOutcome<M, A::Rng>, A::Error>,
+    Result<SimulationOutcome<M, G>, A::Error>: anyhow::Context<SimulationOutcome<M, G>, A::Error>,
 {
     let config_str = normalised_args
         .build()
@@ -118,7 +118,7 @@ where
         warn!("The simulation will report no events.");
     }
 
-    let result = launch::simulate::<M, A, O, R, P>(
+    let result = launch::simulate::<M, G, A, O, R, P>(
         algorithm_args,
         rng,
         scenario,
diff --git a/rustcoalescence/src/cli/simulate/dispatch/valid/launch.rs b/rustcoalescence/src/cli/simulate/dispatch/valid/launch.rs
index e070202d6..ceb5ff4c8 100644
--- a/rustcoalescence/src/cli/simulate/dispatch/valid/launch.rs
+++ b/rustcoalescence/src/cli/simulate/dispatch/valid/launch.rs
@@ -2,7 +2,10 @@ use anyhow::Context;
 
 use rustcoalescence_algorithms::{result::SimulationOutcome, Algorithm};
 
-use necsim_core::{cogs::MathsCore, reporter::Reporter};
+use necsim_core::{
+    cogs::{MathsCore, RngCore},
+    reporter::Reporter,
+};
 use necsim_core_bond::{NonNegativeF64, PositiveF64};
 use necsim_impls_no_std::cogs::origin_sampler::pre_sampler::OriginPreSampler;
 use necsim_partitioning_core::LocalPartition;
@@ -14,18 +17,19 @@ use crate::args::config::sample::{Sample, SampleMode, SampleModeRestart, SampleO
 pub(super) fn simulate<
     'p,
     M: MathsCore,
-    A: Algorithm<'p, M, O, R, P>,
-    O: Scenario<M, A::Rng>,
+    G: RngCore<M>,
+    A: Algorithm<'p, M, G, O, R, P>,
+    O: Scenario<M, G>,
     R: Reporter,
     P: LocalPartition<'p, R>,
 >(
     algorithm_args: A::Arguments,
-    rng: A::Rng,
+    rng: G,
     scenario: O,
     sample: Sample,
     pause_before: Option<NonNegativeF64>,
     local_partition: &mut P,
-) -> anyhow::Result<SimulationOutcome<M, A::Rng>> {
+) -> anyhow::Result<SimulationOutcome<M, G>> {
     let lineages = match sample.origin {
         SampleOrigin::Habitat => {
             return A::initialise_and_simulate(
diff --git a/rustcoalescence/src/cli/simulate/dispatch/valid/rng.rs b/rustcoalescence/src/cli/simulate/dispatch/valid/rng.rs
index 929556339..2907eddc8 100644
--- a/rustcoalescence/src/cli/simulate/dispatch/valid/rng.rs
+++ b/rustcoalescence/src/cli/simulate/dispatch/valid/rng.rs
@@ -27,8 +27,9 @@ use super::{
 pub(super) fn dispatch<
     'p,
     M: MathsCore,
-    A: Algorithm<'p, M, O, R, P>,
-    O: Scenario<M, A::Rng>,
+    G: RngCore<M>,
+    A: Algorithm<'p, M, G, O, R, P>,
+    O: Scenario<M, G>,
     R: Reporter,
     P: LocalPartition<'p, R>,
 >(
@@ -43,17 +44,16 @@ pub(super) fn dispatch<
     normalised_args: &mut BufferingSimulateArgsBuilder,
 ) -> anyhow::Result<SimulationOutcome>
 where
-    Result<AlgorithmOutcome<M, A::Rng>, A::Error>:
-        anyhow::Context<AlgorithmOutcome<M, A::Rng>, A::Error>,
+    Result<AlgorithmOutcome<M, G>, A::Error>: anyhow::Context<AlgorithmOutcome<M, G>, A::Error>,
 {
-    let rng: A::Rng = match parse::rng::parse_and_normalise(
+    let rng: G = match parse::rng::parse_and_normalise(
         ron_args,
         normalised_args,
         &mut A::get_logical_partition(&algorithm_args, &local_partition),
     )? {
         RngArgs::Seed(seed) => SeedableRng::seed_from_u64(seed),
         RngArgs::Sponge(bytes) => {
-            let mut seed = <A::Rng as RngCore<M>>::Seed::default();
+            let mut seed = G::Seed::default();
 
             let mut sponge = Keccak::v256();
             sponge.update(&bytes);
@@ -64,7 +64,7 @@ where
         RngArgs::State(state) => state.into(),
     };
 
-    let result = info::dispatch::<M, A, O, R, P>(
+    let result = info::dispatch::<M, G, A, O, R, P>(
         algorithm_args,
         rng,
         scenario,