From f4718dc4bc407173c93c57a98d520310b3e6c785 Mon Sep 17 00:00:00 2001
From: "Christopher H. Jordan" <christopherjordan87@gmail.com>
Date: Sun, 11 Jun 2023 21:54:31 +0800
Subject: [PATCH] HIP support.

---
 .github/workflows/releases.yml            |   2 +-
 .github/workflows/run-tests.yml           |   5 +-
 CHANGELOG.md                              |   2 +
 Cargo.lock                                |  20 ++
 Cargo.toml                                |  19 +-
 benches/bench.rs                          |  12 +-
 build.rs                                  | 240 +++++++------
 mdbook/src/installation/from_source.md    |  41 ++-
 mdbook/src/installation/pre_compiled.md   |   9 +-
 mdbook/src/user/beam.md                   |   6 +-
 src/beam/error.rs                         |   4 +-
 src/beam/fee.rs                           |  28 +-
 src/beam/mod.rs                           |  66 ++--
 src/beam/tests.rs                         |  12 +-
 src/cli/beam.rs                           |  38 +-
 src/cli/common/mod.rs                     |  33 +-
 src/cli/di_calibrate/error.rs             | 177 ---------
 src/cli/di_calibrate/tests.rs             |  16 +-
 src/cli/error.rs                          |  18 +-
 src/cli/vis_simulate/tests.rs             |  10 +-
 src/cuda/utils.cu                         |  40 ---
 src/{cuda => gpu}/common.cuh              | 124 ++++---
 src/{cuda => gpu}/compile_flags.txt       |   1 +
 src/{cuda => gpu}/mod.rs                  | 231 ++++++++----
 src/{cuda => gpu}/model.cu                |  54 ++-
 src/{cuda => gpu}/model.h                 |   0
 src/{cuda => gpu}/model_double.rs         |   0
 src/{cuda => gpu}/model_single.rs         |   0
 src/{cuda => gpu}/tests.rs                |  17 +-
 src/{cuda => gpu}/types.h                 |   0
 src/{cuda => gpu}/types_double.rs         |   0
 src/{cuda => gpu}/types_single.rs         |   0
 src/{cuda => gpu}/update_rust_bindings.sh |   6 +-
 src/gpu/utils.cu                          |  68 ++++
 src/{cuda => gpu}/utils.h                 |   6 +-
 src/{cuda => gpu}/utils.rs                |  69 ++--
 src/{cuda => gpu}/utils_bindings.rs       |   2 +-
 src/lib.rs                                |   8 +-
 src/model/error.rs                        |   4 +-
 src/model/{cuda.rs => gpu.rs}             | 416 +++++++++++-----------
 src/model/mod.rs                          |  61 ++--
 src/model/tests/cpu.rs                    |   4 +-
 src/model/tests/{cuda.rs => gpu.rs}       |  50 +--
 src/model/tests/mod.rs                    |  30 +-
 src/params/vis_subtract.rs                |   4 +-
 src/solutions/mod.rs                      |   3 +-
 46 files changed, 1036 insertions(+), 920 deletions(-)
 delete mode 100644 src/cli/di_calibrate/error.rs
 delete mode 100644 src/cuda/utils.cu
 rename src/{cuda => gpu}/common.cuh (63%)
 rename src/{cuda => gpu}/compile_flags.txt (93%)
 rename src/{cuda => gpu}/mod.rs (57%)
 rename src/{cuda => gpu}/model.cu (95%)
 rename src/{cuda => gpu}/model.h (100%)
 rename src/{cuda => gpu}/model_double.rs (100%)
 rename src/{cuda => gpu}/model_single.rs (100%)
 rename src/{cuda => gpu}/tests.rs (78%)
 rename src/{cuda => gpu}/types.h (100%)
 rename src/{cuda => gpu}/types_double.rs (100%)
 rename src/{cuda => gpu}/types_single.rs (100%)
 rename src/{cuda => gpu}/update_rust_bindings.sh (90%)
 create mode 100644 src/gpu/utils.cu
 rename src/{cuda => gpu}/utils.h (77%)
 rename src/{cuda => gpu}/utils.rs (52%)
 rename src/{cuda => gpu}/utils_bindings.rs (96%)
 rename src/model/{cuda.rs => gpu.rs} (75%)
 rename src/model/tests/{cuda.rs => gpu.rs} (94%)

diff --git a/.github/workflows/releases.yml b/.github/workflows/releases.yml
index b3dd9770..fae5853d 100644
--- a/.github/workflows/releases.yml
+++ b/.github/workflows/releases.yml
@@ -67,7 +67,7 @@ jobs:
           LICENSE COPYING-hdf5 LICENSE-erfa LICENSE-cfitsio LICENSE-NVIDIA README.md \
           hyperdrive
 
-          cargo build --profile production --locked --no-default-features --features=hdf5-static,cfitsio-static,cuda-single
+          cargo build --profile production --locked --no-default-features --features=hdf5-static,cfitsio-static,cuda,gpu-single
           mv target/production/hyperdrive .
           tar -acvf mwa_hyperdrive-$(git describe --tags)-Linux-x86-64-v3-CUDA-single.tar.gz \
           LICENSE COPYING-hdf5 LICENSE-erfa LICENSE-cfitsio LICENSE-NVIDIA README.md \
diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
index 7fdea7e6..c92bb76b 100644
--- a/.github/workflows/run-tests.yml
+++ b/.github/workflows/run-tests.yml
@@ -80,4 +80,7 @@ jobs:
           cargo clean
           cargo +${MIN_RUST} test --locked
           cargo +${MIN_RUST} test --locked --features=all-static
-          cargo +${MIN_RUST} test --locked --all-features --no-run
+          # Can't test with --all-features, cuda and hip aren't allowed together
+          # hip is also difficult to install so ignore it
+          cargo +${MIN_RUST} test --locked --features=all-static,cuda --no-run
+          cargo +${MIN_RUST} test --locked --features=all-static,cuda,gpu-single --no-run
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 50bf0f90..207b5895 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,8 @@ Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [0.3.0] - Unreleased
 ### Added
+- Support for HIP, which allows AMD GPUs to be used instead of only NVIDIA GPUs
+  via CUDA.
 - Support for the "DipAmps" column in a metafits file. This allows users to
   control dipole gains in beam code.
 - Support for averaging incoming visibilities in time and frequency *before*
diff --git a/Cargo.lock b/Cargo.lock
index 9b497d45..b7845b1b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1078,6 +1078,24 @@ dependencies = [
  "serde_derive",
 ]
 
+[[package]]
+name = "hip-runtime-sys"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "901a5d54cfff799dd9e6f6e1d53883bb50afdd92edce7680b1ca299d1805f65a"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "hip-sys"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f399629f98b6249efc10039e949baa70e2ff2bf84d3c30e7a0ca49f179a04287"
+dependencies = [
+ "hip-runtime-sys",
+]
+
 [[package]]
 name = "humantime"
 version = "2.1.0"
@@ -1456,6 +1474,7 @@ dependencies = [
  "cuda-runtime-sys",
  "hdf5",
  "hdf5-sys",
+ "hip-sys",
  "marlu",
  "ndarray",
  "num-complex",
@@ -1490,6 +1509,7 @@ dependencies = [
  "flate2",
  "glob",
  "hifitime",
+ "hip-sys",
  "indexmap 1.9.3",
  "indicatif",
  "indoc",
diff --git a/Cargo.toml b/Cargo.toml
index 613b91a0..42ab3ed6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -30,13 +30,14 @@ inherits = "production"
 default = ["plotting"]
 
 # Use CUDA code with double-precision floats.
-cuda = ["mwa_hyperbeam/cuda", "cuda-runtime-sys"]
+cuda = ["mwa_hyperbeam/cuda", "cuda-runtime-sys", "cc"]
 
-# Use CUDA code with single-precision floats.
-cuda-single = [
-    "cuda",
-    "mwa_hyperbeam/gpu-single",
-]
+# Use HIP code with double-precision floats.
+hip = ["mwa_hyperbeam/hip", "hip-sys", "cc"]
+
+# Opt-out of GPU double precision, use only single precision (faster on desktop
+# GPUs).
+gpu-single = ["mwa_hyperbeam/gpu-single"]
 
 # Enable plotting.
 plotting = ["plotters"]
@@ -100,6 +101,9 @@ vec1 = { version = "1.5.0", features = ["serde"] }
 # "cuda" feature
 cuda-runtime-sys = { version = "0.3.0-alpha.1", optional = true }
 
+# "hip" feature
+hip-sys = { version = "0.1.0", optional = true }
+
 # "plotting" feature
 plotters = { version = "0.3.5", default-features = false, features = [
     "bitmap_backend",
@@ -121,7 +125,8 @@ tempfile = "3.6.0"
 
 [build-dependencies]
 built = { version = "0.6.0", features = ["chrono", "git2"] }
-cc = { version = "1.0.72", features = ["parallel"] }
+cc = { version = "1.0.72", features = ["parallel"], optional = true }
+hip-sys = { version = "0.1.0", optional = true }
 
 [[bench]]
 name = "bench"
diff --git a/benches/bench.rs b/benches/bench.rs
index d10f7c88..0320f27c 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -92,7 +92,7 @@ fn model_benchmarks(c: &mut Criterion) {
         );
     }
 
-    #[cfg(feature = "cuda")]
+    #[cfg(any(feature = "cuda", feature = "hip"))]
     for (num_sources, num_chans) in [
         (1, 768),
         (32, 768),
@@ -130,7 +130,7 @@ fn model_benchmarks(c: &mut Criterion) {
                         },
                     );
                 }
-                let modeller = model::SkyModellerCuda::new(
+                let modeller = model::SkyModellerGpu::new(
                     &*beam,
                     &source_list,
                     Polarisations::default(),
@@ -209,7 +209,7 @@ fn model_benchmarks(c: &mut Criterion) {
         })
     });
 
-    #[cfg(feature = "cuda")]
+    #[cfg(any(feature = "cuda", feature = "hip"))]
     for (num_sources, num_chans) in [
         (1, 768),
         (32, 768),
@@ -251,7 +251,7 @@ fn model_benchmarks(c: &mut Criterion) {
                         },
                     );
                 }
-                let modeller = model::SkyModellerCuda::new(
+                let modeller = model::SkyModellerGpu::new(
                     &*beam,
                     &source_list,
                     Polarisations::default(),
@@ -342,7 +342,7 @@ fn model_benchmarks(c: &mut Criterion) {
         },
     );
 
-    #[cfg(feature = "cuda")]
+    #[cfg(any(feature = "cuda", feature = "hip"))]
     for (num_sources, num_chans) in [
         (1, 768),
         (32, 768),
@@ -393,7 +393,7 @@ fn model_benchmarks(c: &mut Criterion) {
                         },
                     );
                 }
-                let modeller = model::SkyModellerCuda::new(
+                let modeller = model::SkyModellerGpu::new(
                     &*beam,
                     &source_list,
                     Polarisations::default(),
diff --git a/build.rs b/build.rs
index c9b3a4e9..332a1a2e 100644
--- a/build.rs
+++ b/build.rs
@@ -28,140 +28,168 @@ fn write_built(out_dir: &Path) {
 fn main() {
     println!("cargo:rerun-if-changed=build.rs");
 
+    #[cfg(all(feature = "cuda", feature = "hip"))]
+    compile_error!("Both 'cuda' and 'hip' features are enabled; only one can be used.");
+    #[cfg(all(not(feature = "cuda"), not(feature = "hip"), feature = "gpu-single"))]
+    compile_error!(
+        "The 'gpu-single' feature must be used with either of the 'cuda' or 'hip' features."
+    );
+
     let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR env. variable not defined!"));
     write_built(&out_dir);
 
-    #[cfg(feature = "cuda")]
-    cuda::build_and_link();
+    #[cfg(any(feature = "cuda", feature = "hip"))]
+    gpu::build_and_link();
 }
 
-#[cfg(feature = "cuda")]
-mod cuda {
-    use std::env;
-
-    const DEFAULT_CUDA_ARCHES: &[u16] = &[60, 70, 80];
-    const DEFAULT_CUDA_SMS: &[u16] = &[60, 70, 75, 80, 86];
-
-    fn parse_and_validate_compute(c: &str, var: &str) -> Vec<u16> {
-        let mut out = vec![];
-        for compute in c.trim().split(',') {
-            // Check that there's only two numeric characters.
-            if compute.len() != 2 {
-                panic!("When parsing {var}, found '{compute}', which is not a two-digit number!")
+#[cfg(any(feature = "cuda", feature = "hip"))]
+mod gpu {
+    use std::{env, path::PathBuf};
+
+    /// Search for any C/C++/CUDA/HIP files, populate the provided buffer with
+    /// them, and have rerun-if-changed on all of them.
+    #[cfg(any(feature = "cuda", feature = "hip"))]
+    fn get_gpu_files<P: AsRef<std::path::Path>>(dir: P, files: &mut Vec<PathBuf>) {
+        for path in std::fs::read_dir(dir).expect("dir exists") {
+            let path = path.expect("is readable").path();
+            if path.is_dir() {
+                get_gpu_files(&path, files)
             }
 
-            match compute.parse() {
-                Ok(p) => out.push(p),
-                Err(_) => panic!("'{compute}', part of {var}, couldn't be parsed into a number!"),
+            match path.extension().and_then(|os_str| os_str.to_str()) {
+                Some("cu") => {
+                    println!("cargo:rerun-if-changed={}", path.display());
+                    files.push(path);
+                }
+                Some("h" | "cuh") => println!("cargo:rerun-if-changed={}", path.display()),
+                _ => (),
             }
         }
-        out
     }
 
     pub(super) fn build_and_link() {
-        // Attempt to read HYPERDRIVE_CUDA_COMPUTE. HYPERBEAM_CUDA_COMPUTE can be
-        // used instead, too.
-        println!("cargo:rerun-if-env-changed=HYPERDRIVE_CUDA_COMPUTE");
-        println!("cargo:rerun-if-env-changed=HYPERBEAM_CUDA_COMPUTE");
-        let (arches, sms) = match (
-            env::var("HYPERDRIVE_CUDA_COMPUTE"),
-            env::var("HYPERBEAM_CUDA_COMPUTE"),
-        ) {
-            // When a user-supplied variable exists, use it as the CUDA arch and
-            // compute level.
-            (Ok(c), _) | (Err(_), Ok(c)) => {
-                let compute = parse_and_validate_compute(&c, "HYPERDRIVE_CUDA_COMPUTE");
-                let sms = compute.clone();
-                (compute, sms)
-            }
-            (Err(_), Err(_)) => {
-                // Print out all of the default arches and computes as a
-                // warning.
-                println!("cargo:warning=No HYPERDRIVE_CUDA_COMPUTE; Passing arch=compute_{DEFAULT_CUDA_ARCHES:?} and code=sm_{DEFAULT_CUDA_SMS:?} to nvcc");
-                (DEFAULT_CUDA_ARCHES.to_vec(), DEFAULT_CUDA_SMS.to_vec())
-            }
-        };
-
-        // Compile all CUDA source files into a single library. Find .cu, .h and
-        // .cuh files; if any of them change, tell cargo to recompile.
-        let mut cuda_files = vec![];
-        for entry in std::fs::read_dir("src/cuda").expect("src/cuda directory doesn't exist!") {
-            let entry = entry.expect("Couldn't access file in src/cuda directory");
-            let path = entry.path();
-            // Skip this entry if it isn't a file.
-            if !path.is_file() {
-                continue;
+        let mut gpu_files = vec![];
+        get_gpu_files("src/gpu", &mut gpu_files);
+
+        #[cfg(feature = "cuda")]
+        let mut gpu_target = {
+            const DEFAULT_CUDA_ARCHES: &[u16] = &[60, 70, 80];
+            const DEFAULT_CUDA_SMS: &[u16] = &[60, 70, 75, 80, 86];
+
+            fn parse_and_validate_compute(c: &str, var: &str) -> Vec<u16> {
+                let mut out = vec![];
+                for compute in c.trim().split(',') {
+                    // Check that there's only two numeric characters.
+                    if compute.len() != 2 {
+                        panic!("When parsing {var}, found '{compute}', which is not a two-digit number!")
+                    }
+
+                    match compute.parse() {
+                        Ok(p) => out.push(p),
+                        Err(_) => {
+                            panic!("'{compute}', part of {var}, couldn't be parsed into a number!")
+                        }
+                    }
+                }
+                out
             }
 
-            match path.extension().and_then(|os_str| os_str.to_str()) {
-                // Track this file if it's extension is .cu
-                Some("cu") => {
-                    println!("cargo:rerun-if-changed={}", path.display());
-                    // Add this .cu file to be compiled later.
-                    cuda_files.push(path);
+            // Attempt to read HYPERDRIVE_CUDA_COMPUTE. HYPERBEAM_CUDA_COMPUTE can be
+            // used instead, too.
+            println!("cargo:rerun-if-env-changed=HYPERDRIVE_CUDA_COMPUTE");
+            println!("cargo:rerun-if-env-changed=HYPERBEAM_CUDA_COMPUTE");
+            let (arches, sms) = match (
+                env::var("HYPERDRIVE_CUDA_COMPUTE"),
+                env::var("HYPERBEAM_CUDA_COMPUTE"),
+            ) {
+                // When a user-supplied variable exists, use it as the CUDA arch and
+                // compute level.
+                (Ok(c), _) | (Err(_), Ok(c)) => {
+                    let compute = parse_and_validate_compute(&c, "HYPERDRIVE_CUDA_COMPUTE");
+                    let sms = compute.clone();
+                    (compute, sms)
                 }
-
-                Some("h" | "cuh") => {
-                    println!("cargo:rerun-if-changed={}", path.display());
+                (Err(_), Err(_)) => {
+                    // Print out all of the default arches and computes as a
+                    // warning.
+                    println!("cargo:warning=No HYPERDRIVE_CUDA_COMPUTE; Passing arch=compute_{DEFAULT_CUDA_ARCHES:?} and code=sm_{DEFAULT_CUDA_SMS:?} to nvcc");
+                    (DEFAULT_CUDA_ARCHES.to_vec(), DEFAULT_CUDA_SMS.to_vec())
+                }
+            };
+
+            let mut cuda_target = cc::Build::new();
+            cuda_target.cuda(true).cudart("shared"); // We handle linking cudart statically
+
+            // If $CXX is not set but $CUDA_PATH is, search for
+            // $CUDA_PATH/bin/g++ and if it exists, set that as $CXX.
+            if env::var_os("CXX").is_none() {
+                // Unlike above, we care about $CUDA_PATH being unicode.
+                if let Ok(cuda_path) = env::var("CUDA_PATH") {
+                    // Look for the g++ that CUDA wants.
+                    let compiler = std::path::PathBuf::from(cuda_path).join("bin/g++");
+                    if compiler.exists() {
+                        println!("cargo:warning=Setting $CXX to {}", compiler.display());
+                        env::set_var("CXX", compiler.into_os_string());
+                    }
                 }
-
-                _ => (),
             }
-        }
 
-        let mut cuda_target = cc::Build::new();
-        cuda_target
-            .cuda(true)
-            .cudart("shared") // We handle linking cudart statically
-            .define(
-                // The DEBUG env. variable is set by cargo. If running "cargo build
-                // --release", DEBUG is "false", otherwise "true". C/C++/CUDA like
-                // the compile option "NDEBUG" to be defined when using assert.h, so
-                // if appropriate, define that here. We also define "DEBUG" so that
-                // can be used.
-                match env::var("DEBUG").as_deref() {
-                    Ok("false") => "NDEBUG",
-                    _ => "DEBUG",
-                },
-                None,
-            );
-        // If $CXX is not set but $CUDA_PATH is, search for $CUDA_PATH/bin/g++
-        // and if it exists, set that as $CXX.
-        if env::var_os("CXX").is_none() {
-            // Unlike above, we care about $CUDA_PATH being unicode.
-            if let Ok(cuda_path) = env::var("CUDA_PATH") {
-                // Look for the g++ that CUDA wants.
-                let compiler = std::path::PathBuf::from(cuda_path).join("bin/g++");
-                if compiler.exists() {
-                    println!("cargo:warning=Setting $CXX to {}", compiler.display());
-                    env::set_var("CXX", compiler.into_os_string());
+            // Loop over each arch and sm
+            for arch in arches {
+                for &sm in &sms {
+                    if sm < arch {
+                        continue;
+                    }
+
+                    cuda_target.flag("-gencode");
+                    cuda_target.flag(&format!("arch=compute_{arch},code=sm_{sm}"));
                 }
             }
-        }
 
-        // Loop over each arch and sm
-        for arch in arches {
-            for &sm in &sms {
-                if sm < arch {
-                    continue;
-                }
+            cuda_target
+        };
 
-                cuda_target.flag("-gencode");
-                cuda_target.flag(&format!("arch=compute_{arch},code=sm_{sm}"));
+        #[cfg(feature = "hip")]
+        let mut gpu_target = {
+            let hip_path = hip_sys::hiprt::get_hip_path();
+
+            // It seems that various ROCm releases change where hipcc is...
+            let mut compiler = hip_path.join("bin/hipcc");
+            if !compiler.exists() {
+                // Try the dir above, which might be the ROCm dir.
+                compiler = hip_path.join("../bin/hipcc");
             }
-        }
+            let mut hip_target = cc::Build::new();
+            hip_target
+                .compiler(compiler)
+                .include(hip_path.join("include/hip"));
+            hip_target
+        };
 
-        // If we're told to, use single-precision floats. The default in the CUDA
+        gpu_target.define(
+            // The DEBUG env. variable is set by cargo. If running "cargo build
+            // --release", DEBUG is "false", otherwise "true". C/C++/CUDA like
+            // the compile option "NDEBUG" to be defined when using assert.h, so
+            // if appropriate, define that here. We also define "DEBUG" so that
+            // can be used.
+            match env::var("DEBUG").as_deref() {
+                Ok("false") => "NDEBUG",
+                _ => "DEBUG",
+            },
+            None,
+        );
+
+        // If we're told to, use single-precision floats. The default in the GPU
         // code is to use double-precision.
-        #[cfg(feature = "cuda-single")]
-        cuda_target.define("SINGLE", None);
+        #[cfg(feature = "gpu-single")]
+        gpu_target.define("SINGLE", None);
 
         // Break in case of emergency.
-        // cuda_target.debug(true);
+        // gpu_target.debug(true);
 
-        for f in cuda_files {
-            cuda_target.file(f);
+        for f in gpu_files {
+            gpu_target.file(f);
         }
-        cuda_target.compile("hyperdrive_cu");
+        gpu_target.compile("hyperdrive_gpu");
     }
 }
diff --git a/mdbook/src/installation/from_source.md b/mdbook/src/installation/from_source.md
index 415438aa..97d6faca 100644
--- a/mdbook/src/installation/from_source.md
+++ b/mdbook/src/installation/from_source.md
@@ -35,8 +35,8 @@
   time.
 ```
 
-```admonish tip title="CUDA (for accelerated sky modelling)"
-- Only required if either the `cuda` or `cuda-single` feature is enabled
+```admonish tip title="CUDA (for accelerated sky modelling with NVIDIA GPUs)"
+- Only required if the `cuda` feature is enabled
 - Requires a [CUDA-capable device](https://developer.nvidia.com/cuda-gpus)
 - Arch: `cuda`
 - Ubuntu and others: [Download link](https://developer.nvidia.com/cuda-zone)
@@ -45,6 +45,20 @@
 - Can link statically; use the `cuda-static` or `all-static` features.
 ```
 
+```admonish tip title="HIP (for accelerated sky modelling with AMD GPUs)"
+- Only required if either the `hip` feature is enabled
+- Requires a [HIP-capable device](https://docs.amd.com/en/latest/release/gpu_os_support.html) (N.B. This seems to be incomplete)
+- Arch:
+  - See [https://wiki.archlinux.org/title/GPGPU#ROCm](https://wiki.archlinux.org/title/GPGPU#ROCm)
+  - It is possible to get pre-compiled products from the [arch4edu repo](https://github.com/arch4edu/arch4edu).
+- Ubuntu and others: [Download link](https://docs.amd.com/projects/HIP/en/docs-5.3.0/how_to_guides/install.html)
+- The installation dir can be specified manually with `HIP_PATH`
+  - If not specified, `/opt/rocm/hip` is used.
+- N.B. Despite HIP installations being able to run HIP code on NVIDIA GPUs,
+  this is not supported by `hyperdrive`; please compile with the CUDA
+  instructions above.
+```
+
 ## Installing Rust
 
 ~~~admonish tip title="TL;DR"
@@ -103,7 +117,7 @@ export HYPERDRIVE_CUDA_COMPUTE=75
 Now you can compile `hyperdrive` with CUDA enabled (single-precision floats):
 
 ```shell
-cargo install --path . --locked --features=cuda-single
+cargo install --path . --locked --features=cuda,gpu-single
 ```
 
 If you're using "datacentre" products (e.g. a V100 available on the
@@ -123,6 +137,27 @@ compiler happy. You can select a custom C++ compiler with the `CXX` variable,
 e.g. `CXX=/opt/cuda/bin/g++`.
 ~~~
 
+~~~admonish danger title="HIP"
+Do you have a HIP-capable AMD GPU? Ensure you have installed HIP (instructions
+are above), and compile with the `hip` feature (single-precision floats):
+
+```shell
+cargo install --path . --locked --features=hip,gpu-single
+```
+
+If you're using "datacentre" products (e.g. the GPUs on the "setonix"
+supercomputer), you probably want double-precision floats:
+
+```shell
+cargo install --path . --locked --features=hip
+```
+
+You can still compile with double-precision on a desktop GPU, but it will be
+much slower than single-precision.
+
+If you are encountering problems, you may need to set your `HIP_PATH` variable.
+~~~
+
 ~~~admonish tip title="Static dependencies"
 The aforementioned C libraries can each be compiled by `cargo`. `all-static`
 will statically-link all dependencies (including CUDA, if CUDA is enabled) such
diff --git a/mdbook/src/installation/pre_compiled.md b/mdbook/src/installation/pre_compiled.md
index 8aa22f24..24aee048 100644
--- a/mdbook/src/installation/pre_compiled.md
+++ b/mdbook/src/installation/pre_compiled.md
@@ -14,6 +14,11 @@ the "CUDA-single" release. You can still use the double-precision version on a
 desktop GPU, but it will be much slower than single-precision. Instructions to
 install CUDA are on [the next page](from_source.md#cuda).
 
+It is possible to run `hyperdrive` with HIP (i.e. the AMD equivalent to
+NVIDIA's CUDA), but HIP does not appear to offer static libraries, so no static
+feature is provided, and users will need to compile hyperdrive themselves with
+instructions on [the next page](from_source.md#gpu).
+
 ~~~admonish
 The pre-compiled binaries are made by GitHub actions using:
 ```shell
@@ -21,6 +26,6 @@ cargo build --release --locked --no-default-features --features=hdf5-static,cfit
 ```
 This means they cannot plot calibration solutions.
 "CUDA-double" binaries have the `cuda` feature and "CUDA-single" binaries have
-the `cuda-single` feature. CUDA cannot legally be statically linked so a local
-installation of CUDA is required.
+the `cuda` and `gpu-single` features. CUDA cannot legally be statically linked
+so a local installation of CUDA is required.
 ~~~
diff --git a/mdbook/src/user/beam.md b/mdbook/src/user/beam.md
index ab0d8a92..31c8ba83 100644
--- a/mdbook/src/user/beam.md
+++ b/mdbook/src/user/beam.md
@@ -9,9 +9,9 @@ in steps of `--step`, then for each of these zenith angles, moving from 0 to
 \\( 2 \pi \\) in steps of `--step` for the azimuth. Using a smaller `--step`
 will generate many more responses, so be aware that it might take a while.
 
-~~~admonish danger title="CUDA"
-If CUDA is available to you, the `--cuda` flag will generate the beam responses
-on the GPU, vastly decreasing the time taken.
+~~~admonish danger title="CUDA/HIP"
+If CUDA or HIP is available to you, the `--gpu` flag will generate the beam
+responses on the GPU, vastly decreasing the time taken.
 ~~~
 
 ~~~admonish example title="`Python` example to plot beam responses"
diff --git a/src/beam/error.rs b/src/beam/error.rs
index 36df727f..aeba9a45 100644
--- a/src/beam/error.rs
+++ b/src/beam/error.rs
@@ -41,7 +41,7 @@ pub enum BeamError {
     #[error("hyperbeam init error: {0}")]
     HyperbeamInit(#[from] mwa_hyperbeam::fee::InitFEEBeamError),
 
-    #[cfg(feature = "cuda")]
+    #[cfg(any(feature = "cuda", feature = "hip"))]
     #[error(transparent)]
-    Cuda(#[from] crate::cuda::CudaError),
+    Gpu(#[from] crate::gpu::GpuError),
 }
diff --git a/src/beam/fee.rs b/src/beam/fee.rs
index 64c32d55..374ac3c0 100644
--- a/src/beam/fee.rs
+++ b/src/beam/fee.rs
@@ -12,8 +12,8 @@ use ndarray::prelude::*;
 
 use super::{Beam, BeamError, BeamType, Delays};
 
-#[cfg(feature = "cuda")]
-use super::{BeamCUDA, CudaFloat, DevicePointer};
+#[cfg(any(feature = "cuda", feature = "hip"))]
+use super::{BeamGpu, DevicePointer, GpuFloat};
 
 /// A wrapper of the `FEEBeam` struct in hyperbeam that implements the [`Beam`]
 /// trait.
@@ -301,9 +301,9 @@ impl Beam for FEEBeam {
         self.hyperbeam_object.empty_cache();
     }
 
-    #[cfg(feature = "cuda")]
-    fn prepare_cuda_beam(&self, freqs_hz: &[u32]) -> Result<Box<dyn BeamCUDA>, BeamError> {
-        let cuda_beam = unsafe {
+    #[cfg(any(feature = "cuda", feature = "hip"))]
+    fn prepare_gpu_beam(&self, freqs_hz: &[u32]) -> Result<Box<dyn BeamGpu>, BeamError> {
+        let gpu_beam = unsafe {
             self.hyperbeam_object.gpu_prepare(
                 freqs_hz,
                 self.delays.view(),
@@ -311,29 +311,29 @@ impl Beam for FEEBeam {
                 true,
             )?
         };
-        Ok(Box::new(FEEBeamCUDA {
-            hyperbeam_object: cuda_beam,
+        Ok(Box::new(FEEBeamGpu {
+            hyperbeam_object: gpu_beam,
         }))
     }
 }
 
-#[cfg(feature = "cuda")]
-pub(crate) struct FEEBeamCUDA {
+#[cfg(any(feature = "cuda", feature = "hip"))]
+struct FEEBeamGpu {
     hyperbeam_object: mwa_hyperbeam::fee::FEEBeamGpu,
 }
 
-#[cfg(feature = "cuda")]
-impl BeamCUDA for FEEBeamCUDA {
+#[cfg(any(feature = "cuda", feature = "hip"))]
+impl BeamGpu for FEEBeamGpu {
     unsafe fn calc_jones_pair(
         &self,
-        az_rad: &[CudaFloat],
-        za_rad: &[CudaFloat],
+        az_rad: &[GpuFloat],
+        za_rad: &[GpuFloat],
         latitude_rad: f64,
         d_jones: *mut std::ffi::c_void,
     ) -> Result<(), BeamError> {
         let d_az_rad = DevicePointer::copy_to_device(az_rad)?;
         let d_za_rad = DevicePointer::copy_to_device(za_rad)?;
-        let d_array_latitude_rad = DevicePointer::copy_to_device(&[latitude_rad as CudaFloat])?;
+        let d_array_latitude_rad = DevicePointer::copy_to_device(&[latitude_rad as GpuFloat])?;
         self.hyperbeam_object.calc_jones_device_pair_inner(
             d_az_rad.get(),
             d_za_rad.get(),
diff --git a/src/beam/mod.rs b/src/beam/mod.rs
index 6be20284..8029d591 100644
--- a/src/beam/mod.rs
+++ b/src/beam/mod.rs
@@ -28,8 +28,8 @@ use marlu::{AzEl, Jones};
 use ndarray::prelude::*;
 use strum::IntoEnumIterator;
 
-#[cfg(feature = "cuda")]
-use crate::cuda::{CudaFloat, DevicePointer};
+#[cfg(any(feature = "cuda", feature = "hip"))]
+use crate::gpu::{DevicePointer, GpuFloat};
 
 /// Supported beam types.
 #[derive(
@@ -135,33 +135,33 @@ pub trait Beam: Sync + Send {
     /// If this [`Beam`] supports it, empty the coefficient cache.
     fn empty_coeff_cache(&self);
 
-    #[cfg(feature = "cuda")]
+    #[cfg(any(feature = "cuda", feature = "hip"))]
     /// Using the tile information from this [`Beam`] and frequencies to be
-    /// used, return a [`BeamCUDA`]. This object only needs frequencies to
+    /// used, return a [`BeamGpu`]. This object only needs frequencies to
     /// calculate beam response [`Jones`] matrices.
-    fn prepare_cuda_beam(&self, freqs_hz: &[u32]) -> Result<Box<dyn BeamCUDA>, BeamError>;
+    fn prepare_gpu_beam(&self, freqs_hz: &[u32]) -> Result<Box<dyn BeamGpu>, BeamError>;
 }
 
-/// A trait abstracting beam code functions on a CUDA-capable device.
-#[cfg(feature = "cuda")]
-pub trait BeamCUDA {
+/// A trait abstracting beam code functions on a GPU.
+#[cfg(any(feature = "cuda", feature = "hip"))]
+pub trait BeamGpu {
     /// Calculate the Jones matrices for each `az` and `za` direction and
     /// frequency (these were defined when the [`BeamCUDA`] was created). The
     /// results are ordered tile, frequency, direction, slowest to fastest.
     ///
     /// # Safety
     ///
-    /// This function interfaces directly with the CUDA API. Rust errors attempt
-    /// to catch problems but there are no guarantees.
+    /// This function interfaces directly with the CUDA/HIP API. Rust errors
+    /// attempt to catch problems but there are no guarantees.
     unsafe fn calc_jones_pair(
         &self,
-        az_rad: &[CudaFloat],
-        za_rad: &[CudaFloat],
+        az_rad: &[GpuFloat],
+        za_rad: &[GpuFloat],
         latitude_rad: f64,
         d_jones: *mut std::ffi::c_void,
     ) -> Result<(), BeamError>;
 
-    /// Get the type of beam used to create this [`BeamCUDA`].
+    /// Get the type of beam used to create this [`BeamGpu`].
     fn get_beam_type(&self) -> BeamType;
 
     /// Get a pointer to the device tile map. This is necessary to access
@@ -172,11 +172,11 @@ pub trait BeamCUDA {
     /// de-duplicated beam Jones matrices on the device.
     fn get_freq_map(&self) -> *const i32;
 
-    /// Get the number of de-duplicated tiles associated with this [`BeamCUDA`].
+    /// Get the number of de-duplicated tiles associated with this [`BeamGpu`].
     fn get_num_unique_tiles(&self) -> i32;
 
     /// Get the number of de-duplicated frequencies associated with this
-    /// [`BeamCUDA`].
+    /// [`BeamGpu`].
     fn get_num_unique_freqs(&self) -> i32;
 }
 
@@ -323,9 +323,9 @@ impl Beam for NoBeam {
 
     fn empty_coeff_cache(&self) {}
 
-    #[cfg(feature = "cuda")]
-    fn prepare_cuda_beam(&self, freqs_hz: &[u32]) -> Result<Box<dyn BeamCUDA>, BeamError> {
-        let obj = NoBeamCUDA {
+    #[cfg(any(feature = "cuda", feature = "hip"))]
+    fn prepare_gpu_beam(&self, freqs_hz: &[u32]) -> Result<Box<dyn BeamGpu>, BeamError> {
+        let obj = NoBeamGpu {
             tile_map: DevicePointer::copy_to_device(&vec![0; self.num_tiles])?,
             freq_map: DevicePointer::copy_to_device(&vec![0; freqs_hz.len()])?,
         };
@@ -335,27 +335,37 @@ impl Beam for NoBeam {
 
 /// A beam implementation that returns only identity Jones matrices for all beam
 /// calculations.
-#[cfg(feature = "cuda")]
-pub(crate) struct NoBeamCUDA {
+#[cfg(any(feature = "cuda", feature = "hip"))]
+pub(crate) struct NoBeamGpu {
     tile_map: DevicePointer<i32>,
     freq_map: DevicePointer<i32>,
 }
 
-#[cfg(feature = "cuda")]
-impl BeamCUDA for NoBeamCUDA {
+#[cfg(any(feature = "cuda", feature = "hip"))]
+impl BeamGpu for NoBeamGpu {
     unsafe fn calc_jones_pair(
         &self,
-        az_rad: &[CudaFloat],
-        _za_rad: &[CudaFloat],
+        az_rad: &[GpuFloat],
+        _za_rad: &[GpuFloat],
         _latitude_rad: f64,
         d_jones: *mut std::ffi::c_void,
     ) -> Result<(), BeamError> {
-        let identities: Vec<Jones<CudaFloat>> = vec![Jones::identity(); az_rad.len()];
-        cuda_runtime_sys::cudaMemcpy(
+        #[cfg(feature = "cuda")]
+        use cuda_runtime_sys::{
+            cudaMemcpy as gpuMemcpy,
+            cudaMemcpyKind::cudaMemcpyHostToDevice as gpuMemcpyHostToDevice,
+        };
+        #[cfg(feature = "hip")]
+        use hip_sys::hiprt::{
+            hipMemcpy as gpuMemcpy, hipMemcpyKind::hipMemcpyHostToDevice as gpuMemcpyHostToDevice,
+        };
+
+        let identities: Vec<Jones<GpuFloat>> = vec![Jones::identity(); az_rad.len()];
+        gpuMemcpy(
             d_jones,
             identities.as_ptr().cast(),
-            identities.len() * std::mem::size_of::<Jones<CudaFloat>>(),
-            cuda_runtime_sys::cudaMemcpyKind::cudaMemcpyHostToDevice,
+            identities.len() * std::mem::size_of::<Jones<GpuFloat>>(),
+            gpuMemcpyHostToDevice,
         );
         Ok(())
     }
diff --git a/src/beam/tests.rs b/src/beam/tests.rs
index 83e14b68..a4b8a1e2 100644
--- a/src/beam/tests.rs
+++ b/src/beam/tests.rs
@@ -74,8 +74,8 @@ fn fee_beam_values_are_sensible() {
 
 #[test]
 #[serial]
-#[cfg(feature = "cuda")]
-fn fee_cuda_beam_values_are_sensible() {
+#[cfg(any(feature = "cuda", feature = "hip"))]
+fn fee_gpu_beam_values_are_sensible() {
     let delays = Array2::zeros((1, 16));
     let amps = Array2::ones((1, 16));
     let freqs = [150e6 as u32];
@@ -86,7 +86,7 @@ fn fee_cuda_beam_values_are_sensible() {
     ];
     let (azs, zas): (Vec<_>, Vec<_>) = azels
         .iter()
-        .map(|azel| (azel.az as CudaFloat, azel.za() as CudaFloat))
+        .map(|azel| (azel.az as GpuFloat, azel.za() as GpuFloat))
         .unzip();
 
     // Get the beam values right out of hyperbeam.
@@ -100,13 +100,13 @@ fn fee_cuda_beam_values_are_sensible() {
     // Compare these with the hyperdrive `Beam` trait.
     let hyperdrive =
         super::fee::FEEBeam::new_from_env(1, Delays::Full(delays), Some(amps)).unwrap();
-    let hyperdrive = hyperdrive.prepare_cuda_beam(&freqs).unwrap();
+    let hyperdrive = hyperdrive.prepare_gpu_beam(&freqs).unwrap();
     let hyperdrive_values_device = unsafe {
-        let mut hyperdrive_values_device: DevicePointer<Jones<CudaFloat>> = DevicePointer::malloc(
+        let mut hyperdrive_values_device: DevicePointer<Jones<GpuFloat>> = DevicePointer::malloc(
             hyperdrive.get_num_unique_tiles() as usize
                 * hyperdrive.get_num_unique_freqs() as usize
                 * azs.len()
-                * std::mem::size_of::<Jones<CudaFloat>>(),
+                * std::mem::size_of::<Jones<GpuFloat>>(),
         )
         .unwrap();
         hyperdrive
diff --git a/src/cli/beam.rs b/src/cli/beam.rs
index 087d4076..db12b288 100644
--- a/src/cli/beam.rs
+++ b/src/cli/beam.rs
@@ -60,18 +60,18 @@ pub struct BeamArgs {
     #[clap(short, long, default_value = "beam_responses.tsv")]
     output: PathBuf,
 
-    /// Use CUDA to generate the beam responses.
-    #[cfg(feature = "cuda")]
+    /// Use a GPU (i.e. CUDA or HIP) to generate the beam responses.
+    #[cfg(any(feature = "cuda", feature = "hip"))]
     #[clap(short, long)]
-    cuda: bool,
+    gpu: bool,
 }
 
 impl BeamArgs {
     pub(super) fn run(&self) -> Result<(), HyperdriveError> {
         cfg_if::cfg_if! {
-            if #[cfg(feature = "cuda")] {
-                if self.cuda {
-                    calc_cuda(self)
+            if #[cfg(any(feature = "cuda", feature = "hip"))] {
+                if self.gpu {
+                    calc_gpu(self)
                 } else {
                     calc_cpu(self)
                 }
@@ -109,8 +109,8 @@ fn calc_cpu(args: &BeamArgs) -> Result<(), HyperdriveError> {
         max_za,
         step,
         output,
-        #[cfg(feature = "cuda")]
-            cuda: _,
+        #[cfg(any(feature = "cuda", feature = "hip"))]
+            gpu: _,
     } = args;
 
     let beam = create_beam_object(
@@ -137,12 +137,12 @@ fn calc_cpu(args: &BeamArgs) -> Result<(), HyperdriveError> {
     Ok(())
 }
 
-#[cfg(feature = "cuda")]
-fn calc_cuda(args: &BeamArgs) -> Result<(), HyperdriveError> {
+#[cfg(any(feature = "cuda", feature = "hip"))]
+fn calc_gpu(args: &BeamArgs) -> Result<(), HyperdriveError> {
     use itertools::izip;
     use num_complex::Complex;
 
-    use crate::cuda::{CudaFloat, CudaJones, DevicePointer};
+    use crate::gpu::{DevicePointer, GpuFloat, GpuJones};
 
     let BeamArgs {
         beam_type,
@@ -152,7 +152,7 @@ fn calc_cuda(args: &BeamArgs) -> Result<(), HyperdriveError> {
         max_za,
         step,
         output,
-        cuda: _,
+        gpu: _,
     } = args;
 
     let beam = create_beam_object(
@@ -160,20 +160,20 @@ fn calc_cuda(args: &BeamArgs) -> Result<(), HyperdriveError> {
         1,
         Delays::Partial(delays.clone().unwrap_or(vec![0; 16])),
     )?;
-    let cuda_beam = beam.prepare_cuda_beam(&[(freq_mhz * 1e6) as u32])?;
+    let gpu_beam = beam.prepare_gpu_beam(&[(freq_mhz * 1e6) as u32])?;
     let mut out = BufWriter::new(File::create(output)?);
 
     let (azs, zas): (Vec<_>, Vec<_>) =
-        gen_azzas::<CudaFloat>(max_za.to_radians(), step.to_radians()).unzip();
-    let mut d_jones: DevicePointer<CudaJones> = DevicePointer::malloc(
-        cuda_beam.get_num_unique_tiles() as usize
-            * cuda_beam.get_num_unique_freqs() as usize
+        gen_azzas::<GpuFloat>(max_za.to_radians(), step.to_radians()).unzip();
+    let mut d_jones: DevicePointer<GpuJones> = DevicePointer::malloc(
+        gpu_beam.get_num_unique_tiles() as usize
+            * gpu_beam.get_num_unique_freqs() as usize
             * azs.len()
-            * std::mem::size_of::<CudaJones>(),
+            * std::mem::size_of::<GpuJones>(),
     )?;
 
     unsafe {
-        cuda_beam.calc_jones_pair(
+        gpu_beam.calc_jones_pair(
             &azs,
             &zas,
             latitude_deg.to_radians(),
diff --git a/src/cli/common/mod.rs b/src/cli/common/mod.rs
index 617511dc..63946eb3 100644
--- a/src/cli/common/mod.rs
+++ b/src/cli/common/mod.rs
@@ -459,7 +459,7 @@ pub(super) struct ModellingArgs {
 
     /// Use the CPU for visibility generation. This is deliberately made
     /// non-default because using a GPU is much faster.
-    #[cfg(feature = "cuda")]
+    #[cfg(any(feature = "cuda", feature = "hip"))]
     #[clap(long, help_heading = "MODELLING")]
     #[serde(default)]
     pub(super) cpu: bool,
@@ -469,7 +469,7 @@ impl ModellingArgs {
     pub(super) fn merge(self, other: Self) -> Self {
         Self {
             no_precession: self.no_precession || other.no_precession,
-            #[cfg(feature = "cuda")]
+            #[cfg(any(feature = "cuda", feature = "hip"))]
             cpu: self.cpu || other.cpu,
         }
     }
@@ -477,11 +477,11 @@ impl ModellingArgs {
     pub(super) fn parse(self) -> ModellingParams {
         let ModellingArgs {
             no_precession,
-            #[cfg(feature = "cuda")]
+            #[cfg(any(feature = "cuda", feature = "hip"))]
             cpu,
         } = self;
 
-        #[cfg(feature = "cuda")]
+        #[cfg(any(feature = "cuda", feature = "hip"))]
         if cpu {
             MODEL_DEVICE.store(ModelDevice::Cpu);
         }
@@ -495,36 +495,43 @@ impl ModellingArgs {
                 block.push(crate::model::get_cpu_info().into());
             }
 
-            #[cfg(feature = "cuda")]
-            ModelDevice::Cuda => {
+            #[cfg(any(feature = "cuda", feature = "hip"))]
+            ModelDevice::Gpu => {
                 block.push(format!("Using GPU with {} precision", d.get_precision()).into());
-                let (device_info, driver_info) = match crate::cuda::get_device_info() {
+                let (device_info, driver_info) = match crate::gpu::get_device_info() {
                     Ok(i) => i,
                     Err(e) => {
                         // For some reason, despite hyperdrive being compiled
-                        // with the "cuda" feature, we failed to get the device
-                        // info. Maybe there's no CUDA-capable device present.
-                        // Either way, we cannot continue. I'd rather not have
-                        // error handling here because (1) without the "cuda"
+                        // with the "cuda" or "hip" feature, we failed to get
+                        // the device info. Maybe there's no GPU present. Either
+                        // way, we cannot continue. I'd rather not have error
+                        // handling here because (1) without the "cuda" or "hip"
                         // feature, this function will never fail on the CPU
                         // path, so adding error handling means the caller would
                         // have to handle a `Result` uselessly and (2) if this
                         // "petty" display function fails, then we can't use the
                         // GPU for real work anyway.
+                        #[cfg(feature = "cuda")]
                         eprintln!("Couldn't retrieve CUDA device info for device 0, is a device present? {e}");
+                        #[cfg(feature = "hip")]
+                        eprintln!("Couldn't retrieve HIP device info for device 0, is a device present? {e}");
                         std::process::exit(1);
                     }
                 };
+                #[cfg(feature = "cuda")]
+                let device_type = "CUDA";
+                #[cfg(feature = "hip")]
+                let device_type = "HIP";
                 block.push(
                     format!(
-                        "CUDA device: {} (capability {}, {} MiB)",
+                        "{device_type} device: {} (capability {}, {} MiB)",
                         device_info.name, device_info.capability, device_info.total_global_mem
                     )
                     .into(),
                 );
                 block.push(
                     format!(
-                        "CUDA driver: {}, runtime: {}",
+                        "{device_type} driver: {}, runtime: {}",
                         driver_info.driver_version, driver_info.runtime_version
                     )
                     .into(),
diff --git a/src/cli/di_calibrate/error.rs b/src/cli/di_calibrate/error.rs
deleted file mode 100644
index 726d3924..00000000
--- a/src/cli/di_calibrate/error.rs
+++ /dev/null
@@ -1,177 +0,0 @@
-// This Source Code Form is subject to the terms of the Mozilla Public
-// License, v. 2.0. If a copy of the MPL was not distributed with this
-// file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-//! Errors associated with calibration arguments.
-
-use std::path::PathBuf;
-
-use thiserror::Error;
-use vec1::Vec1;
-
-use super::ARG_FILE_TYPES_COMMA_SEPARATED;
-use crate::filenames::SUPPORTED_INPUT_FILE_COMBINATIONS;
-
-/// Errors associated with DI calibration arguments.
-#[derive(Error, Debug)]
-pub(crate) enum DiCalArgsError {
-    #[error("Argument file '{0}' doesn't have a recognised file extension! Valid extensions are: {}", *ARG_FILE_TYPES_COMMA_SEPARATED)]
-    UnrecognisedArgFileExt(String),
-
-    #[error("Couldn't decode toml structure from {file}:\n{err}")]
-    TomlDecode { file: String, err: String },
-
-    #[error("Couldn't decode json structure from {file}:\n{err}")]
-    JsonDecode { file: String, err: String },
-
-    #[error("No input data was given!")]
-    NoInputData,
-
-    #[error("{0}\n\nSupported combinations of file formats:\n{SUPPORTED_INPUT_FILE_COMBINATIONS}")]
-    InvalidDataInput(&'static str),
-
-    #[error("Multiple metafits files were specified: {0:?}\nThis is unsupported.")]
-    MultipleMetafits(Vec1<PathBuf>),
-
-    #[error("Multiple measurement sets were specified: {0:?}\nThis is currently unsupported.")]
-    MultipleMeasurementSets(Vec1<PathBuf>),
-
-    #[error("Multiple uvfits files were specified: {0:?}\nThis is currently unsupported.")]
-    MultipleUvfits(Vec1<PathBuf>),
-
-    #[error("No calibration output was specified. There must be at least one calibration solution file.")]
-    NoOutput,
-
-    #[error("No sky-model source list file supplied")]
-    NoSourceList,
-
-    #[error("Tried to create a beam object, but MWA dipole delay information isn't available!")]
-    NoDelays,
-
-    #[error(
-        "The specified MWA dipole delays aren't valid; there should be 16 values between 0 and 32"
-    )]
-    BadDelays,
-
-    #[error("The data either contains no tiles or all tiles are flagged")]
-    NoTiles,
-
-    #[error("The data either contains no frequency channels or all channels are flagged")]
-    NoChannels,
-
-    #[error(
-        "All baselines were flagged due to UVW cutoffs. Try adjusting the UVW min and/or max."
-    )]
-    AllBaselinesFlaggedFromUvwCutoffs,
-
-    #[error("The data either contains no timesteps or no timesteps are being used")]
-    NoTimesteps,
-
-    #[error("The number of specified sources was 0, or the size of the source list was 0")]
-    NoSources,
-
-    #[error("After vetoing sources, none were left. Decrease the veto threshold, or supply more sources")]
-    NoSourcesAfterVeto,
-
-    #[error("Duplicate timesteps were specified; this is invalid")]
-    DuplicateTimesteps,
-
-    #[error("Timestep {got} was specified but it isn't available; the last timestep is {last}")]
-    UnavailableTimestep { got: usize, last: usize },
-
-    #[error(
-        "Cannot write visibilities to a file type '{ext}'. Supported formats are: {}", *crate::io::write::VIS_OUTPUT_EXTENSIONS
-    )]
-    VisFileType { ext: String },
-
-    #[error(transparent)]
-    TileFlag(#[from] crate::context::InvalidTileFlag),
-
-    #[error("Cannot write calibration solutions to a file type '{ext}'.\nSupported formats are: {}", *crate::solutions::CAL_SOLUTION_EXTENSIONS)]
-    CalibrationOutputFile { ext: String },
-
-    #[error(transparent)]
-    ParsePfbFlavour(#[from] crate::io::read::pfb_gains::PfbParseError),
-
-    #[error("Error when parsing time average factor: {0}")]
-    ParseCalTimeAverageFactor(crate::unit_parsing::UnitParseError),
-
-    #[error("Error when parsing freq. average factor: {0}")]
-    ParseCalFreqAverageFactor(crate::unit_parsing::UnitParseError),
-
-    #[error("Calibration time average factor isn't an integer")]
-    CalTimeFactorNotInteger,
-
-    #[error("Calibration freq. average factor isn't an integer")]
-    CalFreqFactorNotInteger,
-
-    #[error("Calibration time resolution isn't a multiple of input data's: {out} seconds vs {inp} seconds")]
-    CalTimeResNotMultiple { out: f64, inp: f64 },
-
-    #[error("Calibration freq. resolution isn't a multiple of input data's: {out} Hz vs {inp} Hz")]
-    CalFreqResNotMultiple { out: f64, inp: f64 },
-
-    #[error("Calibration time average factor cannot be 0")]
-    CalTimeFactorZero,
-
-    #[error("Calibration freq. average factor cannot be 0")]
-    CalFreqFactorZero,
-
-    #[error("Error when parsing output vis. time average factor: {0}")]
-    ParseOutputVisTimeAverageFactor(crate::unit_parsing::UnitParseError),
-
-    #[error("Error when parsing output vis. freq. average factor: {0}")]
-    ParseOutputVisFreqAverageFactor(crate::unit_parsing::UnitParseError),
-
-    #[error("Output vis. time average factor isn't an integer")]
-    OutputVisTimeFactorNotInteger,
-
-    #[error("Output vis. freq. average factor isn't an integer")]
-    OutputVisFreqFactorNotInteger,
-
-    #[error("Output vis. time average factor cannot be 0")]
-    OutputVisTimeAverageFactorZero,
-
-    #[error("Output vis. freq. average factor cannot be 0")]
-    OutputVisFreqAverageFactorZero,
-
-    #[error("Output vis. time resolution isn't a multiple of input data's: {out} seconds vs {inp} seconds")]
-    OutputVisTimeResNotMultiple { out: f64, inp: f64 },
-
-    #[error("Output vis. freq. resolution isn't a multiple of input data's: {out} Hz vs {inp} Hz")]
-    OutputVisFreqResNotMultiple { out: f64, inp: f64 },
-
-    #[error("Error when parsing minimum UVW cutoff: {0}")]
-    ParseUvwMin(crate::unit_parsing::UnitParseError),
-
-    #[error("Error when parsing maximum UVW cutoff: {0}")]
-    ParseUvwMax(crate::unit_parsing::UnitParseError),
-
-    #[error("Array position specified as {pos:?}, not [<Longitude>, <Latitude>, <Height>]")]
-    BadArrayPosition { pos: Vec<f64> },
-
-    #[error(transparent)]
-    Glob(#[from] crate::io::GlobError),
-
-    #[error(transparent)]
-    VisRead(#[from] crate::io::read::VisReadError),
-
-    #[error(transparent)]
-    FileWrite(#[from] crate::io::write::FileWriteError),
-
-    #[error(transparent)]
-    Veto(#[from] crate::srclist::VetoError),
-
-    #[error("Error when trying to read source list: {0}")]
-    SourceList(#[from] crate::srclist::ReadSourceListError),
-
-    #[error(transparent)]
-    Beam(#[from] crate::beam::BeamError),
-
-    #[error(transparent)]
-    IO(#[from] std::io::Error),
-
-    #[cfg(feature = "cuda")]
-    #[error(transparent)]
-    Cuda(#[from] crate::cuda::CudaError),
-}
diff --git a/src/cli/di_calibrate/tests.rs b/src/cli/di_calibrate/tests.rs
index e8f1a869..c56eef5a 100644
--- a/src/cli/di_calibrate/tests.rs
+++ b/src/cli/di_calibrate/tests.rs
@@ -710,17 +710,17 @@ fn test_1090008640_calibrate_model_ms() {
         // introduced. If a metafits' positions are used instead, the results
         // are *exactly* the same, but we should trust the MS's positions, so
         // these errors must remain.
-        #[cfg(feature = "cuda-single")]
+        #[cfg(feature = "gpu-single")]
         assert_abs_diff_eq!(vis_m, vis_c, epsilon = 2e-4);
-        #[cfg(not(feature = "cuda-single"))]
+        #[cfg(not(feature = "gpu-single"))]
         assert_abs_diff_eq!(vis_m, vis_c, epsilon = 4e-6);
         assert_abs_diff_eq!(weight_m, weight_c);
     }
 
     // Inspect the solutions; they should all be close to identity.
-    #[cfg(feature = "cuda-single")]
+    #[cfg(feature = "gpu-single")]
     let epsilon = 6e-8;
-    #[cfg(not(feature = "cuda-single"))]
+    #[cfg(not(feature = "gpu-single"))]
     let epsilon = 2e-9;
     assert_abs_diff_eq!(
         sols.di_jones,
@@ -772,9 +772,9 @@ fn test_cal_timeblocks() {
     // We didn't specify anything with calibration timeblocks, so this should be
     // 1 (all input data timesteps are used at once in calibration).
     assert_eq!(num_cal_timeblocks, 1);
-    #[cfg(not(feature = "cuda-single"))]
+    #[cfg(not(feature = "gpu-single"))]
     let eps = 0.0; // I am amazed
-    #[cfg(feature = "cuda-single")]
+    #[cfg(feature = "gpu-single")]
     let eps = 2e-8;
     assert_abs_diff_eq!(
         sols.di_jones,
@@ -796,9 +796,9 @@ fn test_cal_timeblocks() {
     let num_cal_timeblocks = sols.di_jones.len_of(Axis(0));
     // 3 / 2 = 1.5 = 2 rounded up
     assert_eq!(num_cal_timeblocks, 2);
-    #[cfg(not(feature = "cuda-single"))]
+    #[cfg(not(feature = "gpu-single"))]
     let eps = 0.0;
-    #[cfg(feature = "cuda-single")]
+    #[cfg(feature = "gpu-single")]
     let eps = 4e-8;
     assert_abs_diff_eq!(
         sols.di_jones,
diff --git a/src/cli/error.rs b/src/cli/error.rs
index 31048d68..fbdf3748 100644
--- a/src/cli/error.rs
+++ b/src/cli/error.rs
@@ -256,8 +256,8 @@ impl From<VisSubtractError> for HyperdriveError {
             VisSubtractError::VisWrite(e) => Self::from(e),
             VisSubtractError::Model(e) => Self::from(e),
             VisSubtractError::IO(e) => Self::from(e),
-            #[cfg(feature = "cuda")]
-            VisSubtractError::Cuda(e) => Self::from(e),
+            #[cfg(any(feature = "cuda", feature = "hip"))]
+            VisSubtractError::Gpu(e) => Self::from(e),
         }
     }
 }
@@ -429,8 +429,8 @@ impl From<BeamError> for HyperdriveError {
             | BeamError::BadTileIndex { .. }
             | BeamError::Hyperbeam(_)
             | BeamError::HyperbeamInit(_) => Self::Beam(s),
-            #[cfg(feature = "cuda")]
-            BeamError::Cuda(_) => Self::Beam(s),
+            #[cfg(any(feature = "cuda", feature = "hip"))]
+            BeamError::Gpu(_) => Self::Beam(s),
         }
     }
 }
@@ -440,8 +440,8 @@ impl From<ModelError> for HyperdriveError {
         match e {
             ModelError::Beam(e) => Self::from(e),
 
-            #[cfg(feature = "cuda")]
-            ModelError::Cuda(e) => Self::from(e),
+            #[cfg(any(feature = "cuda", feature = "hip"))]
+            ModelError::Gpu(e) => Self::from(e),
         }
     }
 }
@@ -464,9 +464,9 @@ impl From<mwalib::MwalibError> for HyperdriveError {
     }
 }
 
-#[cfg(feature = "cuda")]
-impl From<crate::cuda::CudaError> for HyperdriveError {
-    fn from(e: crate::cuda::CudaError) -> Self {
+#[cfg(any(feature = "cuda", feature = "hip"))]
+impl From<crate::gpu::GpuError> for HyperdriveError {
+    fn from(e: crate::gpu::GpuError) -> Self {
         Self::Generic(e.to_string())
     }
 }
diff --git a/src/cli/vis_simulate/tests.rs b/src/cli/vis_simulate/tests.rs
index 760094c8..4d83e5c9 100644
--- a/src/cli/vis_simulate/tests.rs
+++ b/src/cli/vis_simulate/tests.rs
@@ -203,9 +203,9 @@ fn test_1090008640_vis_simulate() {
     assert_abs_diff_eq!(group_params[4] as f64 + jd_zero, 2456860.3406018466);
 
     // The values of the visibilities changes slightly depending on the precision.
-    #[cfg(feature = "cuda-single")]
+    #[cfg(feature = "gpu-single")]
     let epsilon = 2e-4;
-    #[cfg(not(feature = "cuda-single"))]
+    #[cfg(not(feature = "gpu-single"))]
     let epsilon = 0.0;
     assert_abs_diff_eq!(
         vis[0..29],
@@ -288,9 +288,9 @@ fn test_1090008640_vis_simulate() {
     );
     assert_abs_diff_eq!(group_params[4] as f64 + jd_zero, 2456860.3406944424);
 
-    #[cfg(feature = "cuda-single")]
+    #[cfg(feature = "gpu-single")]
     let epsilon = 3e-4;
-    #[cfg(not(feature = "cuda-single"))]
+    #[cfg(not(feature = "gpu-single"))]
     let epsilon = 0.0;
     assert_abs_diff_eq!(
         vis[0..29],
@@ -338,7 +338,7 @@ fn test_1090008640_vis_simulate() {
 // exactly the same.
 #[test]
 #[serial]
-#[cfg(all(feature = "cuda", not(feature = "cuda-single")))]
+#[cfg(all(feature = "cuda", not(feature = "gpu-single")))]
 fn test_1090008640_vis_simulate_cpu_gpu_match() {
     use ndarray::prelude::*;
 
diff --git a/src/cuda/utils.cu b/src/cuda/utils.cu
deleted file mode 100644
index 84ed7f1e..00000000
--- a/src/cuda/utils.cu
+++ /dev/null
@@ -1,40 +0,0 @@
-// This Source Code Form is subject to the terms of the Mozilla Public
-// License, v. 2.0. If a copy of the MPL was not distributed with this
-// file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// "Homegrown" CUDA utilities.
-//
-// As this code contains code derived from an official NVIDIA example
-// (https://github.com/NVIDIA/cuda-samples/blob/master/Samples/1_Utilities/deviceQuery/deviceQuery.cpp),
-// legally, a copyright, list of conditions and disclaimer must be distributed
-// with this code. This should be found in the "cuda" directory of the
-// mwa_hyperdrive git repo, file LICENSE-NVIDIA.
-
-#include <cuda.h>
-
-extern "C" const char *get_cuda_device_info(int device, char name[256], int *device_major, int *device_minor,
-                                            size_t *total_global_mem, int *driver_version, int *runtime_version) {
-    cudaError_t error_id = cudaSetDevice(device);
-    if (error_id != cudaSuccess)
-        return cudaGetErrorString(error_id);
-
-    cudaDeviceProp device_prop;
-    error_id = cudaGetDeviceProperties(&device_prop, device);
-    if (error_id != cudaSuccess)
-        return cudaGetErrorString(error_id);
-
-    memcpy(name, device_prop.name, 256);
-    *device_major = device_prop.major;
-    *device_minor = device_prop.minor;
-    *total_global_mem = device_prop.totalGlobalMem;
-
-    error_id = cudaDriverGetVersion(driver_version);
-    if (error_id != cudaSuccess)
-        return cudaGetErrorString(error_id);
-
-    error_id = cudaRuntimeGetVersion(runtime_version);
-    if (error_id != cudaSuccess)
-        return cudaGetErrorString(error_id);
-
-    return NULL;
-}
diff --git a/src/cuda/common.cuh b/src/gpu/common.cuh
similarity index 63%
rename from src/cuda/common.cuh
rename to src/gpu/common.cuh
index bc39ec1f..90dc777e 100644
--- a/src/cuda/common.cuh
+++ b/src/gpu/common.cuh
@@ -7,32 +7,92 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include <cuComplex.h>
-
 #include "types.h"
 
+// HIP-specific defines.
+#if __HIPCC__
+#define gpuMalloc             hipMalloc
+#define gpuFree               hipFree
+#define gpuMemcpy             hipMemcpy
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuGetErrorString     hipGetErrorString
+#define gpuGetLastError       hipGetLastError
+#define gpuDeviceSynchronize  hipDeviceSynchronize
+#define gpuError_t            hipError_t
+#define gpuSuccess            hipSuccess
+
 // If SINGLE is enabled, use single-precision floats everywhere. Otherwise
 // default to double-precision.
 #ifdef SINGLE
-#define FLOAT4  float4
-#define SINCOS  sincosf
-#define EXP     expf
-#define POW     powf
-#define FLOOR   floorf
-#define COMPLEX cuFloatComplex
-#define CUCONJ  cuConjf
-#define LOG     logf
-#define EXP     expf
+#define FLOAT4       float4
+#define SINCOS       sincosf
+#define EXP          expf
+#define POW          powf
+#define FLOOR        floorf
+#define COMPLEX      hipFloatComplex
+#define MAKE_COMPLEX make_hipFloatComplex
+#define CUCONJ       hipConjf
+#define LOG          logf
+#define EXP          expf
 #else
-#define FLOAT4  double4
-#define SINCOS  sincos
-#define EXP     exp
-#define POW     pow
-#define FLOOR   floor
-#define COMPLEX cuDoubleComplex
-#define CUCONJ  cuConj
-#define LOG     log
-#define EXP     exp
+#define FLOAT4       double4
+#define SINCOS       sincos
+#define EXP          exp
+#define POW          pow
+#define FLOOR        floor
+#define COMPLEX      hipDoubleComplex
+#define MAKE_COMPLEX make_hipDoubleComplex
+#define CUCONJ       hipConj
+#define LOG          log
+#define EXP          exp
+#endif // SINGLE
+
+// CUDA-specific defines.
+#elif __CUDACC__
+
+#define gpuMalloc             cudaMalloc
+#define gpuFree               cudaFree
+#define gpuMemcpy             cudaMemcpy
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuGetErrorString     cudaGetErrorString
+#define gpuGetLastError       cudaGetLastError
+#define gpuDeviceSynchronize  cudaDeviceSynchronize
+#define gpuError_t            cudaError_t
+#define gpuSuccess            cudaSuccess
+#define warpSize              32
+
+#ifdef SINGLE
+#define FLOAT4       float4
+#define SINCOS       sincosf
+#define EXP          expf
+#define POW          powf
+#define FLOOR        floorf
+#define COMPLEX      cuFloatComplex
+#define MAKE_COMPLEX make_cuFloatComplex
+#define CUCONJ       cuConjf
+#define LOG          logf
+#define EXP          expf
+#else
+#define FLOAT4       double4
+#define SINCOS       sincos
+#define EXP          exp
+#define POW          pow
+#define FLOOR        floor
+#define COMPLEX      cuDoubleComplex
+#define MAKE_COMPLEX make_cuDoubleComplex
+#define CUCONJ       cuConj
+#define LOG          log
+#define EXP          exp
+#endif // SINGLE
+#endif // __HIPCC__
+// #define C32 cuFloatComplex
+// #define C64 cuDoubleComplex
+
+#ifdef __CUDACC__
+#include <cuComplex.h>
+#elif __HIPCC__
+#include <hip/hip_complex.h>
+#include <hip/hip_runtime.h>
 #endif
 
 const FLOAT VEL_C = 299792458.0;                           // speed of light in a vacuum
@@ -54,33 +114,17 @@ typedef struct JONES_C {
     COMPLEX j11;
 } JONES_C;
 
-inline __device__ COMPLEX operator+(const COMPLEX a, const COMPLEX b) {
-    return COMPLEX{
-        .x = a.x + b.x,
-        .y = a.y + b.y,
-    };
-}
+inline __device__ COMPLEX operator+(const COMPLEX a, const COMPLEX b) { return MAKE_COMPLEX(a.x + b.x, a.y + b.y); }
 
 inline __device__ COMPLEX operator*(const COMPLEX a, const COMPLEX b) {
-    return COMPLEX{
-        .x = a.x * b.x - a.y * b.y,
-        .y = a.x * b.y + a.y * b.x,
-    };
+    return MAKE_COMPLEX(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);
 }
 
 inline __device__ void operator*=(COMPLEX &a, const COMPLEX b) {
-    a = COMPLEX{
-        .x = a.x * b.x - a.y * b.y,
-        .y = a.x * b.y + a.y * b.x,
-    };
+    a = MAKE_COMPLEX(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);
 }
 
-inline __device__ COMPLEX operator*(const COMPLEX a, const FLOAT b) {
-    return COMPLEX{
-        .x = a.x * b,
-        .y = a.y * b,
-    };
-}
+inline __device__ COMPLEX operator*(const COMPLEX a, const FLOAT b) { return MAKE_COMPLEX(a.x * b, a.y * b); }
 
 inline __device__ void operator+=(COMPLEX &a, const COMPLEX b) {
     a.x += b.x;
diff --git a/src/cuda/compile_flags.txt b/src/gpu/compile_flags.txt
similarity index 93%
rename from src/cuda/compile_flags.txt
rename to src/gpu/compile_flags.txt
index 93689f8a..d5d1b011 100644
--- a/src/cuda/compile_flags.txt
+++ b/src/gpu/compile_flags.txt
@@ -1,5 +1,6 @@
 -I/usr/local/cuda/include
 -I/opt/cuda/include
+-I/opt/rocm/include
 -Wall
 -Wextra
 -Wpedantic
diff --git a/src/cuda/mod.rs b/src/gpu/mod.rs
similarity index 57%
rename from src/cuda/mod.rs
rename to src/gpu/mod.rs
index 58ee9b6b..c61ef90e 100644
--- a/src/cuda/mod.rs
+++ b/src/gpu/mod.rs
@@ -2,7 +2,7 @@
 // License, v. 2.0. If a copy of the MPL was not distributed with this
 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-//! CUDA code to be used by hyperdrive.
+//! GPU code to be used by hyperdrive.
 
 #![allow(non_snake_case)]
 #![allow(clippy::upper_case_acronyms)]
@@ -21,45 +21,67 @@ use thiserror::Error;
 
 pub(crate) use utils::get_device_info;
 
-// Import Rust bindings to the CUDA code specific to the precision we're using,
-// and set corresponding compile-time types.
+// Import Rust bindings to the CUDA/HIP code specific to the precision we're
+// using, and set corresponding compile-time types.
 cfg_if::cfg_if! {
-    if #[cfg(feature = "cuda-single")] {
-        /// f32 (using the "cuda-single" feature)
-        pub(crate) type CudaFloat = f32;
-        pub(crate) type CudaJones = JonesF32;
+    if #[cfg(feature = "gpu-single")] {
+        /// f32 (using the "gpu-single" feature)
+        pub(crate) type GpuFloat = f32;
+        pub(crate) type GpuJones = JonesF32;
 
         include!("types_single.rs");
         include!("model_single.rs");
-    } else if #[cfg(all(feature = "cuda", not(feature = "cuda-single")))] {
-        /// f64 (using the "cuda" feature and not "cuda-single")
-        pub(crate) type CudaFloat = f64;
-        pub(crate) type CudaJones = JonesF64;
+    } else if #[cfg(all(any(feature = "cuda", feature = "hip"), not(feature = "gpu-single")))] {
+        /// f64 (not using "gpu-single")
+        pub(crate) type GpuFloat = f64;
+        pub(crate) type GpuJones = JonesF64;
 
         include!("types_double.rs");
         include!("model_double.rs");
     }
 }
 
-// Ensure that the shapelet constants are the same in the Rust code and CUDA
+// Import CUDA/HIP functions into the same names.
+#[cfg(feature = "cuda")]
+use cuda_runtime_sys::{
+    cudaDeviceSynchronize as gpuDeviceSynchronize, cudaError::cudaSuccess as gpuSuccess,
+    cudaFree as gpuFree, cudaGetErrorString as gpuGetErrorString,
+    cudaGetLastError as gpuGetLastError, cudaMalloc as gpuMalloc, cudaMemcpy as gpuMemcpy,
+    cudaMemcpyKind::cudaMemcpyDeviceToHost as gpuMemcpyDeviceToHost,
+    cudaMemcpyKind::cudaMemcpyHostToDevice as gpuMemcpyHostToDevice,
+};
+#[cfg(feature = "hip")]
+use hip_sys::hiprt::{
+    hipDeviceSynchronize as gpuDeviceSynchronize, hipError_t::hipSuccess as gpuSuccess,
+    hipFree as gpuFree, hipGetErrorString as gpuGetErrorString, hipGetLastError as gpuGetLastError,
+    hipMalloc as gpuMalloc, hipMemcpy as gpuMemcpy,
+    hipMemcpyKind::hipMemcpyDeviceToHost as gpuMemcpyDeviceToHost,
+    hipMemcpyKind::hipMemcpyHostToDevice as gpuMemcpyHostToDevice,
+};
+
+// Ensure that the shapelet constants are the same in the Rust code and GPU
 // code.
 static_assertions::const_assert_eq!(crate::model::shapelets::SBF_L as i32, SBF_L);
 static_assertions::const_assert_eq!(crate::model::shapelets::SBF_N as i32, SBF_N);
-static_assertions::const_assert_eq!(crate::model::shapelets::SBF_C as CudaFloat, SBF_C);
-static_assertions::const_assert_eq!(crate::model::shapelets::SBF_DX as CudaFloat, SBF_DX);
+static_assertions::const_assert_eq!(crate::model::shapelets::SBF_C as GpuFloat, SBF_C);
+static_assertions::const_assert_eq!(crate::model::shapelets::SBF_DX as GpuFloat, SBF_DX);
 
-macro_rules! cuda_kernel_call {
-    ($cuda_fn:path, $($args:expr),* $(,)?) => {{
+macro_rules! gpu_kernel_call {
+    ($gpu_fn:path, $($args:expr),* $(,)?) => {{
         #[allow(unused_unsafe)]
         unsafe {
-            let error_message_ptr = $cuda_fn($($args),*);
+            let error_message_ptr = $gpu_fn($($args),*);
             if error_message_ptr.is_null() {
                 Ok(())
             } else {
-                // Get the CUDA error message behind the pointer.
-                let error_message = std::ffi::CStr::from_ptr(error_message_ptr).to_str().unwrap_or("<cannot read CUDA error string>");
-                let our_error_message = format!("{}: {error_message}", stringify!($cuda_fn));
-                Err(CudaError::Kernel {
+                // Get the GPU error message behind the pointer.
+                let error_message = std::ffi::CStr::from_ptr(error_message_ptr).to_str();
+                #[cfg(feature = "cuda")]
+                let error_message = error_message.unwrap_or("<cannot read CUDA error string>");
+                #[cfg(feature = "hip")]
+                let error_message = error_message.unwrap_or("<cannot read HIP error string>");
+                let our_error_message = format!("{}: {error_message}", stringify!($gpu_fn));
+                Err(GpuError::Kernel {
                     msg: our_error_message.into(),
                     file: file!(),
                     line: line!(),
@@ -68,47 +90,50 @@ macro_rules! cuda_kernel_call {
         }
     }};
 }
-pub(crate) use cuda_kernel_call;
+pub(crate) use gpu_kernel_call;
 
 #[derive(Clone, Copy)]
-pub(crate) enum CudaCall {
+pub(crate) enum GpuCall {
     Malloc,
     CopyToDevice,
     CopyFromDevice,
 }
 
-/// Run [`cuda_runtime_sys::cudaGetLastError`] and
-/// [`cuda_runtime_sys::cudaDeviceSynchronize`]. If either of these calls return
-/// an error, it is converted to a Rust error and returned from this function.
-/// The single argument describes what the just-performed operation was and
-/// makes the returned error a helpful one.
+/// Run [`gpuGetLastError`] and [`gpuDeviceSynchronize`]. If either of these
+/// calls return an error, it is converted to a Rust error and returned from
+/// this function. The single argument describes what the just-performed
+/// operation was and makes the returned error a helpful one.
 ///
 /// # Safety
 ///
-/// This function interfaces directly with the CUDA API. Rust errors attempt to
-/// catch problems but there are no guarantees.
+/// This function interfaces directly with the CUDA/HIP API. Rust errors attempt
+/// to catch problems but there are no guarantees.
 #[track_caller]
-unsafe fn check_for_errors(cuda_call: CudaCall) -> Result<(), CudaError> {
+unsafe fn check_for_errors(gpu_call: GpuCall) -> Result<(), GpuError> {
     // Only do a device sync if we're in debug mode, for performance.
     let debug_mode = matches!(std::env::var("DEBUG").as_deref(), Ok("true"));
     if debug_mode {
-        let code = cuda_runtime_sys::cudaDeviceSynchronize();
-        if code != cuda_runtime_sys::cudaError::cudaSuccess {
-            let c_str = CStr::from_ptr(cuda_runtime_sys::cudaGetErrorString(code));
-            let msg = c_str.to_str().unwrap_or("<cannot read CUDA error string>");
+        let code = gpuDeviceSynchronize();
+        if code != gpuSuccess {
+            let c_str = CStr::from_ptr(gpuGetErrorString(code));
+            let msg = c_str.to_str();
+            #[cfg(feature = "cuda")]
+            let msg = msg.unwrap_or("<cannot read CUDA error string>");
+            #[cfg(feature = "hip")]
+            let msg = msg.unwrap_or("<cannot read HIP error string>");
             let location = Location::caller();
-            return Err(match cuda_call {
-                CudaCall::Malloc => CudaError::Malloc {
+            return Err(match gpu_call {
+                GpuCall::Malloc => GpuError::Malloc {
                     msg: msg.into(),
                     file: location.file(),
                     line: location.line(),
                 },
-                CudaCall::CopyToDevice => CudaError::CopyToDevice {
+                GpuCall::CopyToDevice => GpuError::CopyToDevice {
                     msg: msg.into(),
                     file: location.file(),
                     line: location.line(),
                 },
-                CudaCall::CopyFromDevice => CudaError::CopyFromDevice {
+                GpuCall::CopyFromDevice => GpuError::CopyFromDevice {
                     msg: msg.into(),
                     file: location.file(),
                     line: location.line(),
@@ -117,23 +142,27 @@ unsafe fn check_for_errors(cuda_call: CudaCall) -> Result<(), CudaError> {
         }
     }
 
-    let code = cuda_runtime_sys::cudaGetLastError();
-    if code != cuda_runtime_sys::cudaError::cudaSuccess {
-        let c_str = CStr::from_ptr(cuda_runtime_sys::cudaGetErrorString(code));
-        let msg = c_str.to_str().unwrap_or("<cannot read CUDA error string>");
+    let code = gpuGetLastError();
+    if code != gpuSuccess {
+        let c_str = CStr::from_ptr(gpuGetErrorString(code));
+        let msg = c_str.to_str();
+        #[cfg(feature = "cuda")]
+        let msg = msg.unwrap_or("<cannot read CUDA error string>");
+        #[cfg(feature = "hip")]
+        let msg = msg.unwrap_or("<cannot read HIP error string>");
         let location = Location::caller();
-        return Err(match cuda_call {
-            CudaCall::Malloc => CudaError::Malloc {
+        return Err(match gpu_call {
+            GpuCall::Malloc => GpuError::Malloc {
                 msg: msg.into(),
                 file: location.file(),
                 line: location.line(),
             },
-            CudaCall::CopyToDevice => CudaError::CopyToDevice {
+            GpuCall::CopyToDevice => GpuError::CopyToDevice {
                 msg: msg.into(),
                 file: location.file(),
                 line: location.line(),
             },
-            CudaCall::CopyFromDevice => CudaError::CopyFromDevice {
+            GpuCall::CopyFromDevice => GpuError::CopyFromDevice {
                 msg: msg.into(),
                 file: location.file(),
                 line: location.line(),
@@ -145,7 +174,7 @@ unsafe fn check_for_errors(cuda_call: CudaCall) -> Result<(), CudaError> {
 }
 
 /// A Rust-managed pointer to CUDA device memory. When this is dropped,
-/// [`cuda_runtime_sys::cudaFree`] is called on the pointer.
+/// [`gpuFree`] is called on the pointer.
 #[derive(Debug)]
 pub(crate) struct DevicePointer<T> {
     ptr: *mut T,
@@ -158,7 +187,7 @@ impl<T> Drop for DevicePointer<T> {
     fn drop(&mut self) {
         if !self.ptr.is_null() {
             unsafe {
-                cuda_runtime_sys::cudaFree(self.ptr.cast());
+                gpuFree(self.ptr.cast());
             }
         }
     }
@@ -177,14 +206,14 @@ impl<T> DevicePointer<T> {
 
     /// Allocate a number of bytes on the device.
     #[track_caller]
-    pub(crate) fn malloc(size: usize) -> Result<DevicePointer<T>, CudaError> {
+    pub(crate) fn malloc(size: usize) -> Result<DevicePointer<T>, GpuError> {
         if size == 0 {
             Ok(Self::default())
         } else {
             let mut d_ptr = std::ptr::null_mut();
             unsafe {
-                cuda_runtime_sys::cudaMalloc(&mut d_ptr, size);
-                check_for_errors(CudaCall::Malloc)?;
+                gpuMalloc(&mut d_ptr, size);
+                check_for_errors(GpuCall::Malloc)?;
             }
             Ok(Self {
                 ptr: d_ptr.cast(),
@@ -197,12 +226,12 @@ impl<T> DevicePointer<T> {
     /// is smaller than `self.size`. Note that unlike `libc`'s `remalloc`, if a
     /// new buffer is created, the original bytes are not preserved.
     #[track_caller]
-    pub(crate) fn realloc(&mut self, size: usize) -> Result<(), CudaError> {
+    pub(crate) fn realloc(&mut self, size: usize) -> Result<(), GpuError> {
         if size <= self.size {
             return Ok(());
         }
 
-        // CUDA doesn't provide a realloc, so just make a new `DevicePointer`
+        // CUDA/HIP don't provide a realloc, so just make a new `DevicePointer`
         // and swap it with the old one; the old buffer will be dropped.
         let mut new = Self::malloc(size)?;
         std::mem::swap(self, &mut new);
@@ -212,17 +241,17 @@ impl<T> DevicePointer<T> {
     /// Copy a slice of data to the device. Any type is allowed, and the returned
     /// pointer is to the device memory.
     #[track_caller]
-    pub(crate) fn copy_to_device(v: &[T]) -> Result<DevicePointer<T>, CudaError> {
+    pub(crate) fn copy_to_device(v: &[T]) -> Result<DevicePointer<T>, GpuError> {
         let size = std::mem::size_of_val(v);
         unsafe {
             let mut d_ptr = Self::malloc(size)?;
-            cuda_runtime_sys::cudaMemcpy(
+            gpuMemcpy(
                 d_ptr.get_mut().cast(),
                 v.as_ptr().cast(),
                 size,
-                cuda_runtime_sys::cudaMemcpyKind::cudaMemcpyHostToDevice,
+                gpuMemcpyHostToDevice,
             );
-            check_for_errors(CudaCall::CopyToDevice)?;
+            check_for_errors(GpuCall::CopyToDevice)?;
             Ok(d_ptr)
         }
     }
@@ -231,10 +260,10 @@ impl<T> DevicePointer<T> {
     /// bytes in the `DevicePointer` and `v`. The contents of `v` are
     /// overwritten.
     #[track_caller]
-    pub fn copy_from_device(&self, v: &mut [T]) -> Result<(), CudaError> {
+    pub fn copy_from_device(&self, v: &mut [T]) -> Result<(), GpuError> {
         let location = Location::caller();
         if self.ptr.is_null() {
-            return Err(CudaError::CopyFromDevice {
+            return Err(GpuError::CopyFromDevice {
                 msg: "Attempted to copy data from a null device pointer".into(),
                 file: location.file(),
                 line: location.line(),
@@ -243,7 +272,7 @@ impl<T> DevicePointer<T> {
 
         let size = std::mem::size_of_val(v);
         if size != self.size {
-            return Err(CudaError::CopyFromDevice {
+            return Err(GpuError::CopyFromDevice {
                 msg: format!(
                     "Device buffer size {} is not equal to provided buffer size {size} (length {})",
                     self.size,
@@ -256,13 +285,13 @@ impl<T> DevicePointer<T> {
         }
 
         unsafe {
-            cuda_runtime_sys::cudaMemcpy(
+            gpuMemcpy(
                 v.as_mut_ptr().cast(),
                 self.ptr.cast(),
                 size,
-                cuda_runtime_sys::cudaMemcpyKind::cudaMemcpyDeviceToHost,
+                gpuMemcpyDeviceToHost,
             );
-            check_for_errors(CudaCall::CopyFromDevice)
+            check_for_errors(GpuCall::CopyFromDevice)
         }
     }
 
@@ -271,7 +300,7 @@ impl<T> DevicePointer<T> {
     /// what is already allocated against the pointer, then the buffer is freed
     /// and another is created to fit `v` (i.e. re-alloc).
     #[track_caller]
-    pub(crate) fn overwrite(&mut self, v: &[T]) -> Result<(), CudaError> {
+    pub(crate) fn overwrite(&mut self, v: &[T]) -> Result<(), GpuError> {
         // Nothing to do if the collection is empty.
         if v.is_empty() {
             return Ok(());
@@ -280,22 +309,27 @@ impl<T> DevicePointer<T> {
         let size = std::mem::size_of_val(v);
         self.realloc(size)?;
         unsafe {
-            cuda_runtime_sys::cudaMemcpy(
+            gpuMemcpy(
                 self.get_mut() as *mut c_void,
                 v.as_ptr().cast(),
                 size,
-                cuda_runtime_sys::cudaMemcpyKind::cudaMemcpyHostToDevice,
+                gpuMemcpyHostToDevice,
             );
-            check_for_errors(CudaCall::CopyToDevice)
+            check_for_errors(GpuCall::CopyToDevice)
         }
     }
 
     /// Clear all of the bytes in the buffer by writing zeros.
     #[cfg(test)]
     pub(crate) fn clear(&mut self) {
+        #[cfg(feature = "cuda")]
+        use cuda_runtime_sys::cudaMemset as gpuMemset;
+        #[cfg(feature = "hip")]
+        use hip_sys::hiprt::hipMemset as gpuMemset;
+
         unsafe {
             if self.size > 0 {
-                cuda_runtime_sys::cudaMemset(self.get_mut().cast(), 0, self.size);
+                gpuMemset(self.get_mut().cast(), 0, self.size);
             }
         }
     }
@@ -305,10 +339,10 @@ impl<T: Default> DevicePointer<T> {
     /// Copy a slice of data from the device. There must be an equal number of
     /// bytes in the `DevicePointer` and `v`.
     #[track_caller]
-    pub fn copy_from_device_new(&self) -> Result<Vec<T>, CudaError> {
+    pub fn copy_from_device_new(&self) -> Result<Vec<T>, GpuError> {
         if self.ptr.is_null() {
             let location = Location::caller();
-            return Err(CudaError::CopyFromDevice {
+            return Err(GpuError::CopyFromDevice {
                 msg: "Attempted to copy data from a null device pointer".into(),
                 file: location.file(),
                 line: location.line(),
@@ -319,13 +353,13 @@ impl<T: Default> DevicePointer<T> {
         v.resize_with(self.size / std::mem::size_of::<T>(), || T::default());
 
         unsafe {
-            cuda_runtime_sys::cudaMemcpy(
+            gpuMemcpy(
                 v.as_mut_ptr().cast(),
                 self.ptr.cast(),
                 self.size,
-                cuda_runtime_sys::cudaMemcpyKind::cudaMemcpyDeviceToHost,
+                gpuMemcpyDeviceToHost,
             );
-            check_for_errors(CudaCall::CopyFromDevice)?;
+            check_for_errors(GpuCall::CopyFromDevice)?;
         }
 
         Ok(v)
@@ -342,7 +376,8 @@ impl<T> Default for DevicePointer<T> {
 }
 
 #[derive(Error, Debug)]
-pub enum CudaError {
+pub enum GpuError {
+    #[cfg(feature = "cuda")]
     #[error("{file}:{line}: cudaMemcpy to device failed: {msg}")]
     CopyToDevice {
         msg: Box<str>,
@@ -350,6 +385,15 @@ pub enum CudaError {
         line: u32,
     },
 
+    #[cfg(feature = "hip")]
+    #[error("{file}:{line}: hipMemcpy to device failed: {msg}")]
+    CopyToDevice {
+        msg: Box<str>,
+        file: &'static str,
+        line: u32,
+    },
+
+    #[cfg(feature = "cuda")]
     #[error("{file}:{line}: cudaMemcpy from device failed: {msg}")]
     CopyFromDevice {
         msg: Box<str>,
@@ -357,6 +401,15 @@ pub enum CudaError {
         line: u32,
     },
 
+    #[cfg(feature = "hip")]
+    #[error("{file}:{line}: hipMemcpy from device failed: {msg}")]
+    CopyFromDevice {
+        msg: Box<str>,
+        file: &'static str,
+        line: u32,
+    },
+
+    #[cfg(feature = "cuda")]
     #[error("{file}:{line}: cudaMalloc error: {msg}")]
     Malloc {
         msg: Box<str>,
@@ -364,6 +417,15 @@ pub enum CudaError {
         line: u32,
     },
 
+    #[cfg(feature = "hip")]
+    #[error("{file}:{line}: hipMalloc error: {msg}")]
+    Malloc {
+        msg: Box<str>,
+        file: &'static str,
+        line: u32,
+    },
+
+    #[cfg(feature = "cuda")]
     #[error("{file}:{line}: CUDA kernel error: {msg}")]
     Kernel {
         msg: Box<str>,
@@ -371,6 +433,23 @@ pub enum CudaError {
         line: u32,
     },
 
+    #[cfg(feature = "hip")]
+    #[error("{file}:{line}: HIP kernel error: {msg}")]
+    Kernel {
+        msg: Box<str>,
+        file: &'static str,
+        line: u32,
+    },
+
+    #[cfg(feature = "cuda")]
+    #[error("{file}:{line}: {msg}")]
+    Generic {
+        msg: Box<str>,
+        file: &'static str,
+        line: u32,
+    },
+
+    #[cfg(feature = "hip")]
     #[error("{file}:{line}: {msg}")]
     Generic {
         msg: Box<str>,
@@ -379,7 +458,7 @@ pub enum CudaError {
     },
 }
 
-// Suppress warnings for unused CUDA shapelet consts.
+// Suppress warnings for unused GPU shapelet consts.
 mod unused {
     #[allow(unused)]
     fn unused() {
diff --git a/src/cuda/model.cu b/src/gpu/model.cu
similarity index 95%
rename from src/cuda/model.cu
rename to src/gpu/model.cu
index 322f89ec..c6f39934 100644
--- a/src/cuda/model.cu
+++ b/src/gpu/model.cu
@@ -4,8 +4,6 @@
 
 #include <stdlib.h>
 
-#include <cuComplex.h>
-
 #include "common.cuh"
 #include "model.h"
 #include "types.h"
@@ -64,10 +62,7 @@ inline __device__ COMPLEX get_shapelet_envelope(const GaussianParams g_params, c
     int x_pos_int = x_pos < 0 ? 0 : (int)FLOOR(x_pos);
     int y_pos_int = y_pos < 0 ? 0 : (int)FLOOR(y_pos);
 
-    COMPLEX envelope = COMPLEX{
-        .x = 0.0,
-        .y = 0.0,
-    };
+    COMPLEX envelope = MAKE_COMPLEX(0.0, 0.0);
     for (int i_coeff = 0; i_coeff < num_coeffs; i_coeff++) {
         const ShapeletCoeff coeff = coeffs[i_coeff];
 
@@ -92,10 +87,7 @@ inline __device__ COMPLEX get_shapelet_envelope(const GaussianParams g_params, c
         //
         // The following is my attempt at doing this efficiently.
         int i_power_index = (int)((coeff.n1 + coeff.n2) % 4);
-        COMPLEX i_power = COMPLEX{
-            .x = I_POWERS_REAL[i_power_index],
-            .y = I_POWERS_IMAG[i_power_index],
-        };
+        COMPLEX i_power = MAKE_COMPLEX(I_POWERS_REAL[i_power_index], I_POWERS_IMAG[i_power_index]);
 
         envelope += i_power * rest;
     }
@@ -395,16 +387,16 @@ extern "C" const char *model_points(const Points *comps, const Addresses *a, con
                                                a->d_tile_map, a->d_freq_map, a->num_unique_beam_freqs,
                                                a->d_tile_index_to_unflagged_tile_index_map, d_vis_fb);
 
-    cudaError_t error_id;
+    gpuError_t error_id;
 #ifdef DEBUG
-    error_id = cudaDeviceSynchronize();
-    if (error_id != cudaSuccess) {
-        return cudaGetErrorString(error_id);
+    error_id = gpuDeviceSynchronize();
+    if (error_id != gpuSuccess) {
+        return gpuGetErrorString(error_id);
     }
 #endif
-    error_id = cudaGetLastError();
-    if (error_id != cudaSuccess) {
-        return cudaGetErrorString(error_id);
+    error_id = gpuGetLastError();
+    if (error_id != gpuSuccess) {
+        return gpuGetErrorString(error_id);
     }
 
     return NULL;
@@ -421,16 +413,16 @@ extern "C" const char *model_gaussians(const Gaussians *comps, const Addresses *
                                                   d_beam_jones, a->d_tile_map, a->d_freq_map, a->num_unique_beam_freqs,
                                                   a->d_tile_index_to_unflagged_tile_index_map, d_vis_fb);
 
-    cudaError_t error_id;
+    gpuError_t error_id;
 #ifdef DEBUG
-    error_id = cudaDeviceSynchronize();
-    if (error_id != cudaSuccess) {
-        return cudaGetErrorString(error_id);
+    error_id = gpuDeviceSynchronize();
+    if (error_id != gpuSuccess) {
+        return gpuGetErrorString(error_id);
     }
 #endif
-    error_id = cudaGetLastError();
-    if (error_id != cudaSuccess) {
-        return cudaGetErrorString(error_id);
+    error_id = gpuGetLastError();
+    if (error_id != gpuSuccess) {
+        return gpuGetErrorString(error_id);
     }
 
     return NULL;
@@ -447,16 +439,16 @@ extern "C" const char *model_shapelets(const Shapelets *comps, const Addresses *
         a->num_freqs, a->num_baselines, a->d_freqs, d_uvws, *comps, a->d_shapelet_basis_values, d_beam_jones,
         a->d_tile_map, a->d_freq_map, a->num_unique_beam_freqs, a->d_tile_index_to_unflagged_tile_index_map, d_vis_fb);
 
-    cudaError_t error_id;
+    gpuError_t error_id;
 #ifdef DEBUG
-    error_id = cudaDeviceSynchronize();
-    if (error_id != cudaSuccess) {
-        return cudaGetErrorString(error_id);
+    error_id = gpuDeviceSynchronize();
+    if (error_id != gpuSuccess) {
+        return gpuGetErrorString(error_id);
     }
 #endif
-    error_id = cudaGetLastError();
-    if (error_id != cudaSuccess) {
-        return cudaGetErrorString(error_id);
+    error_id = gpuGetLastError();
+    if (error_id != gpuSuccess) {
+        return gpuGetErrorString(error_id);
     }
 
     return NULL;
diff --git a/src/cuda/model.h b/src/gpu/model.h
similarity index 100%
rename from src/cuda/model.h
rename to src/gpu/model.h
diff --git a/src/cuda/model_double.rs b/src/gpu/model_double.rs
similarity index 100%
rename from src/cuda/model_double.rs
rename to src/gpu/model_double.rs
diff --git a/src/cuda/model_single.rs b/src/gpu/model_single.rs
similarity index 100%
rename from src/cuda/model_single.rs
rename to src/gpu/model_single.rs
diff --git a/src/cuda/tests.rs b/src/gpu/tests.rs
similarity index 78%
rename from src/cuda/tests.rs
rename to src/gpu/tests.rs
index 48386ea0..e46569b7 100644
--- a/src/cuda/tests.rs
+++ b/src/gpu/tests.rs
@@ -25,12 +25,18 @@ fn copy_to_and_from_device_succeeds() {
 
 #[test]
 #[serial]
-fn cuda_malloc_huge_fails() {
+fn gpu_malloc_huge_fails() {
     let size = 1024_usize.pow(4); // 1 TB;
-    let result: Result<DevicePointer<u8>, CudaError> = DevicePointer::malloc(size);
+    let result: Result<DevicePointer<u8>, GpuError> = DevicePointer::malloc(size);
     assert!(result.is_err());
-    let err = result.unwrap_err();
-    assert!(err.to_string().ends_with("cudaMalloc error: out of memory"));
+    let err = result.unwrap_err().to_string();
+    #[cfg(feature = "cuda")]
+    assert!(err.ends_with("cudaMalloc error: out of memory"), "{err}");
+    #[cfg(feature = "hip")]
+    assert!(
+        err.contains("hipMalloc error"),
+        "Error string wasn't expected; got: {err}"
+    );
 }
 
 #[test]
@@ -43,7 +49,10 @@ fn copy_from_non_existent_pointer_fails() {
     let result = d_ptr.copy_from_device(&mut dest);
     assert!(result.is_err());
     let err = result.unwrap_err().to_string();
+    #[cfg(feature = "cuda")]
     assert!(err.contains("cudaMemcpy from device failed"));
+    #[cfg(feature = "hip")]
+    assert!(err.contains("hipMemcpy from device failed"));
     assert!(err.contains("Attempted to copy data from a null device pointer"));
 }
 
diff --git a/src/cuda/types.h b/src/gpu/types.h
similarity index 100%
rename from src/cuda/types.h
rename to src/gpu/types.h
diff --git a/src/cuda/types_double.rs b/src/gpu/types_double.rs
similarity index 100%
rename from src/cuda/types_double.rs
rename to src/gpu/types_double.rs
diff --git a/src/cuda/types_single.rs b/src/gpu/types_single.rs
similarity index 100%
rename from src/cuda/types_single.rs
rename to src/gpu/types_single.rs
diff --git a/src/cuda/update_rust_bindings.sh b/src/gpu/update_rust_bindings.sh
similarity index 90%
rename from src/cuda/update_rust_bindings.sh
rename to src/gpu/update_rust_bindings.sh
index 0adb84a5..5070e37d 100755
--- a/src/cuda/update_rust_bindings.sh
+++ b/src/gpu/update_rust_bindings.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-# Update the Rust bindings to CUDA code. This script must be run whenever the C
-# headers for CUDA code change.
+# Update the Rust bindings to GPU code. This script must be run whenever the C
+# headers for GPU code change.
 
 # This script requires bindgen. This can be provided by a package manager or
 # installed with "cargo install bindgen".
@@ -10,7 +10,7 @@
 SCRIPTPATH="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)"
 
 bindgen "${SCRIPTPATH}"/utils.h \
-    --allowlist-function "get_cuda_device_info" \
+    --allowlist-function "get_gpu_device_info" \
     > "${SCRIPTPATH}"/utils_bindings.rs
 
 for PRECISION in SINGLE DOUBLE; do
diff --git a/src/gpu/utils.cu b/src/gpu/utils.cu
new file mode 100644
index 00000000..dc77142a
--- /dev/null
+++ b/src/gpu/utils.cu
@@ -0,0 +1,68 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// "Homegrown" GPU utilities.
+//
+// As this code contains code derived from an official NVIDIA example
+// (https://github.com/NVIDIA/cuda-samples/blob/master/Samples/1_Utilities/deviceQuery/deviceQuery.cpp),
+// legally, a copyright, list of conditions and disclaimer must be distributed
+// with this code. This should be found in the root directory of the
+// mwa_hyperdrive git repo, file LICENSE-NVIDIA.
+
+// HIP-specific defines.
+#if __HIPCC__
+#define gpuDeviceProp          hipDeviceProp_t
+#define gpuError_t             hipError_t
+#define gpuDriverGetVersion    hipDriverGetVersion
+#define gpuGetDeviceProperties hipGetDeviceProperties
+#define gpuGetErrorString      hipGetErrorString
+#define gpuRuntimeGetVersion   hipRuntimeGetVersion
+#define gpuSetDevice           hipSetDevice
+#define gpuSuccess             hipSuccess
+
+// CUDA-specific defines.
+#elif __CUDACC__
+#define gpuDeviceProp          cudaDeviceProp
+#define gpuError_t             cudaError_t
+#define gpuDriverGetVersion    cudaDriverGetVersion
+#define gpuGetDeviceProperties cudaGetDeviceProperties
+#define gpuGetErrorString      cudaGetErrorString
+#define gpuRuntimeGetVersion   cudaRuntimeGetVersion
+#define gpuSetDevice           cudaSetDevice
+#define gpuSuccess             cudaSuccess
+#endif // __HIPCC__
+
+#ifdef __CUDACC__
+#include <cuda.h>
+#elif __HIPCC__
+#include <hip/hip_complex.h>
+#include <hip/hip_runtime.h>
+#endif
+
+extern "C" const char *get_gpu_device_info(int device, char name[256], int *device_major, int *device_minor,
+                                           size_t *total_global_mem, int *driver_version, int *runtime_version) {
+    gpuError_t error_id = gpuSetDevice(device);
+    if (error_id != gpuSuccess)
+        return gpuGetErrorString(error_id);
+
+    gpuDeviceProp device_prop;
+    error_id = gpuGetDeviceProperties(&device_prop, device);
+    if (error_id != gpuSuccess)
+        return gpuGetErrorString(error_id);
+
+    memcpy(name, device_prop.name, 256);
+    *device_major = device_prop.major;
+    *device_minor = device_prop.minor;
+    *total_global_mem = device_prop.totalGlobalMem;
+
+    error_id = gpuDriverGetVersion(driver_version);
+    if (error_id != gpuSuccess)
+        return gpuGetErrorString(error_id);
+
+    error_id = gpuRuntimeGetVersion(runtime_version);
+    if (error_id != gpuSuccess)
+        return gpuGetErrorString(error_id);
+
+    return NULL;
+}
diff --git a/src/cuda/utils.h b/src/gpu/utils.h
similarity index 77%
rename from src/cuda/utils.h
rename to src/gpu/utils.h
index 3eb1d9df..862b0971 100644
--- a/src/cuda/utils.h
+++ b/src/gpu/utils.h
@@ -3,7 +3,7 @@
 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 /**
- * Utilities for CUDA devices.
+ * Utilities for CUDA/HIP devices.
  */
 
 #pragma once
@@ -25,8 +25,8 @@ extern "C" {
  * code. This should be found in the root of the mwa_hyperdrive git repo, file
  * LICENSE-NVIDIA.
  */
-const char *get_cuda_device_info(int device, char name[256], int *device_major, int *device_minor,
-                                 size_t *total_global_mem, int *driver_version, int *runtime_version);
+const char *get_gpu_device_info(int device, char name[256], int *device_major, int *device_minor,
+                                size_t *total_global_mem, int *driver_version, int *runtime_version);
 
 #ifdef __cplusplus
 } // extern "C"
diff --git a/src/cuda/utils.rs b/src/gpu/utils.rs
similarity index 52%
rename from src/cuda/utils.rs
rename to src/gpu/utils.rs
index 55b110a1..c4e31eb6 100644
--- a/src/cuda/utils.rs
+++ b/src/gpu/utils.rs
@@ -2,7 +2,7 @@
 // License, v. 2.0. If a copy of the MPL was not distributed with this
 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-//! Utilities for CUDA devices.
+//! Utilities for CUDA/HIP devices.
 //!
 //! We assume that everything is UTF-8.
 
@@ -13,28 +13,27 @@ use std::{
     panic::Location,
 };
 
-use super::CudaError;
+use super::GpuError;
 
 #[derive(Debug, Clone)]
-pub(crate) struct CudaDriverInfo {
-    /// Formatted CUDA driver version, e.g. "11.7".
+pub(crate) struct GpuDriverInfo {
+    /// Formatted CUDA/HIP driver version, e.g. "11.7".
     pub(crate) driver_version: Box<str>,
-    /// Formatted CUDA runtime version, e.g. "11.7".
+    /// Formatted CUDA/HIP runtime version, e.g. "11.7".
     pub(crate) runtime_version: Box<str>,
 }
 
 #[derive(Debug, Clone)]
-pub(crate) struct CudaDeviceInfo {
+pub(crate) struct GpuDeviceInfo {
     pub(crate) name: Box<str>,
     pub(crate) capability: Box<str>,
     /// \[MebiBytes (MiB)\]
     pub(crate) total_global_mem: usize,
 }
 
-/// Get CUDA device and driver information. At present, this function only
+/// Get CUDA/HIP device and driver information. At present, this function only
 /// returns information on "device 0".
-#[track_caller]
-pub(crate) fn get_device_info() -> Result<(CudaDeviceInfo, CudaDriverInfo), CudaError> {
+pub(crate) fn get_device_info() -> Result<(GpuDeviceInfo, GpuDriverInfo), GpuError> {
     unsafe {
         // TODO: Always assume we're using device 0 for now.
         let device = 0;
@@ -44,7 +43,7 @@ pub(crate) fn get_device_info() -> Result<(CudaDeviceInfo, CudaDriverInfo), Cuda
         let mut total_global_mem = 0;
         let mut driver_version = 0;
         let mut runtime_version = 0;
-        let error_message_ptr = get_cuda_device_info(
+        let error_message_ptr = get_gpu_device_info(
             device,
             name,
             &mut device_major,
@@ -54,38 +53,60 @@ pub(crate) fn get_device_info() -> Result<(CudaDeviceInfo, CudaDriverInfo), Cuda
             &mut runtime_version,
         );
         if !error_message_ptr.is_null() {
-            // Get the CUDA error message behind the pointer.
-            let error_message = CStr::from_ptr(error_message_ptr)
-                .to_str()
-                .unwrap_or("<cannot read CUDA error string>");
+            // Get the CUDA/HIP error message behind the pointer.
+            let error_message = CStr::from_ptr(error_message_ptr).to_str();
+            #[cfg(feature = "cuda")]
+            let error_message = error_message.unwrap_or("<cannot read CUDA error string>");
+            #[cfg(feature = "hip")]
+            let error_message = error_message.unwrap_or("<cannot read HIP error string>");
             let location = Location::caller();
-            return Err(CudaError::Generic {
+            return Err(GpuError::Generic {
                 msg: error_message.into(),
                 file: location.file(),
                 line: location.line(),
             });
         }
 
-        let device_info = CudaDeviceInfo {
+        let device_info = GpuDeviceInfo {
             name: CString::from_raw(name)
                 .to_str()
-                .expect("CUDA device name isn't UTF-8")
+                .expect("GPU device name isn't UTF-8")
                 .to_string()
                 .into_boxed_str(),
             capability: format!("{device_major}.{device_minor}").into_boxed_str(),
             total_global_mem: total_global_mem / 1048576,
         };
 
-        let driver_version = format!("{}.{}", driver_version / 1000, (driver_version / 10) % 100);
-        let runtime_version = format!(
-            "{}.{}",
-            runtime_version / 1000,
-            (runtime_version / 10) % 100
-        );
+        #[cfg(feature = "cuda")]
+        let (driver_version, runtime_version) = {
+            let d = format!("{}.{}", driver_version / 1000, (driver_version / 10) % 100);
+            let r = format!(
+                "{}.{}",
+                runtime_version / 1000,
+                (runtime_version / 10) % 100
+            );
+            (d, r)
+        };
+        #[cfg(feature = "hip")]
+        let (driver_version, runtime_version) = {
+            // This isn't documented, but is the only thing that makes sense to
+            // me.
+            let d = format!(
+                "{}.{}",
+                driver_version / 10_000_000,
+                (driver_version / 10_000) % 100
+            );
+            let r = format!(
+                "{}.{}",
+                runtime_version / 10_000_000,
+                (runtime_version / 10_000) % 100
+            );
+            (d, r)
+        };
 
         Ok((
             device_info,
-            CudaDriverInfo {
+            GpuDriverInfo {
                 driver_version: driver_version.into_boxed_str(),
                 runtime_version: runtime_version.into_boxed_str(),
             },
diff --git a/src/cuda/utils_bindings.rs b/src/gpu/utils_bindings.rs
similarity index 96%
rename from src/cuda/utils_bindings.rs
rename to src/gpu/utils_bindings.rs
index 62f3c693..070f24e3 100644
--- a/src/cuda/utils_bindings.rs
+++ b/src/gpu/utils_bindings.rs
@@ -2,7 +2,7 @@
 
 extern "C" {
     #[doc = " A \"watered-down\" version of the CUDA example \"deviceQuery\".\n\n See the full example at:\nhttps://github.com/NVIDIA/cuda-samples/blob/master/Samples/1_Utilities/deviceQuery/deviceQuery.cpp\n\n As this code contains code derived from an official NVIDIA example, legally,\n a copyright, list of conditions and disclaimer must be distributed with this\n code. This should be found in the root of the mwa_hyperdrive git repo, file\n LICENSE-NVIDIA."]
-    pub fn get_cuda_device_info(
+    pub fn get_gpu_device_info(
         device: ::std::os::raw::c_int,
         name: *mut ::std::os::raw::c_char,
         device_major: *mut ::std::os::raw::c_int,
diff --git a/src/lib.rs b/src/lib.rs
index b5b99319..375e34c0 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -24,8 +24,8 @@ mod solutions;
 pub mod srclist;
 mod unit_parsing;
 
-#[cfg(feature = "cuda")]
-mod cuda;
+#[cfg(any(feature = "cuda", feature = "hip"))]
+mod gpu;
 
 #[cfg(test)]
 mod tests;
@@ -40,8 +40,8 @@ lazy_static::lazy_static! {
     /// This should only ever be changed from its default by CLI code.
     static ref MODEL_DEVICE: AtomicCell<model::ModelDevice> = {
         cfg_if::cfg_if! {
-            if #[cfg(feature = "cuda")] {
-                AtomicCell::new(ModelDevice::Cuda)
+            if #[cfg(any(feature = "cuda", feature = "hip"))] {
+                AtomicCell::new(ModelDevice::Gpu)
             } else {
                 AtomicCell::new(ModelDevice::Cpu)
             }
diff --git a/src/model/error.rs b/src/model/error.rs
index 810ce048..6a861b5f 100644
--- a/src/model/error.rs
+++ b/src/model/error.rs
@@ -11,7 +11,7 @@ pub enum ModelError {
     #[error(transparent)]
     Beam(#[from] crate::beam::BeamError),
 
-    #[cfg(feature = "cuda")]
+    #[cfg(any(feature = "cuda", feature = "hip"))]
     #[error(transparent)]
-    Cuda(#[from] crate::cuda::CudaError),
+    Gpu(#[from] crate::gpu::GpuError),
 }
diff --git a/src/model/cuda.rs b/src/model/gpu.rs
similarity index 75%
rename from src/model/cuda.rs
rename to src/model/gpu.rs
index 0676103f..f6da1295 100644
--- a/src/model/cuda.rs
+++ b/src/model/gpu.rs
@@ -2,7 +2,7 @@
 // License, v. 2.0. If a copy of the MPL was not distributed with this
 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-//! Code to generate sky-model visibilities with CUDA.
+//! Code to generate sky-model visibilities with CUDA/HIP.
 
 use std::{borrow::Cow, collections::HashSet};
 
@@ -17,9 +17,9 @@ use ndarray::prelude::*;
 
 use super::{mask_pols, shapelets, ModelError, SkyModeller};
 use crate::{
-    beam::{Beam, BeamCUDA},
+    beam::{Beam, BeamGpu},
     context::Polarisations,
-    cuda::{self, cuda_kernel_call, CudaError, CudaFloat, CudaJones, DevicePointer},
+    gpu::{self, gpu_kernel_call, DevicePointer, GpuError, GpuFloat, GpuJones},
     srclist::{
         get_instrumental_flux_densities, ComponentType, FluxDensityType, ShapeletCoeff, SourceList,
     },
@@ -28,9 +28,9 @@ use crate::{
 /// The first axis of `*_list_fds` is unflagged fine channel frequency, the
 /// second is the source component. The length of `hadecs`, `lmns`,
 /// `*_list_fds`'s second axis are the same.
-pub struct SkyModellerCuda<'a> {
+pub struct SkyModellerGpu<'a> {
     /// The trait object to use for beam calculations.
-    cuda_beam: Box<dyn BeamCUDA>,
+    gpu_beam: Box<dyn BeamGpu>,
 
     /// The phase centre used for all modelling.
     phase_centre: RADec,
@@ -41,7 +41,7 @@ pub struct SkyModellerCuda<'a> {
     array_latitude: f64,
     /// The UT1 - UTC offset. If this is 0, effectively UT1 == UTC, which is a
     /// wrong assumption by up to 0.9s. We assume the this value does not change
-    /// over the timestamps given to this `SkyModellerCuda`.
+    /// over the timestamps given to this `SkyModellerGpu`.
     dut1: Duration,
     /// Shift baselines, LSTs and array latitudes back to J2000.
     precess: bool,
@@ -55,85 +55,85 @@ pub struct SkyModellerCuda<'a> {
     pols: Polarisations,
 
     /// A simple map from an absolute tile index into an unflagged tile index.
-    /// This is important because CUDA will use tile indices from 0 to the
+    /// This is important because CUDA/HIP will use tile indices from 0 to the
     /// length of `unflagged_tile_xyzs`, but the beam code has dipole delays and
     /// dipole gains available for *all* tiles. So if tile 32 is flagged, any
-    /// CUDA thread with a tile index of 32 would naively get the flagged beam
-    /// info. This map would make tile index go to the next unflagged tile,
+    /// CUDA/HIP thread with a tile index of 32 would naively get the flagged
+    /// beam info. This map would make tile index go to the next unflagged tile,
     /// perhaps 33.
     tile_index_to_unflagged_tile_index_map: DevicePointer<i32>,
 
-    d_freqs: DevicePointer<CudaFloat>,
-    d_shapelet_basis_values: DevicePointer<CudaFloat>,
+    d_freqs: DevicePointer<GpuFloat>,
+    d_shapelet_basis_values: DevicePointer<GpuFloat>,
 
     point_power_law_radecs: Vec<RADec>,
-    point_power_law_lmns: DevicePointer<cuda::LmnRime>,
+    point_power_law_lmns: DevicePointer<gpu::LmnRime>,
     /// Instrumental flux densities calculated at 150 MHz.
-    point_power_law_fds: DevicePointer<CudaJones>,
+    point_power_law_fds: DevicePointer<GpuJones>,
     /// Spectral indices.
-    point_power_law_sis: DevicePointer<CudaFloat>,
+    point_power_law_sis: DevicePointer<GpuFloat>,
 
     point_curved_power_law_radecs: Vec<RADec>,
-    point_curved_power_law_lmns: DevicePointer<cuda::LmnRime>,
-    pub(super) point_curved_power_law_fds: DevicePointer<CudaJones>,
-    pub(super) point_curved_power_law_sis: DevicePointer<CudaFloat>,
-    point_curved_power_law_qs: DevicePointer<CudaFloat>,
+    point_curved_power_law_lmns: DevicePointer<gpu::LmnRime>,
+    pub(super) point_curved_power_law_fds: DevicePointer<GpuJones>,
+    pub(super) point_curved_power_law_sis: DevicePointer<GpuFloat>,
+    point_curved_power_law_qs: DevicePointer<GpuFloat>,
 
     point_list_radecs: Vec<RADec>,
-    point_list_lmns: DevicePointer<cuda::LmnRime>,
+    point_list_lmns: DevicePointer<gpu::LmnRime>,
     /// Instrumental (i.e. XX, XY, YX, XX).
-    point_list_fds: DevicePointer<CudaJones>,
+    point_list_fds: DevicePointer<GpuJones>,
 
     gaussian_power_law_radecs: Vec<RADec>,
-    gaussian_power_law_lmns: DevicePointer<cuda::LmnRime>,
+    gaussian_power_law_lmns: DevicePointer<gpu::LmnRime>,
     /// Instrumental flux densities calculated at 150 MHz.
-    gaussian_power_law_fds: DevicePointer<CudaJones>,
+    gaussian_power_law_fds: DevicePointer<GpuJones>,
     /// Spectral indices.
-    gaussian_power_law_sis: DevicePointer<CudaFloat>,
-    gaussian_power_law_gps: DevicePointer<cuda::GaussianParams>,
+    gaussian_power_law_sis: DevicePointer<GpuFloat>,
+    gaussian_power_law_gps: DevicePointer<gpu::GaussianParams>,
 
     gaussian_curved_power_law_radecs: Vec<RADec>,
-    gaussian_curved_power_law_lmns: DevicePointer<cuda::LmnRime>,
-    gaussian_curved_power_law_fds: DevicePointer<CudaJones>,
-    gaussian_curved_power_law_sis: DevicePointer<CudaFloat>,
-    gaussian_curved_power_law_qs: DevicePointer<CudaFloat>,
-    gaussian_curved_power_law_gps: DevicePointer<cuda::GaussianParams>,
+    gaussian_curved_power_law_lmns: DevicePointer<gpu::LmnRime>,
+    gaussian_curved_power_law_fds: DevicePointer<GpuJones>,
+    gaussian_curved_power_law_sis: DevicePointer<GpuFloat>,
+    gaussian_curved_power_law_qs: DevicePointer<GpuFloat>,
+    gaussian_curved_power_law_gps: DevicePointer<gpu::GaussianParams>,
 
     gaussian_list_radecs: Vec<RADec>,
-    gaussian_list_lmns: DevicePointer<cuda::LmnRime>,
+    gaussian_list_lmns: DevicePointer<gpu::LmnRime>,
     /// Instrumental (i.e. XX, XY, YX, XX).
-    gaussian_list_fds: DevicePointer<CudaJones>,
-    gaussian_list_gps: DevicePointer<cuda::GaussianParams>,
+    gaussian_list_fds: DevicePointer<GpuJones>,
+    gaussian_list_gps: DevicePointer<gpu::GaussianParams>,
 
     shapelet_power_law_radecs: Vec<RADec>,
-    shapelet_power_law_lmns: DevicePointer<cuda::LmnRime>,
+    shapelet_power_law_lmns: DevicePointer<gpu::LmnRime>,
     /// Instrumental flux densities calculated at 150 MHz.
-    shapelet_power_law_fds: DevicePointer<CudaJones>,
+    shapelet_power_law_fds: DevicePointer<GpuJones>,
     /// Spectral indices.
-    shapelet_power_law_sis: DevicePointer<CudaFloat>,
-    shapelet_power_law_gps: DevicePointer<cuda::GaussianParams>,
-    shapelet_power_law_coeffs: DevicePointer<cuda::ShapeletCoeff>,
+    shapelet_power_law_sis: DevicePointer<GpuFloat>,
+    shapelet_power_law_gps: DevicePointer<gpu::GaussianParams>,
+    shapelet_power_law_coeffs: DevicePointer<gpu::ShapeletCoeff>,
     shapelet_power_law_coeff_lens: DevicePointer<i32>,
 
     shapelet_curved_power_law_radecs: Vec<RADec>,
-    shapelet_curved_power_law_lmns: DevicePointer<cuda::LmnRime>,
-    shapelet_curved_power_law_fds: DevicePointer<CudaJones>,
-    shapelet_curved_power_law_sis: DevicePointer<CudaFloat>,
-    shapelet_curved_power_law_qs: DevicePointer<CudaFloat>,
-    shapelet_curved_power_law_gps: DevicePointer<cuda::GaussianParams>,
-    shapelet_curved_power_law_coeffs: DevicePointer<cuda::ShapeletCoeff>,
+    shapelet_curved_power_law_lmns: DevicePointer<gpu::LmnRime>,
+    shapelet_curved_power_law_fds: DevicePointer<GpuJones>,
+    shapelet_curved_power_law_sis: DevicePointer<GpuFloat>,
+    shapelet_curved_power_law_qs: DevicePointer<GpuFloat>,
+    shapelet_curved_power_law_gps: DevicePointer<gpu::GaussianParams>,
+    shapelet_curved_power_law_coeffs: DevicePointer<gpu::ShapeletCoeff>,
     shapelet_curved_power_law_coeff_lens: DevicePointer<i32>,
 
     shapelet_list_radecs: Vec<RADec>,
-    shapelet_list_lmns: DevicePointer<cuda::LmnRime>,
+    shapelet_list_lmns: DevicePointer<gpu::LmnRime>,
     /// Instrumental (i.e. XX, XY, YX, XX).
-    shapelet_list_fds: DevicePointer<CudaJones>,
-    shapelet_list_gps: DevicePointer<cuda::GaussianParams>,
-    shapelet_list_coeffs: DevicePointer<cuda::ShapeletCoeff>,
+    shapelet_list_fds: DevicePointer<GpuJones>,
+    shapelet_list_gps: DevicePointer<gpu::GaussianParams>,
+    shapelet_list_coeffs: DevicePointer<gpu::ShapeletCoeff>,
     shapelet_list_coeff_lens: DevicePointer<i32>,
 }
 
-impl<'a> SkyModellerCuda<'a> {
+impl<'a> SkyModellerGpu<'a> {
     /// Given a source list, split the components into each component type (e.g.
     /// points, shapelets) and by each flux density type (e.g. list, power law),
     /// then copy them to a GPU ready for modelling. Where possible, list flux
@@ -153,71 +153,71 @@ impl<'a> SkyModellerCuda<'a> {
         array_latitude_rad: f64,
         dut1: Duration,
         apply_precession: bool,
-    ) -> Result<SkyModellerCuda<'a>, ModelError> {
+    ) -> Result<SkyModellerGpu<'a>, ModelError> {
         let mut point_power_law_radecs: Vec<RADec> = vec![];
-        let mut point_power_law_lmns: Vec<cuda::LmnRime> = vec![];
+        let mut point_power_law_lmns: Vec<gpu::LmnRime> = vec![];
         let mut point_power_law_fds: Vec<_> = vec![];
         let mut point_power_law_sis: Vec<_> = vec![];
 
         let mut point_curved_power_law_radecs: Vec<RADec> = vec![];
-        let mut point_curved_power_law_lmns: Vec<cuda::LmnRime> = vec![];
+        let mut point_curved_power_law_lmns: Vec<gpu::LmnRime> = vec![];
         let mut point_curved_power_law_fds: Vec<_> = vec![];
         let mut point_curved_power_law_sis: Vec<_> = vec![];
         let mut point_curved_power_law_qs: Vec<_> = vec![];
 
         let mut point_list_radecs: Vec<RADec> = vec![];
-        let mut point_list_lmns: Vec<cuda::LmnRime> = vec![];
+        let mut point_list_lmns: Vec<gpu::LmnRime> = vec![];
         let mut point_list_fds: Vec<&FluxDensityType> = vec![];
 
         let mut gaussian_power_law_radecs: Vec<RADec> = vec![];
-        let mut gaussian_power_law_lmns: Vec<cuda::LmnRime> = vec![];
+        let mut gaussian_power_law_lmns: Vec<gpu::LmnRime> = vec![];
         let mut gaussian_power_law_fds: Vec<_> = vec![];
         let mut gaussian_power_law_sis: Vec<_> = vec![];
-        let mut gaussian_power_law_gps: Vec<cuda::GaussianParams> = vec![];
+        let mut gaussian_power_law_gps: Vec<gpu::GaussianParams> = vec![];
 
         let mut gaussian_curved_power_law_radecs: Vec<RADec> = vec![];
-        let mut gaussian_curved_power_law_lmns: Vec<cuda::LmnRime> = vec![];
+        let mut gaussian_curved_power_law_lmns: Vec<gpu::LmnRime> = vec![];
         let mut gaussian_curved_power_law_fds: Vec<_> = vec![];
         let mut gaussian_curved_power_law_sis: Vec<_> = vec![];
         let mut gaussian_curved_power_law_qs: Vec<_> = vec![];
-        let mut gaussian_curved_power_law_gps: Vec<cuda::GaussianParams> = vec![];
+        let mut gaussian_curved_power_law_gps: Vec<gpu::GaussianParams> = vec![];
 
         let mut gaussian_list_radecs: Vec<RADec> = vec![];
-        let mut gaussian_list_lmns: Vec<cuda::LmnRime> = vec![];
+        let mut gaussian_list_lmns: Vec<gpu::LmnRime> = vec![];
         let mut gaussian_list_fds: Vec<&FluxDensityType> = vec![];
-        let mut gaussian_list_gps: Vec<cuda::GaussianParams> = vec![];
+        let mut gaussian_list_gps: Vec<gpu::GaussianParams> = vec![];
 
         let mut shapelet_power_law_radecs: Vec<RADec> = vec![];
-        let mut shapelet_power_law_lmns: Vec<cuda::LmnRime> = vec![];
+        let mut shapelet_power_law_lmns: Vec<gpu::LmnRime> = vec![];
         let mut shapelet_power_law_fds: Vec<_> = vec![];
         let mut shapelet_power_law_sis: Vec<_> = vec![];
-        let mut shapelet_power_law_gps: Vec<cuda::GaussianParams> = vec![];
+        let mut shapelet_power_law_gps: Vec<gpu::GaussianParams> = vec![];
         let mut shapelet_power_law_coeffs: Vec<&[ShapeletCoeff]> = vec![];
 
         let mut shapelet_curved_power_law_radecs: Vec<RADec> = vec![];
-        let mut shapelet_curved_power_law_lmns: Vec<cuda::LmnRime> = vec![];
+        let mut shapelet_curved_power_law_lmns: Vec<gpu::LmnRime> = vec![];
         let mut shapelet_curved_power_law_fds: Vec<_> = vec![];
         let mut shapelet_curved_power_law_sis: Vec<_> = vec![];
         let mut shapelet_curved_power_law_qs: Vec<_> = vec![];
-        let mut shapelet_curved_power_law_gps: Vec<cuda::GaussianParams> = vec![];
+        let mut shapelet_curved_power_law_gps: Vec<gpu::GaussianParams> = vec![];
         let mut shapelet_curved_power_law_coeffs: Vec<&[ShapeletCoeff]> = vec![];
 
         let mut shapelet_list_radecs: Vec<RADec> = vec![];
-        let mut shapelet_list_lmns: Vec<cuda::LmnRime> = vec![];
+        let mut shapelet_list_lmns: Vec<gpu::LmnRime> = vec![];
         let mut shapelet_list_fds: Vec<&FluxDensityType> = vec![];
-        let mut shapelet_list_gps: Vec<cuda::GaussianParams> = vec![];
+        let mut shapelet_list_gps: Vec<gpu::GaussianParams> = vec![];
         let mut shapelet_list_coeffs: Vec<&[ShapeletCoeff]> = vec![];
 
-        let jones_to_cuda_jones = |j: Jones<f64>| -> CudaJones {
-            CudaJones {
-                j00_re: j[0].re as CudaFloat,
-                j00_im: j[0].im as CudaFloat,
-                j01_re: j[1].re as CudaFloat,
-                j01_im: j[1].im as CudaFloat,
-                j10_re: j[2].re as CudaFloat,
-                j10_im: j[2].im as CudaFloat,
-                j11_re: j[3].re as CudaFloat,
-                j11_im: j[3].im as CudaFloat,
+        let jones_to_gpu_jones = |j: Jones<f64>| -> GpuJones {
+            GpuJones {
+                j00_re: j[0].re as GpuFloat,
+                j00_im: j[0].im as GpuFloat,
+                j01_re: j[1].re as GpuFloat,
+                j01_im: j[1].im as GpuFloat,
+                j10_re: j[2].re as GpuFloat,
+                j10_im: j[2].im as GpuFloat,
+                j11_re: j[3].re as GpuFloat,
+                j11_im: j[3].im as GpuFloat,
             }
         };
 
@@ -235,39 +235,39 @@ impl<'a> SkyModellerCuda<'a> {
         {
             let radec = comp.radec;
             let LmnRime { l, m, n } = comp.radec.to_lmn(phase_centre).prepare_for_rime();
-            let lmn = cuda::LmnRime {
-                l: l as CudaFloat,
-                m: m as CudaFloat,
-                n: n as CudaFloat,
+            let lmn = gpu::LmnRime {
+                l: l as GpuFloat,
+                m: m as GpuFloat,
+                n: n as GpuFloat,
             };
             match &comp.flux_type {
                 FluxDensityType::PowerLaw { si, fd: _ } => {
                     // Rather than using this PL's reference freq, use a pre-
-                    // defined one, so the CUDA code doesn't need to keep track
-                    // of all reference freqs.
-                    let fd_at_150mhz = comp.estimate_at_freq(cuda::POWER_LAW_FD_REF_FREQ as _);
+                    // defined one, so the the GPU code doesn't need to keep
+                    // track of all reference freqs.
+                    let fd_at_150mhz = comp.estimate_at_freq(gpu::POWER_LAW_FD_REF_FREQ as _);
                     let inst_fd: Jones<f64> = fd_at_150mhz.to_inst_stokes();
-                    let cuda_inst_fd = jones_to_cuda_jones(inst_fd);
+                    let gpu_inst_fd = jones_to_gpu_jones(inst_fd);
 
                     match &comp.comp_type {
                         ComponentType::Point => {
                             point_power_law_radecs.push(radec);
                             point_power_law_lmns.push(lmn);
-                            point_power_law_fds.push(cuda_inst_fd);
-                            point_power_law_sis.push(*si as CudaFloat);
+                            point_power_law_fds.push(gpu_inst_fd);
+                            point_power_law_sis.push(*si as GpuFloat);
                         }
 
                         ComponentType::Gaussian { maj, min, pa } => {
-                            let gp = cuda::GaussianParams {
-                                maj: *maj as CudaFloat,
-                                min: *min as CudaFloat,
-                                pa: *pa as CudaFloat,
+                            let gp = gpu::GaussianParams {
+                                maj: *maj as GpuFloat,
+                                min: *min as GpuFloat,
+                                pa: *pa as GpuFloat,
                             };
                             gaussian_power_law_radecs.push(radec);
                             gaussian_power_law_lmns.push(lmn);
                             gaussian_power_law_gps.push(gp);
-                            gaussian_power_law_fds.push(cuda_inst_fd);
-                            gaussian_power_law_sis.push(*si as CudaFloat);
+                            gaussian_power_law_fds.push(gpu_inst_fd);
+                            gaussian_power_law_sis.push(*si as GpuFloat);
                         }
 
                         ComponentType::Shapelet {
@@ -276,34 +276,34 @@ impl<'a> SkyModellerCuda<'a> {
                             pa,
                             coeffs,
                         } => {
-                            let gp = cuda::GaussianParams {
-                                maj: *maj as CudaFloat,
-                                min: *min as CudaFloat,
-                                pa: *pa as CudaFloat,
+                            let gp = gpu::GaussianParams {
+                                maj: *maj as GpuFloat,
+                                min: *min as GpuFloat,
+                                pa: *pa as GpuFloat,
                             };
                             shapelet_power_law_radecs.push(radec);
                             shapelet_power_law_lmns.push(lmn);
                             shapelet_power_law_gps.push(gp);
                             shapelet_power_law_coeffs.push(coeffs);
-                            shapelet_power_law_fds.push(cuda_inst_fd);
-                            shapelet_power_law_sis.push(*si as CudaFloat);
+                            shapelet_power_law_fds.push(gpu_inst_fd);
+                            shapelet_power_law_sis.push(*si as GpuFloat);
                         }
                     };
                 }
 
                 FluxDensityType::CurvedPowerLaw { si, fd, q } => {
-                    let fd_at_150mhz = comp.estimate_at_freq(cuda::POWER_LAW_FD_REF_FREQ as _);
+                    let fd_at_150mhz = comp.estimate_at_freq(gpu::POWER_LAW_FD_REF_FREQ as _);
                     let inst_fd: Jones<f64> = fd_at_150mhz.to_inst_stokes();
-                    let cuda_inst_fd = jones_to_cuda_jones(inst_fd);
+                    let gpu_inst_fd = jones_to_gpu_jones(inst_fd);
 
                     // A new SI is needed when changing the reference freq.
                     // Thanks Jack.
                     #[allow(clippy::unnecessary_cast)]
-                    let si = if fd.freq == cuda::POWER_LAW_FD_REF_FREQ as f64 {
+                    let si = if fd.freq == gpu::POWER_LAW_FD_REF_FREQ as f64 {
                         *si
                     } else {
                         #[allow(clippy::unnecessary_cast)]
-                        let logratio = (fd.freq / cuda::POWER_LAW_FD_REF_FREQ as f64).ln();
+                        let logratio = (fd.freq / gpu::POWER_LAW_FD_REF_FREQ as f64).ln();
                         ((fd.i / fd_at_150mhz.i).ln() - q * logratio.powi(2)) / logratio
                     };
 
@@ -311,23 +311,23 @@ impl<'a> SkyModellerCuda<'a> {
                         ComponentType::Point => {
                             point_curved_power_law_radecs.push(radec);
                             point_curved_power_law_lmns.push(lmn);
-                            point_curved_power_law_fds.push(cuda_inst_fd);
-                            point_curved_power_law_sis.push(si as CudaFloat);
-                            point_curved_power_law_qs.push(*q as CudaFloat);
+                            point_curved_power_law_fds.push(gpu_inst_fd);
+                            point_curved_power_law_sis.push(si as GpuFloat);
+                            point_curved_power_law_qs.push(*q as GpuFloat);
                         }
 
                         ComponentType::Gaussian { maj, min, pa } => {
-                            let gp = cuda::GaussianParams {
-                                maj: *maj as CudaFloat,
-                                min: *min as CudaFloat,
-                                pa: *pa as CudaFloat,
+                            let gp = gpu::GaussianParams {
+                                maj: *maj as GpuFloat,
+                                min: *min as GpuFloat,
+                                pa: *pa as GpuFloat,
                             };
                             gaussian_curved_power_law_radecs.push(radec);
                             gaussian_curved_power_law_lmns.push(lmn);
                             gaussian_curved_power_law_gps.push(gp);
-                            gaussian_curved_power_law_fds.push(cuda_inst_fd);
-                            gaussian_curved_power_law_sis.push(si as CudaFloat);
-                            gaussian_curved_power_law_qs.push(*q as CudaFloat);
+                            gaussian_curved_power_law_fds.push(gpu_inst_fd);
+                            gaussian_curved_power_law_sis.push(si as GpuFloat);
+                            gaussian_curved_power_law_qs.push(*q as GpuFloat);
                         }
 
                         ComponentType::Shapelet {
@@ -336,18 +336,18 @@ impl<'a> SkyModellerCuda<'a> {
                             pa,
                             coeffs,
                         } => {
-                            let gp = cuda::GaussianParams {
-                                maj: *maj as CudaFloat,
-                                min: *min as CudaFloat,
-                                pa: *pa as CudaFloat,
+                            let gp = gpu::GaussianParams {
+                                maj: *maj as GpuFloat,
+                                min: *min as GpuFloat,
+                                pa: *pa as GpuFloat,
                             };
                             shapelet_curved_power_law_radecs.push(radec);
                             shapelet_curved_power_law_lmns.push(lmn);
                             shapelet_curved_power_law_gps.push(gp);
                             shapelet_curved_power_law_coeffs.push(coeffs);
-                            shapelet_curved_power_law_fds.push(cuda_inst_fd);
-                            shapelet_curved_power_law_sis.push(si as CudaFloat);
-                            shapelet_curved_power_law_qs.push(*q as CudaFloat);
+                            shapelet_curved_power_law_fds.push(gpu_inst_fd);
+                            shapelet_curved_power_law_sis.push(si as GpuFloat);
+                            shapelet_curved_power_law_qs.push(*q as GpuFloat);
                         }
                     };
                 }
@@ -360,10 +360,10 @@ impl<'a> SkyModellerCuda<'a> {
                     }
 
                     ComponentType::Gaussian { maj, min, pa } => {
-                        let gp = cuda::GaussianParams {
-                            maj: *maj as CudaFloat,
-                            min: *min as CudaFloat,
-                            pa: *pa as CudaFloat,
+                        let gp = gpu::GaussianParams {
+                            maj: *maj as GpuFloat,
+                            min: *min as GpuFloat,
+                            pa: *pa as GpuFloat,
                         };
                         gaussian_list_radecs.push(radec);
                         gaussian_list_lmns.push(lmn);
@@ -377,10 +377,10 @@ impl<'a> SkyModellerCuda<'a> {
                         pa,
                         coeffs,
                     } => {
-                        let gp = cuda::GaussianParams {
-                            maj: *maj as CudaFloat,
-                            min: *min as CudaFloat,
-                            pa: *pa as CudaFloat,
+                        let gp = gpu::GaussianParams {
+                            maj: *maj as GpuFloat,
+                            min: *min as GpuFloat,
+                            pa: *pa as GpuFloat,
                         };
                         shapelet_list_radecs.push(radec);
                         shapelet_list_lmns.push(lmn);
@@ -394,13 +394,13 @@ impl<'a> SkyModellerCuda<'a> {
 
         let point_list_fds =
             get_instrumental_flux_densities(&point_list_fds, unflagged_fine_chan_freqs)
-                .mapv(jones_to_cuda_jones);
+                .mapv(jones_to_gpu_jones);
         let gaussian_list_fds =
             get_instrumental_flux_densities(&gaussian_list_fds, unflagged_fine_chan_freqs)
-                .mapv(jones_to_cuda_jones);
+                .mapv(jones_to_gpu_jones);
         let shapelet_list_fds =
             get_instrumental_flux_densities(&shapelet_list_fds, unflagged_fine_chan_freqs)
-                .mapv(jones_to_cuda_jones);
+                .mapv(jones_to_gpu_jones);
 
         let (shapelet_power_law_coeffs, shapelet_power_law_coeff_lens) =
             get_flattened_coeffs(shapelet_power_law_coeffs);
@@ -409,16 +409,16 @@ impl<'a> SkyModellerCuda<'a> {
         let (shapelet_list_coeffs, shapelet_list_coeff_lens) =
             get_flattened_coeffs(shapelet_list_coeffs);
 
-        // Variables for CUDA. They're made flexible in their types for
-        // whichever precision is being used in the CUDA code.
+        // Variables for CUDA/HIP. They're made flexible in their types for
+        // whichever precision is being used.
         let (unflagged_fine_chan_freqs_ints, unflagged_fine_chan_freqs_floats): (Vec<_>, Vec<_>) =
             unflagged_fine_chan_freqs
                 .iter()
-                .map(|&f| (f as u32, f as CudaFloat))
+                .map(|&f| (f as u32, f as GpuFloat))
                 .unzip();
-        let shapelet_basis_values: Vec<CudaFloat> = shapelets::SHAPELET_BASIS_VALUES
+        let shapelet_basis_values: Vec<GpuFloat> = shapelets::SHAPELET_BASIS_VALUES
             .iter()
-            .map(|&f| f as CudaFloat)
+            .map(|&f| f as GpuFloat)
             .collect();
 
         let num_baselines = (unflagged_tile_xyzs.len() * (unflagged_tile_xyzs.len() - 1)) / 2;
@@ -441,8 +441,8 @@ impl<'a> SkyModellerCuda<'a> {
         let d_tile_index_to_unflagged_tile_index_map =
             DevicePointer::copy_to_device(&tile_index_to_unflagged_tile_index_map)?;
 
-        Ok(SkyModellerCuda {
-            cuda_beam: beam.prepare_cuda_beam(&unflagged_fine_chan_freqs_ints)?,
+        Ok(SkyModellerGpu {
+            gpu_beam: beam.prepare_gpu_beam(&unflagged_fine_chan_freqs_ints)?,
 
             phase_centre,
             array_longitude: array_longitude_rad,
@@ -557,8 +557,8 @@ impl<'a> SkyModellerCuda<'a> {
     /// This function is mostly used for testing. For a single timestep, over
     /// the already-provided baselines and frequencies, generate visibilities
     /// for each specified sky-model point-source component. The
-    /// `SkyModellerCuda` object *must* already have its UVW coordinates set;
-    /// see [`SkyModellerCuda::set_uvws`].
+    /// `SkyModellerGpu` object *must* already have its UVW coordinates set; see
+    /// [`SkyModellerGpu::set_uvws`].
     ///
     /// `lst_rad`: The local sidereal time in \[radians\].
     ///
@@ -568,8 +568,8 @@ impl<'a> SkyModellerCuda<'a> {
         &self,
         lst_rad: f64,
         array_latitude_rad: f64,
-        d_uvws: &DevicePointer<cuda::UVW>,
-        d_beam_jones: &mut DevicePointer<CudaJones>,
+        d_uvws: &DevicePointer<gpu::UVW>,
+        d_beam_jones: &mut DevicePointer<GpuJones>,
         d_vis_fb: &mut DevicePointer<Jones<f32>>,
     ) -> Result<(), ModelError> {
         if self.point_power_law_radecs.is_empty()
@@ -580,23 +580,23 @@ impl<'a> SkyModellerCuda<'a> {
         }
 
         {
-            let (azs, zas): (Vec<CudaFloat>, Vec<CudaFloat>) = self
+            let (azs, zas): (Vec<GpuFloat>, Vec<GpuFloat>) = self
                 .point_power_law_radecs
                 .iter()
                 .chain(self.point_curved_power_law_radecs.iter())
                 .chain(self.point_list_radecs.iter())
                 .map(|radec| {
                     let azel = radec.to_hadec(lst_rad).to_azel(array_latitude_rad);
-                    (azel.az as CudaFloat, azel.za() as CudaFloat)
+                    (azel.az as GpuFloat, azel.za() as GpuFloat)
                 })
                 .unzip();
             d_beam_jones.realloc(
-                self.cuda_beam.get_num_unique_tiles() as usize
-                    * self.cuda_beam.get_num_unique_freqs() as usize
+                self.gpu_beam.get_num_unique_tiles() as usize
+                    * self.gpu_beam.get_num_unique_freqs() as usize
                     * azs.len()
-                    * std::mem::size_of::<CudaJones>(),
+                    * std::mem::size_of::<GpuJones>(),
             )?;
-            self.cuda_beam.calc_jones_pair(
+            self.gpu_beam.calc_jones_pair(
                 &azs,
                 &zas,
                 array_latitude_rad,
@@ -604,9 +604,9 @@ impl<'a> SkyModellerCuda<'a> {
             )?;
         }
 
-        cuda_kernel_call!(
-            cuda::model_points,
-            &cuda::Points {
+        gpu_kernel_call!(
+            gpu::model_points,
+            &gpu::Points {
                 num_power_laws: self
                     .point_power_law_radecs
                     .len()
@@ -644,8 +644,8 @@ impl<'a> SkyModellerCuda<'a> {
     /// This function is mostly used for testing. For a single timestep, over
     /// the already-provided baselines and frequencies, generate visibilities
     /// for each specified sky-model Gaussian-source component. The
-    /// `SkyModellerCuda` object *must* already have its UVW coordinates set;
-    /// see [`SkyModellerCuda::set_uvws`].
+    /// `SkyModellerGpu` object *must* already have its UVW coordinates set; see
+    /// [`SkyModellerGpu::set_uvws`].
     ///
     /// `lst_rad`: The local sidereal time in \[radians\].
     ///
@@ -655,8 +655,8 @@ impl<'a> SkyModellerCuda<'a> {
         &self,
         lst_rad: f64,
         array_latitude_rad: f64,
-        d_uvws: &DevicePointer<cuda::UVW>,
-        d_beam_jones: &mut DevicePointer<CudaJones>,
+        d_uvws: &DevicePointer<gpu::UVW>,
+        d_beam_jones: &mut DevicePointer<GpuJones>,
         d_vis_fb: &mut DevicePointer<Jones<f32>>,
     ) -> Result<(), ModelError> {
         if self.gaussian_power_law_radecs.is_empty()
@@ -667,23 +667,23 @@ impl<'a> SkyModellerCuda<'a> {
         }
 
         {
-            let (azs, zas): (Vec<CudaFloat>, Vec<CudaFloat>) = self
+            let (azs, zas): (Vec<GpuFloat>, Vec<GpuFloat>) = self
                 .gaussian_power_law_radecs
                 .iter()
                 .chain(self.gaussian_curved_power_law_radecs.iter())
                 .chain(self.gaussian_list_radecs.iter())
                 .map(|radec| {
                     let azel = radec.to_hadec(lst_rad).to_azel(array_latitude_rad);
-                    (azel.az as CudaFloat, azel.za() as CudaFloat)
+                    (azel.az as GpuFloat, azel.za() as GpuFloat)
                 })
                 .unzip();
             d_beam_jones.realloc(
-                self.cuda_beam.get_num_unique_tiles() as usize
-                    * self.cuda_beam.get_num_unique_freqs() as usize
+                self.gpu_beam.get_num_unique_tiles() as usize
+                    * self.gpu_beam.get_num_unique_freqs() as usize
                     * azs.len()
-                    * std::mem::size_of::<CudaJones>(),
+                    * std::mem::size_of::<GpuJones>(),
             )?;
-            self.cuda_beam.calc_jones_pair(
+            self.gpu_beam.calc_jones_pair(
                 &azs,
                 &zas,
                 array_latitude_rad,
@@ -691,9 +691,9 @@ impl<'a> SkyModellerCuda<'a> {
             )?;
         }
 
-        cuda_kernel_call!(
-            cuda::model_gaussians,
-            &cuda::Gaussians {
+        gpu_kernel_call!(
+            gpu::model_gaussians,
+            &gpu::Gaussians {
                 num_power_laws: self
                     .gaussian_power_law_radecs
                     .len()
@@ -734,8 +734,8 @@ impl<'a> SkyModellerCuda<'a> {
     /// This function is mostly used for testing. For a single timestep, over
     /// the already-provided baselines and frequencies, generate visibilities
     /// for each specified sky-model Gaussian-source component. The
-    /// `SkyModellerCuda` object *must* already have its UVW coordinates set;
-    /// see [`SkyModellerCuda::set_uvws`].
+    /// `SkyModellerGpu` object *must* already have its UVW coordinates set; see
+    /// [`SkyModellerGpu::set_uvws`].
     ///
     /// `lst_rad`: The local sidereal time in \[radians\].
     ///
@@ -745,8 +745,8 @@ impl<'a> SkyModellerCuda<'a> {
         &self,
         lst_rad: f64,
         array_latitude_rad: f64,
-        d_uvws: &DevicePointer<cuda::UVW>,
-        d_beam_jones: &mut DevicePointer<CudaJones>,
+        d_uvws: &DevicePointer<gpu::UVW>,
+        d_beam_jones: &mut DevicePointer<GpuJones>,
         d_vis_fb: &mut DevicePointer<Jones<f32>>,
     ) -> Result<(), ModelError> {
         if self.shapelet_power_law_radecs.is_empty()
@@ -757,23 +757,23 @@ impl<'a> SkyModellerCuda<'a> {
         }
 
         {
-            let (azs, zas): (Vec<CudaFloat>, Vec<CudaFloat>) = self
+            let (azs, zas): (Vec<GpuFloat>, Vec<GpuFloat>) = self
                 .shapelet_power_law_radecs
                 .iter()
                 .chain(self.shapelet_curved_power_law_radecs.iter())
                 .chain(self.shapelet_list_radecs.iter())
                 .map(|radec| {
                     let azel = radec.to_hadec(lst_rad).to_azel(array_latitude_rad);
-                    (azel.az as CudaFloat, azel.za() as CudaFloat)
+                    (azel.az as GpuFloat, azel.za() as GpuFloat)
                 })
                 .unzip();
             d_beam_jones.realloc(
-                self.cuda_beam.get_num_unique_tiles() as usize
-                    * self.cuda_beam.get_num_unique_freqs() as usize
+                self.gpu_beam.get_num_unique_tiles() as usize
+                    * self.gpu_beam.get_num_unique_freqs() as usize
                     * azs.len()
-                    * std::mem::size_of::<CudaJones>(),
+                    * std::mem::size_of::<GpuJones>(),
             )?;
-            self.cuda_beam.calc_jones_pair(
+            self.gpu_beam.calc_jones_pair(
                 &azs,
                 &zas,
                 array_latitude_rad,
@@ -788,9 +788,9 @@ impl<'a> SkyModellerCuda<'a> {
             DevicePointer::copy_to_device(uvs.curved_power_law.as_slice().expect("is contiguous"))?;
         let list_uvs = DevicePointer::copy_to_device(uvs.list.as_slice().expect("is contiguous"))?;
 
-        cuda_kernel_call!(
-            cuda::model_shapelets,
-            &cuda::Shapelets {
+        gpu_kernel_call!(
+            gpu::model_shapelets,
+            &gpu::Shapelets {
                 num_power_laws: self
                     .shapelet_power_law_radecs
                     .len()
@@ -840,15 +840,15 @@ impl<'a> SkyModellerCuda<'a> {
     }
 
     /// This is a "specialised" version of [`SkyModeller::model_timestep_with`];
-    /// it accepts CUDA buffers directly, saving some allocations. Unlike the
+    /// it accepts GPU buffers directly, saving some allocations. Unlike the
     /// aforementioned function, the incoming visibilities *are not* cleared;
     /// visibilities are accumulated.
     fn model_timestep_with(
         &self,
         lst_rad: f64,
         array_latitude_rad: f64,
-        d_uvws: &DevicePointer<cuda::UVW>,
-        d_beam_jones: &mut DevicePointer<CudaJones>,
+        d_uvws: &DevicePointer<gpu::UVW>,
+        d_beam_jones: &mut DevicePointer<GpuJones>,
         d_vis_fb: &mut DevicePointer<Jones<f32>>,
     ) -> Result<(), ModelError> {
         unsafe {
@@ -867,8 +867,8 @@ impl<'a> SkyModellerCuda<'a> {
     fn get_lst_uvws_latitude(
         &self,
         timestamp: Epoch,
-        d_uvws: &mut DevicePointer<cuda::UVW>,
-    ) -> Result<(f64, Vec<UVW>, f64), CudaError> {
+        d_uvws: &mut DevicePointer<gpu::UVW>,
+    ) -> Result<(f64, Vec<UVW>, f64), GpuError> {
         let (lst, xyzs, latitude) = if self.precess {
             let precession_info = precess_time(
                 self.array_longitude,
@@ -905,30 +905,30 @@ impl<'a> SkyModellerCuda<'a> {
         };
 
         let uvws = xyzs_to_cross_uvws(&xyzs, self.phase_centre.to_hadec(lst));
-        let cuda_uvws: Vec<cuda::UVW> = uvws
+        let gpu_uvws: Vec<gpu::UVW> = uvws
             .iter()
-            .map(|&uvw| cuda::UVW {
-                u: uvw.u as CudaFloat,
-                v: uvw.v as CudaFloat,
-                w: uvw.w as CudaFloat,
+            .map(|&uvw| gpu::UVW {
+                u: uvw.u as GpuFloat,
+                v: uvw.v as GpuFloat,
+                w: uvw.w as GpuFloat,
             })
             .collect();
-        d_uvws.overwrite(&cuda_uvws)?;
+        d_uvws.overwrite(&gpu_uvws)?;
 
         Ok((lst, uvws, latitude))
     }
 
-    /// Get a populated [`cuda::Addresses`]. This should never outlive `self`.
-    fn get_addresses(&self) -> cuda::Addresses {
-        cuda::Addresses {
+    /// Get a populated [`gpu::Addresses`]. This should never outlive `self`.
+    fn get_addresses(&self) -> gpu::Addresses {
+        gpu::Addresses {
             num_freqs: self.num_freqs,
             num_vis: self.num_baselines * self.num_freqs,
             num_baselines: self.num_baselines,
             d_freqs: self.d_freqs.get(),
             d_shapelet_basis_values: self.d_shapelet_basis_values.get(),
-            num_unique_beam_freqs: self.cuda_beam.get_num_unique_freqs(),
-            d_tile_map: self.cuda_beam.get_tile_map(),
-            d_freq_map: self.cuda_beam.get_freq_map(),
+            num_unique_beam_freqs: self.gpu_beam.get_num_unique_freqs(),
+            d_tile_map: self.gpu_beam.get_tile_map(),
+            d_freq_map: self.gpu_beam.get_freq_map(),
             d_tile_index_to_unflagged_tile_index_map: self
                 .tile_index_to_unflagged_tile_index_map
                 .get(),
@@ -937,7 +937,7 @@ impl<'a> SkyModellerCuda<'a> {
 
     /// Shapelets need their own special kind of UVW coordinates. Each shapelet
     /// component's position is treated as the phase centre. This function uses
-    /// the FFI type [`cuda::ShapeletUV`]; the W isn't actually used in
+    /// the FFI type [`gpu::ShapeletUV`]; the W isn't actually used in
     /// computation, and omitting it is hopefully a little more efficient.
     ///
     /// The returned arrays have baseline as the first axis and component as the
@@ -963,7 +963,7 @@ impl<'a> SkyModellerCuda<'a> {
     }
 }
 
-impl<'a> SkyModeller<'a> for SkyModellerCuda<'a> {
+impl<'a> SkyModeller<'a> for SkyModellerGpu<'a> {
     fn model_timestep(
         &self,
         timestamp: Epoch,
@@ -1006,36 +1006,36 @@ impl<'a> SkyModeller<'a> for SkyModellerCuda<'a> {
     }
 }
 
-/// The return type of [SkyModellerCuda::get_shapelet_uvs]. These arrays have
+/// The return type of [SkyModellerGpu::get_shapelet_uvs]. These arrays have
 /// baseline as the first axis and component as the second.
 pub(super) struct ShapeletUVs {
-    power_law: Array2<cuda::ShapeletUV>,
-    curved_power_law: Array2<cuda::ShapeletUV>,
-    pub(super) list: Array2<cuda::ShapeletUV>,
+    power_law: Array2<gpu::ShapeletUV>,
+    curved_power_law: Array2<gpu::ShapeletUV>,
+    pub(super) list: Array2<gpu::ShapeletUV>,
 }
 
 fn get_shapelet_uvs_inner(
     radecs: &[RADec],
     lst_rad: f64,
     tile_xyzs: &[XyzGeodetic],
-) -> Array2<cuda::ShapeletUV> {
+) -> Array2<gpu::ShapeletUV> {
     let n = tile_xyzs.len();
     let num_baselines = (n * (n - 1)) / 2;
 
-    let mut shapelet_uvs: Array2<cuda::ShapeletUV> = Array2::from_elem(
+    let mut shapelet_uvs: Array2<gpu::ShapeletUV> = Array2::from_elem(
         (num_baselines, radecs.len()),
-        cuda::ShapeletUV { u: 0.0, v: 0.0 },
+        gpu::ShapeletUV { u: 0.0, v: 0.0 },
     );
     shapelet_uvs
         .axis_iter_mut(Axis(1))
         .zip(radecs.iter())
         .for_each(|(mut baseline_uv, radec)| {
             let hadec = radec.to_hadec(lst_rad);
-            let shapelet_uvs: Vec<cuda::ShapeletUV> = xyzs_to_cross_uvws(tile_xyzs, hadec)
+            let shapelet_uvs: Vec<gpu::ShapeletUV> = xyzs_to_cross_uvws(tile_xyzs, hadec)
                 .into_iter()
-                .map(|uvw| cuda::ShapeletUV {
-                    u: uvw.u as CudaFloat,
-                    v: uvw.v as CudaFloat,
+                .map(|uvw| gpu::ShapeletUV {
+                    u: uvw.u as GpuFloat,
+                    v: uvw.v as GpuFloat,
                 })
                 .collect();
             baseline_uv.assign(&Array1::from(shapelet_uvs));
@@ -1049,8 +1049,8 @@ fn get_shapelet_uvs_inner(
 /// array-of-arrays).
 fn get_flattened_coeffs(
     shapelet_coeffs: Vec<&[ShapeletCoeff]>,
-) -> (Vec<cuda::ShapeletCoeff>, Vec<i32>) {
-    let mut coeffs: Vec<cuda::ShapeletCoeff> = vec![];
+) -> (Vec<gpu::ShapeletCoeff>, Vec<i32>) {
+    let mut coeffs: Vec<gpu::ShapeletCoeff> = vec![];
     let mut coeff_lengths = Vec::with_capacity(coeffs.len());
 
     for coeffs_for_comp in shapelet_coeffs {
@@ -1061,10 +1061,10 @@ fn get_flattened_coeffs(
                 .expect("not bigger than i32::MAX"),
         );
         for &ShapeletCoeff { n1, n2, value } in coeffs_for_comp {
-            coeffs.push(cuda::ShapeletCoeff {
+            coeffs.push(gpu::ShapeletCoeff {
                 n1,
                 n2,
-                value: value as CudaFloat,
+                value: value as GpuFloat,
             })
         }
     }
diff --git a/src/model/mod.rs b/src/model/mod.rs
index f414aed6..2125bf32 100644
--- a/src/model/mod.rs
+++ b/src/model/mod.rs
@@ -5,17 +5,17 @@
 //! Code to generate sky-model visibilities.
 
 mod cpu;
-#[cfg(feature = "cuda")]
-mod cuda;
 mod error;
+#[cfg(any(feature = "cuda", feature = "hip"))]
+mod gpu;
 pub(crate) mod shapelets;
 #[cfg(test)]
 mod tests;
 
 pub use cpu::SkyModellerCpu;
-#[cfg(feature = "cuda")]
-pub use cuda::SkyModellerCuda;
 pub(crate) use error::ModelError;
+#[cfg(any(feature = "cuda", feature = "hip"))]
+pub use gpu::SkyModellerGpu;
 
 use std::collections::HashSet;
 
@@ -31,10 +31,10 @@ pub enum ModelDevice {
     /// when modelling.
     Cpu,
 
-    /// A CUDA-capable device is used for modelling. The precision depends on
-    /// the compile features used.
-    #[cfg(feature = "cuda")]
-    Cuda,
+    /// A CUDA- or HIP-capable device is used for modelling. The precision
+    /// depends on the compile features used.
+    #[cfg(any(feature = "cuda", feature = "hip"))]
+    Gpu,
 }
 
 impl ModelDevice {
@@ -42,11 +42,11 @@ impl ModelDevice {
         match self {
             ModelDevice::Cpu => "double",
 
-            #[cfg(feature = "cuda-single")]
-            ModelDevice::Cuda => "single",
+            #[cfg(feature = "gpu-single")]
+            ModelDevice::Gpu => "single",
 
-            #[cfg(all(feature = "cuda", not(feature = "cuda-single")))]
-            ModelDevice::Cuda => "double",
+            #[cfg(all(any(feature = "cuda", feature = "hip"), not(feature = "gpu-single")))]
+            ModelDevice::Gpu => "double",
         }
     }
 
@@ -56,11 +56,15 @@ impl ModelDevice {
         match self {
             ModelDevice::Cpu => Ok(get_cpu_info()),
 
-            #[cfg(feature = "cuda")]
-            ModelDevice::Cuda => {
-                let (device_info, driver_info) = crate::cuda::get_device_info()?;
+            #[cfg(any(feature = "cuda", feature = "hip"))]
+            ModelDevice::Gpu => {
+                let (device_info, driver_info) = crate::gpu::get_device_info()?;
+                #[cfg(feature = "cuda")]
+                let device_type = "CUDA";
+                #[cfg(feature = "hip")]
+                let device_type = "HIP";
                 Ok(format!(
-                    "{} (capability {}, {} MiB), CUDA driver {}, runtime {}",
+                    "{} (capability {}, {} MiB), {device_type} driver {}, runtime {}",
                     device_info.name,
                     device_info.capability,
                     device_info.total_global_mem,
@@ -74,9 +78,9 @@ impl ModelDevice {
 
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum DeviceError {
-    #[cfg(feature = "cuda")]
+    #[cfg(any(feature = "cuda", feature = "hip"))]
     #[error(transparent)]
-    Cuda(#[from] crate::cuda::CudaError),
+    Gpu(#[from] crate::gpu::GpuError),
 }
 
 /// Get a formatted string with information on the device used for modelling.
@@ -127,7 +131,8 @@ pub trait SkyModeller<'a> {
     /// # Errors
     ///
     /// This function will return an error if there was a problem with
-    /// beam-response calculation or a CUDA error (if using CUDA functionality).
+    /// beam-response calculation or a CUDA/HIP error (if using CUDA/HIP
+    /// functionality).
     fn model_timestep(
         &self,
         timestamp: Epoch,
@@ -151,7 +156,8 @@ pub trait SkyModeller<'a> {
     /// # Errors
     ///
     /// This function will return an error if there was a problem with
-    /// beam-response calculation or a CUDA error (if using CUDA functionality).
+    /// beam-response calculation or a CUDA/HIP error (if using CUDA/HIP
+    /// functionality).
     fn model_timestep_with(
         &self,
         timestamp: Epoch,
@@ -160,14 +166,13 @@ pub trait SkyModeller<'a> {
 }
 
 /// Create a [`SkyModeller`] trait object that generates sky-model visibilities
-/// on either the CPU or a CUDA-compatible GPU. This function conveniently
-/// provides either a [`SkyModellerCpu`] or [`SkyModellerCuda`] depending on how
-/// `hyperdrive` was compiled and the `use_cpu_for_modelling` flag.
+/// on the CPU, a CUDA-compatible GPU or a HIP-compatible GPU, depending on the
+/// value of [`MODEL_DEVICE`].
 ///
 /// # Errors
 ///
-/// This function will return an error if CUDA mallocs and copies can't be
-/// executed, or if there was a problem in setting up a `BeamCUDA`.
+/// This function will return an error if GPU mallocs and copies can't be
+/// executed, or if there was a problem in setting up a `BeamGpu`.
 #[allow(clippy::too_many_arguments)]
 pub fn new_sky_modeller<'a>(
     beam: &'a dyn Beam,
@@ -197,9 +202,9 @@ pub fn new_sky_modeller<'a>(
             apply_precession,
         ))),
 
-        #[cfg(feature = "cuda")]
-        ModelDevice::Cuda => {
-            let modeller = SkyModellerCuda::new(
+        #[cfg(any(feature = "cuda", feature = "hip"))]
+        ModelDevice::Gpu => {
+            let modeller = SkyModellerGpu::new(
                 beam,
                 source_list,
                 pols,
diff --git a/src/model/tests/cpu.rs b/src/model/tests/cpu.rs
index 32f4938a..69d290e9 100644
--- a/src/model/tests/cpu.rs
+++ b/src/model/tests/cpu.rs
@@ -361,8 +361,8 @@ fn shapelet_multiple_components() {
         .components
         .shapelets
         .get_shapelet_uvws(0.0, &obs.xyzs);
-    // Set the w terms to 0, because they 0 on the CUDA side, and this way the
-    // CPU and CUDA tests can use the same test values.
+    // Set the w terms to 0, because they are 0 on the GPU side, and this way
+    // the CPU and GPU tests can use the same test values.
     shapelet_uvws.iter_mut().for_each(|uvw| uvw.w = 0.0);
 
     test_multiple_shapelet_components(visibilities.view(), shapelet_uvws.view(), 0.0, 0.0);
diff --git a/src/model/tests/cuda.rs b/src/model/tests/gpu.rs
similarity index 94%
rename from src/model/tests/cuda.rs
rename to src/model/tests/gpu.rs
index 892fc0e2..11386a75 100644
--- a/src/model/tests/cuda.rs
+++ b/src/model/tests/gpu.rs
@@ -2,7 +2,7 @@
 // License, v. 2.0. If a copy of the MPL was not distributed with this
 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-//! Tests on generating sky-model visibilities with CUDA.
+//! Tests on generating sky-model visibilities with a GPU.
 //!
 //! These tests use the same expected values as the CPU tests.
 
@@ -10,7 +10,7 @@ use ndarray::prelude::*;
 
 use super::*;
 use crate::{
-    cuda::{self, CudaFloat, DevicePointer},
+    gpu::{self, DevicePointer, GpuFloat},
     srclist::Source,
 };
 
@@ -40,9 +40,9 @@ macro_rules! test_modelling {
         d_vis_fb
             .copy_from_device(visibilities.as_slice_mut().unwrap())
             .unwrap();
-        #[cfg(not(feature = "cuda-single"))]
+        #[cfg(not(feature = "gpu-single"))]
         let epsilon = if $no_beam { 0.0 } else { 1e-15 };
-        #[cfg(feature = "cuda-single")]
+        #[cfg(feature = "gpu-single")]
         let epsilon = if $no_beam { 6e-8 } else { 2e-3 };
         $list_test_fn(visibilities.view(), epsilon);
         d_vis_fb.clear();
@@ -88,7 +88,7 @@ macro_rules! test_modelling {
 fn point_zenith_gpu() {
     test_modelling!(
         true,
-        SkyModellerCuda::model_points,
+        SkyModellerGpu::model_points,
         &POINT_ZENITH_LIST,
         &POINT_ZENITH_POWER_LAW,
         &POINT_ZENITH_CURVED_POWER_LAW,
@@ -102,7 +102,7 @@ fn point_zenith_gpu() {
 fn point_off_zenith_gpu() {
     test_modelling!(
         true,
-        SkyModellerCuda::model_points,
+        SkyModellerGpu::model_points,
         &POINT_OFF_ZENITH_LIST,
         &POINT_OFF_ZENITH_POWER_LAW,
         &POINT_OFF_ZENITH_CURVED_POWER_LAW,
@@ -116,7 +116,7 @@ fn point_off_zenith_gpu() {
 fn gaussian_zenith_gpu() {
     test_modelling!(
         true,
-        SkyModellerCuda::model_gaussians,
+        SkyModellerGpu::model_gaussians,
         &GAUSSIAN_ZENITH_LIST,
         &GAUSSIAN_ZENITH_POWER_LAW,
         &GAUSSIAN_ZENITH_CURVED_POWER_LAW,
@@ -130,7 +130,7 @@ fn gaussian_zenith_gpu() {
 fn gaussian_off_zenith_gpu() {
     test_modelling!(
         true,
-        SkyModellerCuda::model_gaussians,
+        SkyModellerGpu::model_gaussians,
         &GAUSSIAN_OFF_ZENITH_LIST,
         &GAUSSIAN_OFF_ZENITH_POWER_LAW,
         &GAUSSIAN_OFF_ZENITH_CURVED_POWER_LAW,
@@ -144,7 +144,7 @@ fn gaussian_off_zenith_gpu() {
 fn shapelet_zenith_gpu() {
     test_modelling!(
         true,
-        SkyModellerCuda::model_shapelets,
+        SkyModellerGpu::model_shapelets,
         &SHAPELET_ZENITH_LIST,
         &SHAPELET_ZENITH_POWER_LAW,
         &SHAPELET_ZENITH_CURVED_POWER_LAW,
@@ -158,7 +158,7 @@ fn shapelet_zenith_gpu() {
 fn shapelet_off_zenith_gpu() {
     test_modelling!(
         true,
-        SkyModellerCuda::model_shapelets,
+        SkyModellerGpu::model_shapelets,
         &SHAPELET_OFF_ZENITH_LIST,
         &SHAPELET_OFF_ZENITH_POWER_LAW,
         &SHAPELET_OFF_ZENITH_CURVED_POWER_LAW,
@@ -172,7 +172,7 @@ fn shapelet_off_zenith_gpu() {
 fn point_zenith_gpu_fee() {
     test_modelling!(
         false,
-        SkyModellerCuda::model_points,
+        SkyModellerGpu::model_points,
         &POINT_ZENITH_LIST,
         &POINT_ZENITH_POWER_LAW,
         &POINT_ZENITH_CURVED_POWER_LAW,
@@ -186,7 +186,7 @@ fn point_zenith_gpu_fee() {
 fn point_off_zenith_gpu_fee() {
     test_modelling!(
         false,
-        SkyModellerCuda::model_points,
+        SkyModellerGpu::model_points,
         &POINT_OFF_ZENITH_LIST,
         &POINT_OFF_ZENITH_POWER_LAW,
         &POINT_OFF_ZENITH_CURVED_POWER_LAW,
@@ -200,7 +200,7 @@ fn point_off_zenith_gpu_fee() {
 fn gaussian_zenith_gpu_fee() {
     test_modelling!(
         false,
-        SkyModellerCuda::model_gaussians,
+        SkyModellerGpu::model_gaussians,
         &GAUSSIAN_ZENITH_LIST,
         &GAUSSIAN_ZENITH_POWER_LAW,
         &GAUSSIAN_ZENITH_CURVED_POWER_LAW,
@@ -214,7 +214,7 @@ fn gaussian_zenith_gpu_fee() {
 fn gaussian_off_zenith_gpu_fee() {
     test_modelling!(
         false,
-        SkyModellerCuda::model_gaussians,
+        SkyModellerGpu::model_gaussians,
         &GAUSSIAN_OFF_ZENITH_LIST,
         &GAUSSIAN_OFF_ZENITH_POWER_LAW,
         &GAUSSIAN_OFF_ZENITH_CURVED_POWER_LAW,
@@ -228,7 +228,7 @@ fn gaussian_off_zenith_gpu_fee() {
 fn shapelet_zenith_gpu_fee() {
     test_modelling!(
         false,
-        SkyModellerCuda::model_shapelets,
+        SkyModellerGpu::model_shapelets,
         &SHAPELET_ZENITH_LIST,
         &SHAPELET_ZENITH_POWER_LAW,
         &SHAPELET_ZENITH_CURVED_POWER_LAW,
@@ -242,7 +242,7 @@ fn shapelet_zenith_gpu_fee() {
 fn shapelet_off_zenith_gpu_fee() {
     test_modelling!(
         false,
-        SkyModellerCuda::model_shapelets,
+        SkyModellerGpu::model_shapelets,
         &SHAPELET_OFF_ZENITH_LIST,
         &SHAPELET_OFF_ZENITH_POWER_LAW,
         &SHAPELET_OFF_ZENITH_CURVED_POWER_LAW,
@@ -256,7 +256,7 @@ fn shapelet_off_zenith_gpu_fee() {
 fn non_trivial_gaussian() {
     test_modelling!(
         false,
-        SkyModellerCuda::model_gaussians,
+        SkyModellerGpu::model_gaussians,
         &SourceList::from([(
             "list".to_string(),
             Source {
@@ -497,9 +497,9 @@ fn gaussian_multiple_components() {
         .copy_from_device(visibilities.as_slice_mut().unwrap())
         .unwrap();
 
-    #[cfg(not(feature = "cuda-single"))]
+    #[cfg(not(feature = "gpu-single"))]
     test_multiple_gaussian_components(visibilities.view(), 0.0);
-    #[cfg(feature = "cuda-single")]
+    #[cfg(feature = "gpu-single")]
     test_multiple_gaussian_components(visibilities.view(), 5e-7);
 }
 
@@ -543,15 +543,15 @@ fn shapelet_multiple_components() {
     let shapelet_uvs = modeller
         .get_shapelet_uvs(obs.lst)
         .list
-        .map(|&cuda::ShapeletUV { u, v }| UVW {
-            u: CudaFloat::into(u),
-            v: CudaFloat::into(v),
+        .map(|&gpu::ShapeletUV { u, v }| UVW {
+            u: GpuFloat::into(u),
+            v: GpuFloat::into(v),
             w: 0.0,
         });
 
-    #[cfg(not(feature = "cuda-single"))]
+    #[cfg(not(feature = "gpu-single"))]
     test_multiple_shapelet_components(visibilities.view(), shapelet_uvs.view(), 0.0, 0.0);
-    #[cfg(feature = "cuda-single")]
+    #[cfg(feature = "gpu-single")]
     test_multiple_shapelet_components(visibilities.view(), shapelet_uvs.view(), 5e-7, 3e-8);
 }
 
@@ -584,7 +584,7 @@ fn test_curved_power_law_changing_ref_freq() {
     obs.freqs.clear();
     obs.freqs.push(150e6);
     let (modeller, _) = obs.get_gpu_modeller(&srclist);
-    let mut modeller_fds = [crate::cuda::CudaJones::default(); 1];
+    let mut modeller_fds = [crate::gpu::GpuJones::default(); 1];
     let mut modeller_sis = [0.0; 1];
     modeller
         .point_curved_power_law_fds
diff --git a/src/model/tests/mod.rs b/src/model/tests/mod.rs
index f63303b7..fd21645f 100644
--- a/src/model/tests/mod.rs
+++ b/src/model/tests/mod.rs
@@ -11,8 +11,8 @@
 //! highlighted.
 
 mod cpu;
-#[cfg(feature = "cuda")]
-mod cuda;
+#[cfg(any(feature = "cuda", feature = "hip"))]
+mod gpu;
 
 use approx::assert_abs_diff_eq;
 use marlu::{
@@ -25,10 +25,10 @@ use num_complex::Complex;
 use vec1::vec1;
 
 use super::*;
-#[cfg(feature = "cuda")]
-use crate::cuda::DevicePointer;
-#[cfg(feature = "cuda")]
-use crate::model::cuda::SkyModellerCuda;
+#[cfg(any(feature = "cuda", feature = "hip"))]
+use crate::gpu::DevicePointer;
+#[cfg(any(feature = "cuda", feature = "hip"))]
+use crate::model::gpu::SkyModellerGpu;
 use crate::{
     beam::{create_beam_object, Delays},
     srclist::{
@@ -313,13 +313,13 @@ impl ObsParams {
         )
     }
 
-    #[cfg(feature = "cuda")]
+    #[cfg(any(feature = "cuda", feature = "hip"))]
     #[track_caller]
     fn get_gpu_modeller(
         &self,
         srclist: &SourceList,
-    ) -> (SkyModellerCuda, DevicePointer<crate::cuda::UVW>) {
-        let m = SkyModellerCuda::new(
+    ) -> (SkyModellerGpu, DevicePointer<crate::gpu::UVW>) {
+        let m = SkyModellerGpu::new(
             &*self.beam,
             srclist,
             Polarisations::default(),
@@ -333,16 +333,16 @@ impl ObsParams {
             true,
         )
         .unwrap();
-        let cuda_uvws = self
+        let gpu_uvws = self
             .uvws
             .iter()
-            .map(|&uvw| crate::cuda::UVW {
-                u: uvw.u as crate::cuda::CudaFloat,
-                v: uvw.v as crate::cuda::CudaFloat,
-                w: uvw.w as crate::cuda::CudaFloat,
+            .map(|&uvw| crate::gpu::UVW {
+                u: uvw.u as crate::gpu::GpuFloat,
+                v: uvw.v as crate::gpu::GpuFloat,
+                w: uvw.w as crate::gpu::GpuFloat,
             })
             .collect::<Vec<_>>();
-        let d_uvws = DevicePointer::copy_to_device(&cuda_uvws).unwrap();
+        let d_uvws = DevicePointer::copy_to_device(&gpu_uvws).unwrap();
         (m, d_uvws)
     }
 }
diff --git a/src/params/vis_subtract.rs b/src/params/vis_subtract.rs
index 56d4be13..30b05665 100644
--- a/src/params/vis_subtract.rs
+++ b/src/params/vis_subtract.rs
@@ -345,7 +345,7 @@ pub(crate) enum VisSubtractError {
     #[error(transparent)]
     IO(#[from] std::io::Error),
 
-    #[cfg(feature = "cuda")]
+    #[cfg(any(feature = "cuda", feature = "hip"))]
     #[error(transparent)]
-    Cuda(#[from] crate::cuda::CudaError),
+    Gpu(#[from] crate::gpu::GpuError),
 }
diff --git a/src/solutions/mod.rs b/src/solutions/mod.rs
index c0b57c69..f9d1e1f5 100644
--- a/src/solutions/mod.rs
+++ b/src/solutions/mod.rs
@@ -133,8 +133,7 @@ pub struct CalibrationSolutions {
     /// specified in lambdas to metres \[Hz\].
     pub freq_centroid: Option<f64>,
 
-    /// What was used to model the visibilities? This is currently either
-    /// "CPU" or "CUDA GPU".
+    /// What was used to model the visibilities?
     pub modeller: Option<String>,
 }