From f4718dc4bc407173c93c57a98d520310b3e6c785 Mon Sep 17 00:00:00 2001 From: "Christopher H. Jordan" Date: Sun, 11 Jun 2023 21:54:31 +0800 Subject: [PATCH] HIP support. --- .github/workflows/releases.yml | 2 +- .github/workflows/run-tests.yml | 5 +- CHANGELOG.md | 2 + Cargo.lock | 20 ++ Cargo.toml | 19 +- benches/bench.rs | 12 +- build.rs | 240 +++++++------ mdbook/src/installation/from_source.md | 41 ++- mdbook/src/installation/pre_compiled.md | 9 +- mdbook/src/user/beam.md | 6 +- src/beam/error.rs | 4 +- src/beam/fee.rs | 28 +- src/beam/mod.rs | 66 ++-- src/beam/tests.rs | 12 +- src/cli/beam.rs | 38 +- src/cli/common/mod.rs | 33 +- src/cli/di_calibrate/error.rs | 177 --------- src/cli/di_calibrate/tests.rs | 16 +- src/cli/error.rs | 18 +- src/cli/vis_simulate/tests.rs | 10 +- src/cuda/utils.cu | 40 --- src/{cuda => gpu}/common.cuh | 124 ++++--- src/{cuda => gpu}/compile_flags.txt | 1 + src/{cuda => gpu}/mod.rs | 231 ++++++++---- src/{cuda => gpu}/model.cu | 54 ++- src/{cuda => gpu}/model.h | 0 src/{cuda => gpu}/model_double.rs | 0 src/{cuda => gpu}/model_single.rs | 0 src/{cuda => gpu}/tests.rs | 17 +- src/{cuda => gpu}/types.h | 0 src/{cuda => gpu}/types_double.rs | 0 src/{cuda => gpu}/types_single.rs | 0 src/{cuda => gpu}/update_rust_bindings.sh | 6 +- src/gpu/utils.cu | 68 ++++ src/{cuda => gpu}/utils.h | 6 +- src/{cuda => gpu}/utils.rs | 69 ++-- src/{cuda => gpu}/utils_bindings.rs | 2 +- src/lib.rs | 8 +- src/model/error.rs | 4 +- src/model/{cuda.rs => gpu.rs} | 416 +++++++++++----------- src/model/mod.rs | 61 ++-- src/model/tests/cpu.rs | 4 +- src/model/tests/{cuda.rs => gpu.rs} | 50 +-- src/model/tests/mod.rs | 30 +- src/params/vis_subtract.rs | 4 +- src/solutions/mod.rs | 3 +- 46 files changed, 1036 insertions(+), 920 deletions(-) delete mode 100644 src/cli/di_calibrate/error.rs delete mode 100644 src/cuda/utils.cu rename src/{cuda => gpu}/common.cuh (63%) rename src/{cuda => gpu}/compile_flags.txt (93%) rename src/{cuda => gpu}/mod.rs (57%) rename src/{cuda => gpu}/model.cu (95%) rename src/{cuda => gpu}/model.h (100%) rename src/{cuda => gpu}/model_double.rs (100%) rename src/{cuda => gpu}/model_single.rs (100%) rename src/{cuda => gpu}/tests.rs (78%) rename src/{cuda => gpu}/types.h (100%) rename src/{cuda => gpu}/types_double.rs (100%) rename src/{cuda => gpu}/types_single.rs (100%) rename src/{cuda => gpu}/update_rust_bindings.sh (90%) create mode 100644 src/gpu/utils.cu rename src/{cuda => gpu}/utils.h (77%) rename src/{cuda => gpu}/utils.rs (52%) rename src/{cuda => gpu}/utils_bindings.rs (96%) rename src/model/{cuda.rs => gpu.rs} (75%) rename src/model/tests/{cuda.rs => gpu.rs} (94%) diff --git a/.github/workflows/releases.yml b/.github/workflows/releases.yml index b3dd9770..fae5853d 100644 --- a/.github/workflows/releases.yml +++ b/.github/workflows/releases.yml @@ -67,7 +67,7 @@ jobs: LICENSE COPYING-hdf5 LICENSE-erfa LICENSE-cfitsio LICENSE-NVIDIA README.md \ hyperdrive - cargo build --profile production --locked --no-default-features --features=hdf5-static,cfitsio-static,cuda-single + cargo build --profile production --locked --no-default-features --features=hdf5-static,cfitsio-static,cuda,gpu-single mv target/production/hyperdrive . tar -acvf mwa_hyperdrive-$(git describe --tags)-Linux-x86-64-v3-CUDA-single.tar.gz \ LICENSE COPYING-hdf5 LICENSE-erfa LICENSE-cfitsio LICENSE-NVIDIA README.md \ diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 7fdea7e6..c92bb76b 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -80,4 +80,7 @@ jobs: cargo clean cargo +${MIN_RUST} test --locked cargo +${MIN_RUST} test --locked --features=all-static - cargo +${MIN_RUST} test --locked --all-features --no-run + # Can't test with --all-features, cuda and hip aren't allowed together + # hip is also difficult to install so ignore it + cargo +${MIN_RUST} test --locked --features=all-static,cuda --no-run + cargo +${MIN_RUST} test --locked --features=all-static,cuda,gpu-single --no-run diff --git a/CHANGELOG.md b/CHANGELOG.md index 50bf0f90..207b5895 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ Versioning](https://semver.org/spec/v2.0.0.html). ## [0.3.0] - Unreleased ### Added +- Support for HIP, which allows AMD GPUs to be used instead of only NVIDIA GPUs + via CUDA. - Support for the "DipAmps" column in a metafits file. This allows users to control dipole gains in beam code. - Support for averaging incoming visibilities in time and frequency *before* diff --git a/Cargo.lock b/Cargo.lock index 9b497d45..b7845b1b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1078,6 +1078,24 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "hip-runtime-sys" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "901a5d54cfff799dd9e6f6e1d53883bb50afdd92edce7680b1ca299d1805f65a" +dependencies = [ + "libc", +] + +[[package]] +name = "hip-sys" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f399629f98b6249efc10039e949baa70e2ff2bf84d3c30e7a0ca49f179a04287" +dependencies = [ + "hip-runtime-sys", +] + [[package]] name = "humantime" version = "2.1.0" @@ -1456,6 +1474,7 @@ dependencies = [ "cuda-runtime-sys", "hdf5", "hdf5-sys", + "hip-sys", "marlu", "ndarray", "num-complex", @@ -1490,6 +1509,7 @@ dependencies = [ "flate2", "glob", "hifitime", + "hip-sys", "indexmap 1.9.3", "indicatif", "indoc", diff --git a/Cargo.toml b/Cargo.toml index 613b91a0..42ab3ed6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,13 +30,14 @@ inherits = "production" default = ["plotting"] # Use CUDA code with double-precision floats. -cuda = ["mwa_hyperbeam/cuda", "cuda-runtime-sys"] +cuda = ["mwa_hyperbeam/cuda", "cuda-runtime-sys", "cc"] -# Use CUDA code with single-precision floats. -cuda-single = [ - "cuda", - "mwa_hyperbeam/gpu-single", -] +# Use HIP code with double-precision floats. +hip = ["mwa_hyperbeam/hip", "hip-sys", "cc"] + +# Opt-out of GPU double precision, use only single precision (faster on desktop +# GPUs). +gpu-single = ["mwa_hyperbeam/gpu-single"] # Enable plotting. plotting = ["plotters"] @@ -100,6 +101,9 @@ vec1 = { version = "1.5.0", features = ["serde"] } # "cuda" feature cuda-runtime-sys = { version = "0.3.0-alpha.1", optional = true } +# "hip" feature +hip-sys = { version = "0.1.0", optional = true } + # "plotting" feature plotters = { version = "0.3.5", default-features = false, features = [ "bitmap_backend", @@ -121,7 +125,8 @@ tempfile = "3.6.0" [build-dependencies] built = { version = "0.6.0", features = ["chrono", "git2"] } -cc = { version = "1.0.72", features = ["parallel"] } +cc = { version = "1.0.72", features = ["parallel"], optional = true } +hip-sys = { version = "0.1.0", optional = true } [[bench]] name = "bench" diff --git a/benches/bench.rs b/benches/bench.rs index d10f7c88..0320f27c 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -92,7 +92,7 @@ fn model_benchmarks(c: &mut Criterion) { ); } - #[cfg(feature = "cuda")] + #[cfg(any(feature = "cuda", feature = "hip"))] for (num_sources, num_chans) in [ (1, 768), (32, 768), @@ -130,7 +130,7 @@ fn model_benchmarks(c: &mut Criterion) { }, ); } - let modeller = model::SkyModellerCuda::new( + let modeller = model::SkyModellerGpu::new( &*beam, &source_list, Polarisations::default(), @@ -209,7 +209,7 @@ fn model_benchmarks(c: &mut Criterion) { }) }); - #[cfg(feature = "cuda")] + #[cfg(any(feature = "cuda", feature = "hip"))] for (num_sources, num_chans) in [ (1, 768), (32, 768), @@ -251,7 +251,7 @@ fn model_benchmarks(c: &mut Criterion) { }, ); } - let modeller = model::SkyModellerCuda::new( + let modeller = model::SkyModellerGpu::new( &*beam, &source_list, Polarisations::default(), @@ -342,7 +342,7 @@ fn model_benchmarks(c: &mut Criterion) { }, ); - #[cfg(feature = "cuda")] + #[cfg(any(feature = "cuda", feature = "hip"))] for (num_sources, num_chans) in [ (1, 768), (32, 768), @@ -393,7 +393,7 @@ fn model_benchmarks(c: &mut Criterion) { }, ); } - let modeller = model::SkyModellerCuda::new( + let modeller = model::SkyModellerGpu::new( &*beam, &source_list, Polarisations::default(), diff --git a/build.rs b/build.rs index c9b3a4e9..332a1a2e 100644 --- a/build.rs +++ b/build.rs @@ -28,140 +28,168 @@ fn write_built(out_dir: &Path) { fn main() { println!("cargo:rerun-if-changed=build.rs"); + #[cfg(all(feature = "cuda", feature = "hip"))] + compile_error!("Both 'cuda' and 'hip' features are enabled; only one can be used."); + #[cfg(all(not(feature = "cuda"), not(feature = "hip"), feature = "gpu-single"))] + compile_error!( + "The 'gpu-single' feature must be used with either of the 'cuda' or 'hip' features." + ); + let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR env. variable not defined!")); write_built(&out_dir); - #[cfg(feature = "cuda")] - cuda::build_and_link(); + #[cfg(any(feature = "cuda", feature = "hip"))] + gpu::build_and_link(); } -#[cfg(feature = "cuda")] -mod cuda { - use std::env; - - const DEFAULT_CUDA_ARCHES: &[u16] = &[60, 70, 80]; - const DEFAULT_CUDA_SMS: &[u16] = &[60, 70, 75, 80, 86]; - - fn parse_and_validate_compute(c: &str, var: &str) -> Vec { - let mut out = vec![]; - for compute in c.trim().split(',') { - // Check that there's only two numeric characters. - if compute.len() != 2 { - panic!("When parsing {var}, found '{compute}', which is not a two-digit number!") +#[cfg(any(feature = "cuda", feature = "hip"))] +mod gpu { + use std::{env, path::PathBuf}; + + /// Search for any C/C++/CUDA/HIP files, populate the provided buffer with + /// them, and have rerun-if-changed on all of them. + #[cfg(any(feature = "cuda", feature = "hip"))] + fn get_gpu_files>(dir: P, files: &mut Vec) { + for path in std::fs::read_dir(dir).expect("dir exists") { + let path = path.expect("is readable").path(); + if path.is_dir() { + get_gpu_files(&path, files) } - match compute.parse() { - Ok(p) => out.push(p), - Err(_) => panic!("'{compute}', part of {var}, couldn't be parsed into a number!"), + match path.extension().and_then(|os_str| os_str.to_str()) { + Some("cu") => { + println!("cargo:rerun-if-changed={}", path.display()); + files.push(path); + } + Some("h" | "cuh") => println!("cargo:rerun-if-changed={}", path.display()), + _ => (), } } - out } pub(super) fn build_and_link() { - // Attempt to read HYPERDRIVE_CUDA_COMPUTE. HYPERBEAM_CUDA_COMPUTE can be - // used instead, too. - println!("cargo:rerun-if-env-changed=HYPERDRIVE_CUDA_COMPUTE"); - println!("cargo:rerun-if-env-changed=HYPERBEAM_CUDA_COMPUTE"); - let (arches, sms) = match ( - env::var("HYPERDRIVE_CUDA_COMPUTE"), - env::var("HYPERBEAM_CUDA_COMPUTE"), - ) { - // When a user-supplied variable exists, use it as the CUDA arch and - // compute level. - (Ok(c), _) | (Err(_), Ok(c)) => { - let compute = parse_and_validate_compute(&c, "HYPERDRIVE_CUDA_COMPUTE"); - let sms = compute.clone(); - (compute, sms) - } - (Err(_), Err(_)) => { - // Print out all of the default arches and computes as a - // warning. - println!("cargo:warning=No HYPERDRIVE_CUDA_COMPUTE; Passing arch=compute_{DEFAULT_CUDA_ARCHES:?} and code=sm_{DEFAULT_CUDA_SMS:?} to nvcc"); - (DEFAULT_CUDA_ARCHES.to_vec(), DEFAULT_CUDA_SMS.to_vec()) - } - }; - - // Compile all CUDA source files into a single library. Find .cu, .h and - // .cuh files; if any of them change, tell cargo to recompile. - let mut cuda_files = vec![]; - for entry in std::fs::read_dir("src/cuda").expect("src/cuda directory doesn't exist!") { - let entry = entry.expect("Couldn't access file in src/cuda directory"); - let path = entry.path(); - // Skip this entry if it isn't a file. - if !path.is_file() { - continue; + let mut gpu_files = vec![]; + get_gpu_files("src/gpu", &mut gpu_files); + + #[cfg(feature = "cuda")] + let mut gpu_target = { + const DEFAULT_CUDA_ARCHES: &[u16] = &[60, 70, 80]; + const DEFAULT_CUDA_SMS: &[u16] = &[60, 70, 75, 80, 86]; + + fn parse_and_validate_compute(c: &str, var: &str) -> Vec { + let mut out = vec![]; + for compute in c.trim().split(',') { + // Check that there's only two numeric characters. + if compute.len() != 2 { + panic!("When parsing {var}, found '{compute}', which is not a two-digit number!") + } + + match compute.parse() { + Ok(p) => out.push(p), + Err(_) => { + panic!("'{compute}', part of {var}, couldn't be parsed into a number!") + } + } + } + out } - match path.extension().and_then(|os_str| os_str.to_str()) { - // Track this file if it's extension is .cu - Some("cu") => { - println!("cargo:rerun-if-changed={}", path.display()); - // Add this .cu file to be compiled later. - cuda_files.push(path); + // Attempt to read HYPERDRIVE_CUDA_COMPUTE. HYPERBEAM_CUDA_COMPUTE can be + // used instead, too. + println!("cargo:rerun-if-env-changed=HYPERDRIVE_CUDA_COMPUTE"); + println!("cargo:rerun-if-env-changed=HYPERBEAM_CUDA_COMPUTE"); + let (arches, sms) = match ( + env::var("HYPERDRIVE_CUDA_COMPUTE"), + env::var("HYPERBEAM_CUDA_COMPUTE"), + ) { + // When a user-supplied variable exists, use it as the CUDA arch and + // compute level. + (Ok(c), _) | (Err(_), Ok(c)) => { + let compute = parse_and_validate_compute(&c, "HYPERDRIVE_CUDA_COMPUTE"); + let sms = compute.clone(); + (compute, sms) } - - Some("h" | "cuh") => { - println!("cargo:rerun-if-changed={}", path.display()); + (Err(_), Err(_)) => { + // Print out all of the default arches and computes as a + // warning. + println!("cargo:warning=No HYPERDRIVE_CUDA_COMPUTE; Passing arch=compute_{DEFAULT_CUDA_ARCHES:?} and code=sm_{DEFAULT_CUDA_SMS:?} to nvcc"); + (DEFAULT_CUDA_ARCHES.to_vec(), DEFAULT_CUDA_SMS.to_vec()) + } + }; + + let mut cuda_target = cc::Build::new(); + cuda_target.cuda(true).cudart("shared"); // We handle linking cudart statically + + // If $CXX is not set but $CUDA_PATH is, search for + // $CUDA_PATH/bin/g++ and if it exists, set that as $CXX. + if env::var_os("CXX").is_none() { + // Unlike above, we care about $CUDA_PATH being unicode. + if let Ok(cuda_path) = env::var("CUDA_PATH") { + // Look for the g++ that CUDA wants. + let compiler = std::path::PathBuf::from(cuda_path).join("bin/g++"); + if compiler.exists() { + println!("cargo:warning=Setting $CXX to {}", compiler.display()); + env::set_var("CXX", compiler.into_os_string()); + } } - - _ => (), } - } - let mut cuda_target = cc::Build::new(); - cuda_target - .cuda(true) - .cudart("shared") // We handle linking cudart statically - .define( - // The DEBUG env. variable is set by cargo. If running "cargo build - // --release", DEBUG is "false", otherwise "true". C/C++/CUDA like - // the compile option "NDEBUG" to be defined when using assert.h, so - // if appropriate, define that here. We also define "DEBUG" so that - // can be used. - match env::var("DEBUG").as_deref() { - Ok("false") => "NDEBUG", - _ => "DEBUG", - }, - None, - ); - // If $CXX is not set but $CUDA_PATH is, search for $CUDA_PATH/bin/g++ - // and if it exists, set that as $CXX. - if env::var_os("CXX").is_none() { - // Unlike above, we care about $CUDA_PATH being unicode. - if let Ok(cuda_path) = env::var("CUDA_PATH") { - // Look for the g++ that CUDA wants. - let compiler = std::path::PathBuf::from(cuda_path).join("bin/g++"); - if compiler.exists() { - println!("cargo:warning=Setting $CXX to {}", compiler.display()); - env::set_var("CXX", compiler.into_os_string()); + // Loop over each arch and sm + for arch in arches { + for &sm in &sms { + if sm < arch { + continue; + } + + cuda_target.flag("-gencode"); + cuda_target.flag(&format!("arch=compute_{arch},code=sm_{sm}")); } } - } - // Loop over each arch and sm - for arch in arches { - for &sm in &sms { - if sm < arch { - continue; - } + cuda_target + }; - cuda_target.flag("-gencode"); - cuda_target.flag(&format!("arch=compute_{arch},code=sm_{sm}")); + #[cfg(feature = "hip")] + let mut gpu_target = { + let hip_path = hip_sys::hiprt::get_hip_path(); + + // It seems that various ROCm releases change where hipcc is... + let mut compiler = hip_path.join("bin/hipcc"); + if !compiler.exists() { + // Try the dir above, which might be the ROCm dir. + compiler = hip_path.join("../bin/hipcc"); } - } + let mut hip_target = cc::Build::new(); + hip_target + .compiler(compiler) + .include(hip_path.join("include/hip")); + hip_target + }; - // If we're told to, use single-precision floats. The default in the CUDA + gpu_target.define( + // The DEBUG env. variable is set by cargo. If running "cargo build + // --release", DEBUG is "false", otherwise "true". C/C++/CUDA like + // the compile option "NDEBUG" to be defined when using assert.h, so + // if appropriate, define that here. We also define "DEBUG" so that + // can be used. + match env::var("DEBUG").as_deref() { + Ok("false") => "NDEBUG", + _ => "DEBUG", + }, + None, + ); + + // If we're told to, use single-precision floats. The default in the GPU // code is to use double-precision. - #[cfg(feature = "cuda-single")] - cuda_target.define("SINGLE", None); + #[cfg(feature = "gpu-single")] + gpu_target.define("SINGLE", None); // Break in case of emergency. - // cuda_target.debug(true); + // gpu_target.debug(true); - for f in cuda_files { - cuda_target.file(f); + for f in gpu_files { + gpu_target.file(f); } - cuda_target.compile("hyperdrive_cu"); + gpu_target.compile("hyperdrive_gpu"); } } diff --git a/mdbook/src/installation/from_source.md b/mdbook/src/installation/from_source.md index 415438aa..97d6faca 100644 --- a/mdbook/src/installation/from_source.md +++ b/mdbook/src/installation/from_source.md @@ -35,8 +35,8 @@ time. ``` -```admonish tip title="CUDA (for accelerated sky modelling)" -- Only required if either the `cuda` or `cuda-single` feature is enabled +```admonish tip title="CUDA (for accelerated sky modelling with NVIDIA GPUs)" +- Only required if the `cuda` feature is enabled - Requires a [CUDA-capable device](https://developer.nvidia.com/cuda-gpus) - Arch: `cuda` - Ubuntu and others: [Download link](https://developer.nvidia.com/cuda-zone) @@ -45,6 +45,20 @@ - Can link statically; use the `cuda-static` or `all-static` features. ``` +```admonish tip title="HIP (for accelerated sky modelling with AMD GPUs)" +- Only required if either the `hip` feature is enabled +- Requires a [HIP-capable device](https://docs.amd.com/en/latest/release/gpu_os_support.html) (N.B. This seems to be incomplete) +- Arch: + - See [https://wiki.archlinux.org/title/GPGPU#ROCm](https://wiki.archlinux.org/title/GPGPU#ROCm) + - It is possible to get pre-compiled products from the [arch4edu repo](https://github.com/arch4edu/arch4edu). +- Ubuntu and others: [Download link](https://docs.amd.com/projects/HIP/en/docs-5.3.0/how_to_guides/install.html) +- The installation dir can be specified manually with `HIP_PATH` + - If not specified, `/opt/rocm/hip` is used. +- N.B. Despite HIP installations being able to run HIP code on NVIDIA GPUs, + this is not supported by `hyperdrive`; please compile with the CUDA + instructions above. +``` + ## Installing Rust ~~~admonish tip title="TL;DR" @@ -103,7 +117,7 @@ export HYPERDRIVE_CUDA_COMPUTE=75 Now you can compile `hyperdrive` with CUDA enabled (single-precision floats): ```shell -cargo install --path . --locked --features=cuda-single +cargo install --path . --locked --features=cuda,gpu-single ``` If you're using "datacentre" products (e.g. a V100 available on the @@ -123,6 +137,27 @@ compiler happy. You can select a custom C++ compiler with the `CXX` variable, e.g. `CXX=/opt/cuda/bin/g++`. ~~~ +~~~admonish danger title="HIP" +Do you have a HIP-capable AMD GPU? Ensure you have installed HIP (instructions +are above), and compile with the `hip` feature (single-precision floats): + +```shell +cargo install --path . --locked --features=hip,gpu-single +``` + +If you're using "datacentre" products (e.g. the GPUs on the "setonix" +supercomputer), you probably want double-precision floats: + +```shell +cargo install --path . --locked --features=hip +``` + +You can still compile with double-precision on a desktop GPU, but it will be +much slower than single-precision. + +If you are encountering problems, you may need to set your `HIP_PATH` variable. +~~~ + ~~~admonish tip title="Static dependencies" The aforementioned C libraries can each be compiled by `cargo`. `all-static` will statically-link all dependencies (including CUDA, if CUDA is enabled) such diff --git a/mdbook/src/installation/pre_compiled.md b/mdbook/src/installation/pre_compiled.md index 8aa22f24..24aee048 100644 --- a/mdbook/src/installation/pre_compiled.md +++ b/mdbook/src/installation/pre_compiled.md @@ -14,6 +14,11 @@ the "CUDA-single" release. You can still use the double-precision version on a desktop GPU, but it will be much slower than single-precision. Instructions to install CUDA are on [the next page](from_source.md#cuda). +It is possible to run `hyperdrive` with HIP (i.e. the AMD equivalent to +NVIDIA's CUDA), but HIP does not appear to offer static libraries, so no static +feature is provided, and users will need to compile hyperdrive themselves with +instructions on [the next page](from_source.md#gpu). + ~~~admonish The pre-compiled binaries are made by GitHub actions using: ```shell @@ -21,6 +26,6 @@ cargo build --release --locked --no-default-features --features=hdf5-static,cfit ``` This means they cannot plot calibration solutions. "CUDA-double" binaries have the `cuda` feature and "CUDA-single" binaries have -the `cuda-single` feature. CUDA cannot legally be statically linked so a local -installation of CUDA is required. +the `cuda` and `gpu-single` features. CUDA cannot legally be statically linked +so a local installation of CUDA is required. ~~~ diff --git a/mdbook/src/user/beam.md b/mdbook/src/user/beam.md index ab0d8a92..31c8ba83 100644 --- a/mdbook/src/user/beam.md +++ b/mdbook/src/user/beam.md @@ -9,9 +9,9 @@ in steps of `--step`, then for each of these zenith angles, moving from 0 to \\( 2 \pi \\) in steps of `--step` for the azimuth. Using a smaller `--step` will generate many more responses, so be aware that it might take a while. -~~~admonish danger title="CUDA" -If CUDA is available to you, the `--cuda` flag will generate the beam responses -on the GPU, vastly decreasing the time taken. +~~~admonish danger title="CUDA/HIP" +If CUDA or HIP is available to you, the `--gpu` flag will generate the beam +responses on the GPU, vastly decreasing the time taken. ~~~ ~~~admonish example title="`Python` example to plot beam responses" diff --git a/src/beam/error.rs b/src/beam/error.rs index 36df727f..aeba9a45 100644 --- a/src/beam/error.rs +++ b/src/beam/error.rs @@ -41,7 +41,7 @@ pub enum BeamError { #[error("hyperbeam init error: {0}")] HyperbeamInit(#[from] mwa_hyperbeam::fee::InitFEEBeamError), - #[cfg(feature = "cuda")] + #[cfg(any(feature = "cuda", feature = "hip"))] #[error(transparent)] - Cuda(#[from] crate::cuda::CudaError), + Gpu(#[from] crate::gpu::GpuError), } diff --git a/src/beam/fee.rs b/src/beam/fee.rs index 64c32d55..374ac3c0 100644 --- a/src/beam/fee.rs +++ b/src/beam/fee.rs @@ -12,8 +12,8 @@ use ndarray::prelude::*; use super::{Beam, BeamError, BeamType, Delays}; -#[cfg(feature = "cuda")] -use super::{BeamCUDA, CudaFloat, DevicePointer}; +#[cfg(any(feature = "cuda", feature = "hip"))] +use super::{BeamGpu, DevicePointer, GpuFloat}; /// A wrapper of the `FEEBeam` struct in hyperbeam that implements the [`Beam`] /// trait. @@ -301,9 +301,9 @@ impl Beam for FEEBeam { self.hyperbeam_object.empty_cache(); } - #[cfg(feature = "cuda")] - fn prepare_cuda_beam(&self, freqs_hz: &[u32]) -> Result, BeamError> { - let cuda_beam = unsafe { + #[cfg(any(feature = "cuda", feature = "hip"))] + fn prepare_gpu_beam(&self, freqs_hz: &[u32]) -> Result, BeamError> { + let gpu_beam = unsafe { self.hyperbeam_object.gpu_prepare( freqs_hz, self.delays.view(), @@ -311,29 +311,29 @@ impl Beam for FEEBeam { true, )? }; - Ok(Box::new(FEEBeamCUDA { - hyperbeam_object: cuda_beam, + Ok(Box::new(FEEBeamGpu { + hyperbeam_object: gpu_beam, })) } } -#[cfg(feature = "cuda")] -pub(crate) struct FEEBeamCUDA { +#[cfg(any(feature = "cuda", feature = "hip"))] +struct FEEBeamGpu { hyperbeam_object: mwa_hyperbeam::fee::FEEBeamGpu, } -#[cfg(feature = "cuda")] -impl BeamCUDA for FEEBeamCUDA { +#[cfg(any(feature = "cuda", feature = "hip"))] +impl BeamGpu for FEEBeamGpu { unsafe fn calc_jones_pair( &self, - az_rad: &[CudaFloat], - za_rad: &[CudaFloat], + az_rad: &[GpuFloat], + za_rad: &[GpuFloat], latitude_rad: f64, d_jones: *mut std::ffi::c_void, ) -> Result<(), BeamError> { let d_az_rad = DevicePointer::copy_to_device(az_rad)?; let d_za_rad = DevicePointer::copy_to_device(za_rad)?; - let d_array_latitude_rad = DevicePointer::copy_to_device(&[latitude_rad as CudaFloat])?; + let d_array_latitude_rad = DevicePointer::copy_to_device(&[latitude_rad as GpuFloat])?; self.hyperbeam_object.calc_jones_device_pair_inner( d_az_rad.get(), d_za_rad.get(), diff --git a/src/beam/mod.rs b/src/beam/mod.rs index 6be20284..8029d591 100644 --- a/src/beam/mod.rs +++ b/src/beam/mod.rs @@ -28,8 +28,8 @@ use marlu::{AzEl, Jones}; use ndarray::prelude::*; use strum::IntoEnumIterator; -#[cfg(feature = "cuda")] -use crate::cuda::{CudaFloat, DevicePointer}; +#[cfg(any(feature = "cuda", feature = "hip"))] +use crate::gpu::{DevicePointer, GpuFloat}; /// Supported beam types. #[derive( @@ -135,33 +135,33 @@ pub trait Beam: Sync + Send { /// If this [`Beam`] supports it, empty the coefficient cache. fn empty_coeff_cache(&self); - #[cfg(feature = "cuda")] + #[cfg(any(feature = "cuda", feature = "hip"))] /// Using the tile information from this [`Beam`] and frequencies to be - /// used, return a [`BeamCUDA`]. This object only needs frequencies to + /// used, return a [`BeamGpu`]. This object only needs frequencies to /// calculate beam response [`Jones`] matrices. - fn prepare_cuda_beam(&self, freqs_hz: &[u32]) -> Result, BeamError>; + fn prepare_gpu_beam(&self, freqs_hz: &[u32]) -> Result, BeamError>; } -/// A trait abstracting beam code functions on a CUDA-capable device. -#[cfg(feature = "cuda")] -pub trait BeamCUDA { +/// A trait abstracting beam code functions on a GPU. +#[cfg(any(feature = "cuda", feature = "hip"))] +pub trait BeamGpu { /// Calculate the Jones matrices for each `az` and `za` direction and /// frequency (these were defined when the [`BeamCUDA`] was created). The /// results are ordered tile, frequency, direction, slowest to fastest. /// /// # Safety /// - /// This function interfaces directly with the CUDA API. Rust errors attempt - /// to catch problems but there are no guarantees. + /// This function interfaces directly with the CUDA/HIP API. Rust errors + /// attempt to catch problems but there are no guarantees. unsafe fn calc_jones_pair( &self, - az_rad: &[CudaFloat], - za_rad: &[CudaFloat], + az_rad: &[GpuFloat], + za_rad: &[GpuFloat], latitude_rad: f64, d_jones: *mut std::ffi::c_void, ) -> Result<(), BeamError>; - /// Get the type of beam used to create this [`BeamCUDA`]. + /// Get the type of beam used to create this [`BeamGpu`]. fn get_beam_type(&self) -> BeamType; /// Get a pointer to the device tile map. This is necessary to access @@ -172,11 +172,11 @@ pub trait BeamCUDA { /// de-duplicated beam Jones matrices on the device. fn get_freq_map(&self) -> *const i32; - /// Get the number of de-duplicated tiles associated with this [`BeamCUDA`]. + /// Get the number of de-duplicated tiles associated with this [`BeamGpu`]. fn get_num_unique_tiles(&self) -> i32; /// Get the number of de-duplicated frequencies associated with this - /// [`BeamCUDA`]. + /// [`BeamGpu`]. fn get_num_unique_freqs(&self) -> i32; } @@ -323,9 +323,9 @@ impl Beam for NoBeam { fn empty_coeff_cache(&self) {} - #[cfg(feature = "cuda")] - fn prepare_cuda_beam(&self, freqs_hz: &[u32]) -> Result, BeamError> { - let obj = NoBeamCUDA { + #[cfg(any(feature = "cuda", feature = "hip"))] + fn prepare_gpu_beam(&self, freqs_hz: &[u32]) -> Result, BeamError> { + let obj = NoBeamGpu { tile_map: DevicePointer::copy_to_device(&vec![0; self.num_tiles])?, freq_map: DevicePointer::copy_to_device(&vec![0; freqs_hz.len()])?, }; @@ -335,27 +335,37 @@ impl Beam for NoBeam { /// A beam implementation that returns only identity Jones matrices for all beam /// calculations. -#[cfg(feature = "cuda")] -pub(crate) struct NoBeamCUDA { +#[cfg(any(feature = "cuda", feature = "hip"))] +pub(crate) struct NoBeamGpu { tile_map: DevicePointer, freq_map: DevicePointer, } -#[cfg(feature = "cuda")] -impl BeamCUDA for NoBeamCUDA { +#[cfg(any(feature = "cuda", feature = "hip"))] +impl BeamGpu for NoBeamGpu { unsafe fn calc_jones_pair( &self, - az_rad: &[CudaFloat], - _za_rad: &[CudaFloat], + az_rad: &[GpuFloat], + _za_rad: &[GpuFloat], _latitude_rad: f64, d_jones: *mut std::ffi::c_void, ) -> Result<(), BeamError> { - let identities: Vec> = vec![Jones::identity(); az_rad.len()]; - cuda_runtime_sys::cudaMemcpy( + #[cfg(feature = "cuda")] + use cuda_runtime_sys::{ + cudaMemcpy as gpuMemcpy, + cudaMemcpyKind::cudaMemcpyHostToDevice as gpuMemcpyHostToDevice, + }; + #[cfg(feature = "hip")] + use hip_sys::hiprt::{ + hipMemcpy as gpuMemcpy, hipMemcpyKind::hipMemcpyHostToDevice as gpuMemcpyHostToDevice, + }; + + let identities: Vec> = vec![Jones::identity(); az_rad.len()]; + gpuMemcpy( d_jones, identities.as_ptr().cast(), - identities.len() * std::mem::size_of::>(), - cuda_runtime_sys::cudaMemcpyKind::cudaMemcpyHostToDevice, + identities.len() * std::mem::size_of::>(), + gpuMemcpyHostToDevice, ); Ok(()) } diff --git a/src/beam/tests.rs b/src/beam/tests.rs index 83e14b68..a4b8a1e2 100644 --- a/src/beam/tests.rs +++ b/src/beam/tests.rs @@ -74,8 +74,8 @@ fn fee_beam_values_are_sensible() { #[test] #[serial] -#[cfg(feature = "cuda")] -fn fee_cuda_beam_values_are_sensible() { +#[cfg(any(feature = "cuda", feature = "hip"))] +fn fee_gpu_beam_values_are_sensible() { let delays = Array2::zeros((1, 16)); let amps = Array2::ones((1, 16)); let freqs = [150e6 as u32]; @@ -86,7 +86,7 @@ fn fee_cuda_beam_values_are_sensible() { ]; let (azs, zas): (Vec<_>, Vec<_>) = azels .iter() - .map(|azel| (azel.az as CudaFloat, azel.za() as CudaFloat)) + .map(|azel| (azel.az as GpuFloat, azel.za() as GpuFloat)) .unzip(); // Get the beam values right out of hyperbeam. @@ -100,13 +100,13 @@ fn fee_cuda_beam_values_are_sensible() { // Compare these with the hyperdrive `Beam` trait. let hyperdrive = super::fee::FEEBeam::new_from_env(1, Delays::Full(delays), Some(amps)).unwrap(); - let hyperdrive = hyperdrive.prepare_cuda_beam(&freqs).unwrap(); + let hyperdrive = hyperdrive.prepare_gpu_beam(&freqs).unwrap(); let hyperdrive_values_device = unsafe { - let mut hyperdrive_values_device: DevicePointer> = DevicePointer::malloc( + let mut hyperdrive_values_device: DevicePointer> = DevicePointer::malloc( hyperdrive.get_num_unique_tiles() as usize * hyperdrive.get_num_unique_freqs() as usize * azs.len() - * std::mem::size_of::>(), + * std::mem::size_of::>(), ) .unwrap(); hyperdrive diff --git a/src/cli/beam.rs b/src/cli/beam.rs index 087d4076..db12b288 100644 --- a/src/cli/beam.rs +++ b/src/cli/beam.rs @@ -60,18 +60,18 @@ pub struct BeamArgs { #[clap(short, long, default_value = "beam_responses.tsv")] output: PathBuf, - /// Use CUDA to generate the beam responses. - #[cfg(feature = "cuda")] + /// Use a GPU (i.e. CUDA or HIP) to generate the beam responses. + #[cfg(any(feature = "cuda", feature = "hip"))] #[clap(short, long)] - cuda: bool, + gpu: bool, } impl BeamArgs { pub(super) fn run(&self) -> Result<(), HyperdriveError> { cfg_if::cfg_if! { - if #[cfg(feature = "cuda")] { - if self.cuda { - calc_cuda(self) + if #[cfg(any(feature = "cuda", feature = "hip"))] { + if self.gpu { + calc_gpu(self) } else { calc_cpu(self) } @@ -109,8 +109,8 @@ fn calc_cpu(args: &BeamArgs) -> Result<(), HyperdriveError> { max_za, step, output, - #[cfg(feature = "cuda")] - cuda: _, + #[cfg(any(feature = "cuda", feature = "hip"))] + gpu: _, } = args; let beam = create_beam_object( @@ -137,12 +137,12 @@ fn calc_cpu(args: &BeamArgs) -> Result<(), HyperdriveError> { Ok(()) } -#[cfg(feature = "cuda")] -fn calc_cuda(args: &BeamArgs) -> Result<(), HyperdriveError> { +#[cfg(any(feature = "cuda", feature = "hip"))] +fn calc_gpu(args: &BeamArgs) -> Result<(), HyperdriveError> { use itertools::izip; use num_complex::Complex; - use crate::cuda::{CudaFloat, CudaJones, DevicePointer}; + use crate::gpu::{DevicePointer, GpuFloat, GpuJones}; let BeamArgs { beam_type, @@ -152,7 +152,7 @@ fn calc_cuda(args: &BeamArgs) -> Result<(), HyperdriveError> { max_za, step, output, - cuda: _, + gpu: _, } = args; let beam = create_beam_object( @@ -160,20 +160,20 @@ fn calc_cuda(args: &BeamArgs) -> Result<(), HyperdriveError> { 1, Delays::Partial(delays.clone().unwrap_or(vec![0; 16])), )?; - let cuda_beam = beam.prepare_cuda_beam(&[(freq_mhz * 1e6) as u32])?; + let gpu_beam = beam.prepare_gpu_beam(&[(freq_mhz * 1e6) as u32])?; let mut out = BufWriter::new(File::create(output)?); let (azs, zas): (Vec<_>, Vec<_>) = - gen_azzas::(max_za.to_radians(), step.to_radians()).unzip(); - let mut d_jones: DevicePointer = DevicePointer::malloc( - cuda_beam.get_num_unique_tiles() as usize - * cuda_beam.get_num_unique_freqs() as usize + gen_azzas::(max_za.to_radians(), step.to_radians()).unzip(); + let mut d_jones: DevicePointer = DevicePointer::malloc( + gpu_beam.get_num_unique_tiles() as usize + * gpu_beam.get_num_unique_freqs() as usize * azs.len() - * std::mem::size_of::(), + * std::mem::size_of::(), )?; unsafe { - cuda_beam.calc_jones_pair( + gpu_beam.calc_jones_pair( &azs, &zas, latitude_deg.to_radians(), diff --git a/src/cli/common/mod.rs b/src/cli/common/mod.rs index 617511dc..63946eb3 100644 --- a/src/cli/common/mod.rs +++ b/src/cli/common/mod.rs @@ -459,7 +459,7 @@ pub(super) struct ModellingArgs { /// Use the CPU for visibility generation. This is deliberately made /// non-default because using a GPU is much faster. - #[cfg(feature = "cuda")] + #[cfg(any(feature = "cuda", feature = "hip"))] #[clap(long, help_heading = "MODELLING")] #[serde(default)] pub(super) cpu: bool, @@ -469,7 +469,7 @@ impl ModellingArgs { pub(super) fn merge(self, other: Self) -> Self { Self { no_precession: self.no_precession || other.no_precession, - #[cfg(feature = "cuda")] + #[cfg(any(feature = "cuda", feature = "hip"))] cpu: self.cpu || other.cpu, } } @@ -477,11 +477,11 @@ impl ModellingArgs { pub(super) fn parse(self) -> ModellingParams { let ModellingArgs { no_precession, - #[cfg(feature = "cuda")] + #[cfg(any(feature = "cuda", feature = "hip"))] cpu, } = self; - #[cfg(feature = "cuda")] + #[cfg(any(feature = "cuda", feature = "hip"))] if cpu { MODEL_DEVICE.store(ModelDevice::Cpu); } @@ -495,36 +495,43 @@ impl ModellingArgs { block.push(crate::model::get_cpu_info().into()); } - #[cfg(feature = "cuda")] - ModelDevice::Cuda => { + #[cfg(any(feature = "cuda", feature = "hip"))] + ModelDevice::Gpu => { block.push(format!("Using GPU with {} precision", d.get_precision()).into()); - let (device_info, driver_info) = match crate::cuda::get_device_info() { + let (device_info, driver_info) = match crate::gpu::get_device_info() { Ok(i) => i, Err(e) => { // For some reason, despite hyperdrive being compiled - // with the "cuda" feature, we failed to get the device - // info. Maybe there's no CUDA-capable device present. - // Either way, we cannot continue. I'd rather not have - // error handling here because (1) without the "cuda" + // with the "cuda" or "hip" feature, we failed to get + // the device info. Maybe there's no GPU present. Either + // way, we cannot continue. I'd rather not have error + // handling here because (1) without the "cuda" or "hip" // feature, this function will never fail on the CPU // path, so adding error handling means the caller would // have to handle a `Result` uselessly and (2) if this // "petty" display function fails, then we can't use the // GPU for real work anyway. + #[cfg(feature = "cuda")] eprintln!("Couldn't retrieve CUDA device info for device 0, is a device present? {e}"); + #[cfg(feature = "hip")] + eprintln!("Couldn't retrieve HIP device info for device 0, is a device present? {e}"); std::process::exit(1); } }; + #[cfg(feature = "cuda")] + let device_type = "CUDA"; + #[cfg(feature = "hip")] + let device_type = "HIP"; block.push( format!( - "CUDA device: {} (capability {}, {} MiB)", + "{device_type} device: {} (capability {}, {} MiB)", device_info.name, device_info.capability, device_info.total_global_mem ) .into(), ); block.push( format!( - "CUDA driver: {}, runtime: {}", + "{device_type} driver: {}, runtime: {}", driver_info.driver_version, driver_info.runtime_version ) .into(), diff --git a/src/cli/di_calibrate/error.rs b/src/cli/di_calibrate/error.rs deleted file mode 100644 index 726d3924..00000000 --- a/src/cli/di_calibrate/error.rs +++ /dev/null @@ -1,177 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at http://mozilla.org/MPL/2.0/. - -//! Errors associated with calibration arguments. - -use std::path::PathBuf; - -use thiserror::Error; -use vec1::Vec1; - -use super::ARG_FILE_TYPES_COMMA_SEPARATED; -use crate::filenames::SUPPORTED_INPUT_FILE_COMBINATIONS; - -/// Errors associated with DI calibration arguments. -#[derive(Error, Debug)] -pub(crate) enum DiCalArgsError { - #[error("Argument file '{0}' doesn't have a recognised file extension! Valid extensions are: {}", *ARG_FILE_TYPES_COMMA_SEPARATED)] - UnrecognisedArgFileExt(String), - - #[error("Couldn't decode toml structure from {file}:\n{err}")] - TomlDecode { file: String, err: String }, - - #[error("Couldn't decode json structure from {file}:\n{err}")] - JsonDecode { file: String, err: String }, - - #[error("No input data was given!")] - NoInputData, - - #[error("{0}\n\nSupported combinations of file formats:\n{SUPPORTED_INPUT_FILE_COMBINATIONS}")] - InvalidDataInput(&'static str), - - #[error("Multiple metafits files were specified: {0:?}\nThis is unsupported.")] - MultipleMetafits(Vec1), - - #[error("Multiple measurement sets were specified: {0:?}\nThis is currently unsupported.")] - MultipleMeasurementSets(Vec1), - - #[error("Multiple uvfits files were specified: {0:?}\nThis is currently unsupported.")] - MultipleUvfits(Vec1), - - #[error("No calibration output was specified. There must be at least one calibration solution file.")] - NoOutput, - - #[error("No sky-model source list file supplied")] - NoSourceList, - - #[error("Tried to create a beam object, but MWA dipole delay information isn't available!")] - NoDelays, - - #[error( - "The specified MWA dipole delays aren't valid; there should be 16 values between 0 and 32" - )] - BadDelays, - - #[error("The data either contains no tiles or all tiles are flagged")] - NoTiles, - - #[error("The data either contains no frequency channels or all channels are flagged")] - NoChannels, - - #[error( - "All baselines were flagged due to UVW cutoffs. Try adjusting the UVW min and/or max." - )] - AllBaselinesFlaggedFromUvwCutoffs, - - #[error("The data either contains no timesteps or no timesteps are being used")] - NoTimesteps, - - #[error("The number of specified sources was 0, or the size of the source list was 0")] - NoSources, - - #[error("After vetoing sources, none were left. Decrease the veto threshold, or supply more sources")] - NoSourcesAfterVeto, - - #[error("Duplicate timesteps were specified; this is invalid")] - DuplicateTimesteps, - - #[error("Timestep {got} was specified but it isn't available; the last timestep is {last}")] - UnavailableTimestep { got: usize, last: usize }, - - #[error( - "Cannot write visibilities to a file type '{ext}'. Supported formats are: {}", *crate::io::write::VIS_OUTPUT_EXTENSIONS - )] - VisFileType { ext: String }, - - #[error(transparent)] - TileFlag(#[from] crate::context::InvalidTileFlag), - - #[error("Cannot write calibration solutions to a file type '{ext}'.\nSupported formats are: {}", *crate::solutions::CAL_SOLUTION_EXTENSIONS)] - CalibrationOutputFile { ext: String }, - - #[error(transparent)] - ParsePfbFlavour(#[from] crate::io::read::pfb_gains::PfbParseError), - - #[error("Error when parsing time average factor: {0}")] - ParseCalTimeAverageFactor(crate::unit_parsing::UnitParseError), - - #[error("Error when parsing freq. average factor: {0}")] - ParseCalFreqAverageFactor(crate::unit_parsing::UnitParseError), - - #[error("Calibration time average factor isn't an integer")] - CalTimeFactorNotInteger, - - #[error("Calibration freq. average factor isn't an integer")] - CalFreqFactorNotInteger, - - #[error("Calibration time resolution isn't a multiple of input data's: {out} seconds vs {inp} seconds")] - CalTimeResNotMultiple { out: f64, inp: f64 }, - - #[error("Calibration freq. resolution isn't a multiple of input data's: {out} Hz vs {inp} Hz")] - CalFreqResNotMultiple { out: f64, inp: f64 }, - - #[error("Calibration time average factor cannot be 0")] - CalTimeFactorZero, - - #[error("Calibration freq. average factor cannot be 0")] - CalFreqFactorZero, - - #[error("Error when parsing output vis. time average factor: {0}")] - ParseOutputVisTimeAverageFactor(crate::unit_parsing::UnitParseError), - - #[error("Error when parsing output vis. freq. average factor: {0}")] - ParseOutputVisFreqAverageFactor(crate::unit_parsing::UnitParseError), - - #[error("Output vis. time average factor isn't an integer")] - OutputVisTimeFactorNotInteger, - - #[error("Output vis. freq. average factor isn't an integer")] - OutputVisFreqFactorNotInteger, - - #[error("Output vis. time average factor cannot be 0")] - OutputVisTimeAverageFactorZero, - - #[error("Output vis. freq. average factor cannot be 0")] - OutputVisFreqAverageFactorZero, - - #[error("Output vis. time resolution isn't a multiple of input data's: {out} seconds vs {inp} seconds")] - OutputVisTimeResNotMultiple { out: f64, inp: f64 }, - - #[error("Output vis. freq. resolution isn't a multiple of input data's: {out} Hz vs {inp} Hz")] - OutputVisFreqResNotMultiple { out: f64, inp: f64 }, - - #[error("Error when parsing minimum UVW cutoff: {0}")] - ParseUvwMin(crate::unit_parsing::UnitParseError), - - #[error("Error when parsing maximum UVW cutoff: {0}")] - ParseUvwMax(crate::unit_parsing::UnitParseError), - - #[error("Array position specified as {pos:?}, not [, , ]")] - BadArrayPosition { pos: Vec }, - - #[error(transparent)] - Glob(#[from] crate::io::GlobError), - - #[error(transparent)] - VisRead(#[from] crate::io::read::VisReadError), - - #[error(transparent)] - FileWrite(#[from] crate::io::write::FileWriteError), - - #[error(transparent)] - Veto(#[from] crate::srclist::VetoError), - - #[error("Error when trying to read source list: {0}")] - SourceList(#[from] crate::srclist::ReadSourceListError), - - #[error(transparent)] - Beam(#[from] crate::beam::BeamError), - - #[error(transparent)] - IO(#[from] std::io::Error), - - #[cfg(feature = "cuda")] - #[error(transparent)] - Cuda(#[from] crate::cuda::CudaError), -} diff --git a/src/cli/di_calibrate/tests.rs b/src/cli/di_calibrate/tests.rs index e8f1a869..c56eef5a 100644 --- a/src/cli/di_calibrate/tests.rs +++ b/src/cli/di_calibrate/tests.rs @@ -710,17 +710,17 @@ fn test_1090008640_calibrate_model_ms() { // introduced. If a metafits' positions are used instead, the results // are *exactly* the same, but we should trust the MS's positions, so // these errors must remain. - #[cfg(feature = "cuda-single")] + #[cfg(feature = "gpu-single")] assert_abs_diff_eq!(vis_m, vis_c, epsilon = 2e-4); - #[cfg(not(feature = "cuda-single"))] + #[cfg(not(feature = "gpu-single"))] assert_abs_diff_eq!(vis_m, vis_c, epsilon = 4e-6); assert_abs_diff_eq!(weight_m, weight_c); } // Inspect the solutions; they should all be close to identity. - #[cfg(feature = "cuda-single")] + #[cfg(feature = "gpu-single")] let epsilon = 6e-8; - #[cfg(not(feature = "cuda-single"))] + #[cfg(not(feature = "gpu-single"))] let epsilon = 2e-9; assert_abs_diff_eq!( sols.di_jones, @@ -772,9 +772,9 @@ fn test_cal_timeblocks() { // We didn't specify anything with calibration timeblocks, so this should be // 1 (all input data timesteps are used at once in calibration). assert_eq!(num_cal_timeblocks, 1); - #[cfg(not(feature = "cuda-single"))] + #[cfg(not(feature = "gpu-single"))] let eps = 0.0; // I am amazed - #[cfg(feature = "cuda-single")] + #[cfg(feature = "gpu-single")] let eps = 2e-8; assert_abs_diff_eq!( sols.di_jones, @@ -796,9 +796,9 @@ fn test_cal_timeblocks() { let num_cal_timeblocks = sols.di_jones.len_of(Axis(0)); // 3 / 2 = 1.5 = 2 rounded up assert_eq!(num_cal_timeblocks, 2); - #[cfg(not(feature = "cuda-single"))] + #[cfg(not(feature = "gpu-single"))] let eps = 0.0; - #[cfg(feature = "cuda-single")] + #[cfg(feature = "gpu-single")] let eps = 4e-8; assert_abs_diff_eq!( sols.di_jones, diff --git a/src/cli/error.rs b/src/cli/error.rs index 31048d68..fbdf3748 100644 --- a/src/cli/error.rs +++ b/src/cli/error.rs @@ -256,8 +256,8 @@ impl From for HyperdriveError { VisSubtractError::VisWrite(e) => Self::from(e), VisSubtractError::Model(e) => Self::from(e), VisSubtractError::IO(e) => Self::from(e), - #[cfg(feature = "cuda")] - VisSubtractError::Cuda(e) => Self::from(e), + #[cfg(any(feature = "cuda", feature = "hip"))] + VisSubtractError::Gpu(e) => Self::from(e), } } } @@ -429,8 +429,8 @@ impl From for HyperdriveError { | BeamError::BadTileIndex { .. } | BeamError::Hyperbeam(_) | BeamError::HyperbeamInit(_) => Self::Beam(s), - #[cfg(feature = "cuda")] - BeamError::Cuda(_) => Self::Beam(s), + #[cfg(any(feature = "cuda", feature = "hip"))] + BeamError::Gpu(_) => Self::Beam(s), } } } @@ -440,8 +440,8 @@ impl From for HyperdriveError { match e { ModelError::Beam(e) => Self::from(e), - #[cfg(feature = "cuda")] - ModelError::Cuda(e) => Self::from(e), + #[cfg(any(feature = "cuda", feature = "hip"))] + ModelError::Gpu(e) => Self::from(e), } } } @@ -464,9 +464,9 @@ impl From for HyperdriveError { } } -#[cfg(feature = "cuda")] -impl From for HyperdriveError { - fn from(e: crate::cuda::CudaError) -> Self { +#[cfg(any(feature = "cuda", feature = "hip"))] +impl From for HyperdriveError { + fn from(e: crate::gpu::GpuError) -> Self { Self::Generic(e.to_string()) } } diff --git a/src/cli/vis_simulate/tests.rs b/src/cli/vis_simulate/tests.rs index 760094c8..4d83e5c9 100644 --- a/src/cli/vis_simulate/tests.rs +++ b/src/cli/vis_simulate/tests.rs @@ -203,9 +203,9 @@ fn test_1090008640_vis_simulate() { assert_abs_diff_eq!(group_params[4] as f64 + jd_zero, 2456860.3406018466); // The values of the visibilities changes slightly depending on the precision. - #[cfg(feature = "cuda-single")] + #[cfg(feature = "gpu-single")] let epsilon = 2e-4; - #[cfg(not(feature = "cuda-single"))] + #[cfg(not(feature = "gpu-single"))] let epsilon = 0.0; assert_abs_diff_eq!( vis[0..29], @@ -288,9 +288,9 @@ fn test_1090008640_vis_simulate() { ); assert_abs_diff_eq!(group_params[4] as f64 + jd_zero, 2456860.3406944424); - #[cfg(feature = "cuda-single")] + #[cfg(feature = "gpu-single")] let epsilon = 3e-4; - #[cfg(not(feature = "cuda-single"))] + #[cfg(not(feature = "gpu-single"))] let epsilon = 0.0; assert_abs_diff_eq!( vis[0..29], @@ -338,7 +338,7 @@ fn test_1090008640_vis_simulate() { // exactly the same. #[test] #[serial] -#[cfg(all(feature = "cuda", not(feature = "cuda-single")))] +#[cfg(all(feature = "cuda", not(feature = "gpu-single")))] fn test_1090008640_vis_simulate_cpu_gpu_match() { use ndarray::prelude::*; diff --git a/src/cuda/utils.cu b/src/cuda/utils.cu deleted file mode 100644 index 84ed7f1e..00000000 --- a/src/cuda/utils.cu +++ /dev/null @@ -1,40 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at http://mozilla.org/MPL/2.0/. - -// "Homegrown" CUDA utilities. -// -// As this code contains code derived from an official NVIDIA example -// (https://github.com/NVIDIA/cuda-samples/blob/master/Samples/1_Utilities/deviceQuery/deviceQuery.cpp), -// legally, a copyright, list of conditions and disclaimer must be distributed -// with this code. This should be found in the "cuda" directory of the -// mwa_hyperdrive git repo, file LICENSE-NVIDIA. - -#include - -extern "C" const char *get_cuda_device_info(int device, char name[256], int *device_major, int *device_minor, - size_t *total_global_mem, int *driver_version, int *runtime_version) { - cudaError_t error_id = cudaSetDevice(device); - if (error_id != cudaSuccess) - return cudaGetErrorString(error_id); - - cudaDeviceProp device_prop; - error_id = cudaGetDeviceProperties(&device_prop, device); - if (error_id != cudaSuccess) - return cudaGetErrorString(error_id); - - memcpy(name, device_prop.name, 256); - *device_major = device_prop.major; - *device_minor = device_prop.minor; - *total_global_mem = device_prop.totalGlobalMem; - - error_id = cudaDriverGetVersion(driver_version); - if (error_id != cudaSuccess) - return cudaGetErrorString(error_id); - - error_id = cudaRuntimeGetVersion(runtime_version); - if (error_id != cudaSuccess) - return cudaGetErrorString(error_id); - - return NULL; -} diff --git a/src/cuda/common.cuh b/src/gpu/common.cuh similarity index 63% rename from src/cuda/common.cuh rename to src/gpu/common.cuh index bc39ec1f..90dc777e 100644 --- a/src/cuda/common.cuh +++ b/src/gpu/common.cuh @@ -7,32 +7,92 @@ #include #include -#include - #include "types.h" +// HIP-specific defines. +#if __HIPCC__ +#define gpuMalloc hipMalloc +#define gpuFree hipFree +#define gpuMemcpy hipMemcpy +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuGetErrorString hipGetErrorString +#define gpuGetLastError hipGetLastError +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuError_t hipError_t +#define gpuSuccess hipSuccess + // If SINGLE is enabled, use single-precision floats everywhere. Otherwise // default to double-precision. #ifdef SINGLE -#define FLOAT4 float4 -#define SINCOS sincosf -#define EXP expf -#define POW powf -#define FLOOR floorf -#define COMPLEX cuFloatComplex -#define CUCONJ cuConjf -#define LOG logf -#define EXP expf +#define FLOAT4 float4 +#define SINCOS sincosf +#define EXP expf +#define POW powf +#define FLOOR floorf +#define COMPLEX hipFloatComplex +#define MAKE_COMPLEX make_hipFloatComplex +#define CUCONJ hipConjf +#define LOG logf +#define EXP expf #else -#define FLOAT4 double4 -#define SINCOS sincos -#define EXP exp -#define POW pow -#define FLOOR floor -#define COMPLEX cuDoubleComplex -#define CUCONJ cuConj -#define LOG log -#define EXP exp +#define FLOAT4 double4 +#define SINCOS sincos +#define EXP exp +#define POW pow +#define FLOOR floor +#define COMPLEX hipDoubleComplex +#define MAKE_COMPLEX make_hipDoubleComplex +#define CUCONJ hipConj +#define LOG log +#define EXP exp +#endif // SINGLE + +// CUDA-specific defines. +#elif __CUDACC__ + +#define gpuMalloc cudaMalloc +#define gpuFree cudaFree +#define gpuMemcpy cudaMemcpy +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuGetErrorString cudaGetErrorString +#define gpuGetLastError cudaGetLastError +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuError_t cudaError_t +#define gpuSuccess cudaSuccess +#define warpSize 32 + +#ifdef SINGLE +#define FLOAT4 float4 +#define SINCOS sincosf +#define EXP expf +#define POW powf +#define FLOOR floorf +#define COMPLEX cuFloatComplex +#define MAKE_COMPLEX make_cuFloatComplex +#define CUCONJ cuConjf +#define LOG logf +#define EXP expf +#else +#define FLOAT4 double4 +#define SINCOS sincos +#define EXP exp +#define POW pow +#define FLOOR floor +#define COMPLEX cuDoubleComplex +#define MAKE_COMPLEX make_cuDoubleComplex +#define CUCONJ cuConj +#define LOG log +#define EXP exp +#endif // SINGLE +#endif // __HIPCC__ +// #define C32 cuFloatComplex +// #define C64 cuDoubleComplex + +#ifdef __CUDACC__ +#include +#elif __HIPCC__ +#include +#include #endif const FLOAT VEL_C = 299792458.0; // speed of light in a vacuum @@ -54,33 +114,17 @@ typedef struct JONES_C { COMPLEX j11; } JONES_C; -inline __device__ COMPLEX operator+(const COMPLEX a, const COMPLEX b) { - return COMPLEX{ - .x = a.x + b.x, - .y = a.y + b.y, - }; -} +inline __device__ COMPLEX operator+(const COMPLEX a, const COMPLEX b) { return MAKE_COMPLEX(a.x + b.x, a.y + b.y); } inline __device__ COMPLEX operator*(const COMPLEX a, const COMPLEX b) { - return COMPLEX{ - .x = a.x * b.x - a.y * b.y, - .y = a.x * b.y + a.y * b.x, - }; + return MAKE_COMPLEX(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x); } inline __device__ void operator*=(COMPLEX &a, const COMPLEX b) { - a = COMPLEX{ - .x = a.x * b.x - a.y * b.y, - .y = a.x * b.y + a.y * b.x, - }; + a = MAKE_COMPLEX(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x); } -inline __device__ COMPLEX operator*(const COMPLEX a, const FLOAT b) { - return COMPLEX{ - .x = a.x * b, - .y = a.y * b, - }; -} +inline __device__ COMPLEX operator*(const COMPLEX a, const FLOAT b) { return MAKE_COMPLEX(a.x * b, a.y * b); } inline __device__ void operator+=(COMPLEX &a, const COMPLEX b) { a.x += b.x; diff --git a/src/cuda/compile_flags.txt b/src/gpu/compile_flags.txt similarity index 93% rename from src/cuda/compile_flags.txt rename to src/gpu/compile_flags.txt index 93689f8a..d5d1b011 100644 --- a/src/cuda/compile_flags.txt +++ b/src/gpu/compile_flags.txt @@ -1,5 +1,6 @@ -I/usr/local/cuda/include -I/opt/cuda/include +-I/opt/rocm/include -Wall -Wextra -Wpedantic diff --git a/src/cuda/mod.rs b/src/gpu/mod.rs similarity index 57% rename from src/cuda/mod.rs rename to src/gpu/mod.rs index 58ee9b6b..c61ef90e 100644 --- a/src/cuda/mod.rs +++ b/src/gpu/mod.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -//! CUDA code to be used by hyperdrive. +//! GPU code to be used by hyperdrive. #![allow(non_snake_case)] #![allow(clippy::upper_case_acronyms)] @@ -21,45 +21,67 @@ use thiserror::Error; pub(crate) use utils::get_device_info; -// Import Rust bindings to the CUDA code specific to the precision we're using, -// and set corresponding compile-time types. +// Import Rust bindings to the CUDA/HIP code specific to the precision we're +// using, and set corresponding compile-time types. cfg_if::cfg_if! { - if #[cfg(feature = "cuda-single")] { - /// f32 (using the "cuda-single" feature) - pub(crate) type CudaFloat = f32; - pub(crate) type CudaJones = JonesF32; + if #[cfg(feature = "gpu-single")] { + /// f32 (using the "gpu-single" feature) + pub(crate) type GpuFloat = f32; + pub(crate) type GpuJones = JonesF32; include!("types_single.rs"); include!("model_single.rs"); - } else if #[cfg(all(feature = "cuda", not(feature = "cuda-single")))] { - /// f64 (using the "cuda" feature and not "cuda-single") - pub(crate) type CudaFloat = f64; - pub(crate) type CudaJones = JonesF64; + } else if #[cfg(all(any(feature = "cuda", feature = "hip"), not(feature = "gpu-single")))] { + /// f64 (not using "gpu-single") + pub(crate) type GpuFloat = f64; + pub(crate) type GpuJones = JonesF64; include!("types_double.rs"); include!("model_double.rs"); } } -// Ensure that the shapelet constants are the same in the Rust code and CUDA +// Import CUDA/HIP functions into the same names. +#[cfg(feature = "cuda")] +use cuda_runtime_sys::{ + cudaDeviceSynchronize as gpuDeviceSynchronize, cudaError::cudaSuccess as gpuSuccess, + cudaFree as gpuFree, cudaGetErrorString as gpuGetErrorString, + cudaGetLastError as gpuGetLastError, cudaMalloc as gpuMalloc, cudaMemcpy as gpuMemcpy, + cudaMemcpyKind::cudaMemcpyDeviceToHost as gpuMemcpyDeviceToHost, + cudaMemcpyKind::cudaMemcpyHostToDevice as gpuMemcpyHostToDevice, +}; +#[cfg(feature = "hip")] +use hip_sys::hiprt::{ + hipDeviceSynchronize as gpuDeviceSynchronize, hipError_t::hipSuccess as gpuSuccess, + hipFree as gpuFree, hipGetErrorString as gpuGetErrorString, hipGetLastError as gpuGetLastError, + hipMalloc as gpuMalloc, hipMemcpy as gpuMemcpy, + hipMemcpyKind::hipMemcpyDeviceToHost as gpuMemcpyDeviceToHost, + hipMemcpyKind::hipMemcpyHostToDevice as gpuMemcpyHostToDevice, +}; + +// Ensure that the shapelet constants are the same in the Rust code and GPU // code. static_assertions::const_assert_eq!(crate::model::shapelets::SBF_L as i32, SBF_L); static_assertions::const_assert_eq!(crate::model::shapelets::SBF_N as i32, SBF_N); -static_assertions::const_assert_eq!(crate::model::shapelets::SBF_C as CudaFloat, SBF_C); -static_assertions::const_assert_eq!(crate::model::shapelets::SBF_DX as CudaFloat, SBF_DX); +static_assertions::const_assert_eq!(crate::model::shapelets::SBF_C as GpuFloat, SBF_C); +static_assertions::const_assert_eq!(crate::model::shapelets::SBF_DX as GpuFloat, SBF_DX); -macro_rules! cuda_kernel_call { - ($cuda_fn:path, $($args:expr),* $(,)?) => {{ +macro_rules! gpu_kernel_call { + ($gpu_fn:path, $($args:expr),* $(,)?) => {{ #[allow(unused_unsafe)] unsafe { - let error_message_ptr = $cuda_fn($($args),*); + let error_message_ptr = $gpu_fn($($args),*); if error_message_ptr.is_null() { Ok(()) } else { - // Get the CUDA error message behind the pointer. - let error_message = std::ffi::CStr::from_ptr(error_message_ptr).to_str().unwrap_or(""); - let our_error_message = format!("{}: {error_message}", stringify!($cuda_fn)); - Err(CudaError::Kernel { + // Get the GPU error message behind the pointer. + let error_message = std::ffi::CStr::from_ptr(error_message_ptr).to_str(); + #[cfg(feature = "cuda")] + let error_message = error_message.unwrap_or(""); + #[cfg(feature = "hip")] + let error_message = error_message.unwrap_or(""); + let our_error_message = format!("{}: {error_message}", stringify!($gpu_fn)); + Err(GpuError::Kernel { msg: our_error_message.into(), file: file!(), line: line!(), @@ -68,47 +90,50 @@ macro_rules! cuda_kernel_call { } }}; } -pub(crate) use cuda_kernel_call; +pub(crate) use gpu_kernel_call; #[derive(Clone, Copy)] -pub(crate) enum CudaCall { +pub(crate) enum GpuCall { Malloc, CopyToDevice, CopyFromDevice, } -/// Run [`cuda_runtime_sys::cudaGetLastError`] and -/// [`cuda_runtime_sys::cudaDeviceSynchronize`]. If either of these calls return -/// an error, it is converted to a Rust error and returned from this function. -/// The single argument describes what the just-performed operation was and -/// makes the returned error a helpful one. +/// Run [`gpuGetLastError`] and [`gpuDeviceSynchronize`]. If either of these +/// calls return an error, it is converted to a Rust error and returned from +/// this function. The single argument describes what the just-performed +/// operation was and makes the returned error a helpful one. /// /// # Safety /// -/// This function interfaces directly with the CUDA API. Rust errors attempt to -/// catch problems but there are no guarantees. +/// This function interfaces directly with the CUDA/HIP API. Rust errors attempt +/// to catch problems but there are no guarantees. #[track_caller] -unsafe fn check_for_errors(cuda_call: CudaCall) -> Result<(), CudaError> { +unsafe fn check_for_errors(gpu_call: GpuCall) -> Result<(), GpuError> { // Only do a device sync if we're in debug mode, for performance. let debug_mode = matches!(std::env::var("DEBUG").as_deref(), Ok("true")); if debug_mode { - let code = cuda_runtime_sys::cudaDeviceSynchronize(); - if code != cuda_runtime_sys::cudaError::cudaSuccess { - let c_str = CStr::from_ptr(cuda_runtime_sys::cudaGetErrorString(code)); - let msg = c_str.to_str().unwrap_or(""); + let code = gpuDeviceSynchronize(); + if code != gpuSuccess { + let c_str = CStr::from_ptr(gpuGetErrorString(code)); + let msg = c_str.to_str(); + #[cfg(feature = "cuda")] + let msg = msg.unwrap_or(""); + #[cfg(feature = "hip")] + let msg = msg.unwrap_or(""); let location = Location::caller(); - return Err(match cuda_call { - CudaCall::Malloc => CudaError::Malloc { + return Err(match gpu_call { + GpuCall::Malloc => GpuError::Malloc { msg: msg.into(), file: location.file(), line: location.line(), }, - CudaCall::CopyToDevice => CudaError::CopyToDevice { + GpuCall::CopyToDevice => GpuError::CopyToDevice { msg: msg.into(), file: location.file(), line: location.line(), }, - CudaCall::CopyFromDevice => CudaError::CopyFromDevice { + GpuCall::CopyFromDevice => GpuError::CopyFromDevice { msg: msg.into(), file: location.file(), line: location.line(), @@ -117,23 +142,27 @@ unsafe fn check_for_errors(cuda_call: CudaCall) -> Result<(), CudaError> { } } - let code = cuda_runtime_sys::cudaGetLastError(); - if code != cuda_runtime_sys::cudaError::cudaSuccess { - let c_str = CStr::from_ptr(cuda_runtime_sys::cudaGetErrorString(code)); - let msg = c_str.to_str().unwrap_or(""); + let code = gpuGetLastError(); + if code != gpuSuccess { + let c_str = CStr::from_ptr(gpuGetErrorString(code)); + let msg = c_str.to_str(); + #[cfg(feature = "cuda")] + let msg = msg.unwrap_or(""); + #[cfg(feature = "hip")] + let msg = msg.unwrap_or(""); let location = Location::caller(); - return Err(match cuda_call { - CudaCall::Malloc => CudaError::Malloc { + return Err(match gpu_call { + GpuCall::Malloc => GpuError::Malloc { msg: msg.into(), file: location.file(), line: location.line(), }, - CudaCall::CopyToDevice => CudaError::CopyToDevice { + GpuCall::CopyToDevice => GpuError::CopyToDevice { msg: msg.into(), file: location.file(), line: location.line(), }, - CudaCall::CopyFromDevice => CudaError::CopyFromDevice { + GpuCall::CopyFromDevice => GpuError::CopyFromDevice { msg: msg.into(), file: location.file(), line: location.line(), @@ -145,7 +174,7 @@ unsafe fn check_for_errors(cuda_call: CudaCall) -> Result<(), CudaError> { } /// A Rust-managed pointer to CUDA device memory. When this is dropped, -/// [`cuda_runtime_sys::cudaFree`] is called on the pointer. +/// [`gpuFree`] is called on the pointer. #[derive(Debug)] pub(crate) struct DevicePointer { ptr: *mut T, @@ -158,7 +187,7 @@ impl Drop for DevicePointer { fn drop(&mut self) { if !self.ptr.is_null() { unsafe { - cuda_runtime_sys::cudaFree(self.ptr.cast()); + gpuFree(self.ptr.cast()); } } } @@ -177,14 +206,14 @@ impl DevicePointer { /// Allocate a number of bytes on the device. #[track_caller] - pub(crate) fn malloc(size: usize) -> Result, CudaError> { + pub(crate) fn malloc(size: usize) -> Result, GpuError> { if size == 0 { Ok(Self::default()) } else { let mut d_ptr = std::ptr::null_mut(); unsafe { - cuda_runtime_sys::cudaMalloc(&mut d_ptr, size); - check_for_errors(CudaCall::Malloc)?; + gpuMalloc(&mut d_ptr, size); + check_for_errors(GpuCall::Malloc)?; } Ok(Self { ptr: d_ptr.cast(), @@ -197,12 +226,12 @@ impl DevicePointer { /// is smaller than `self.size`. Note that unlike `libc`'s `remalloc`, if a /// new buffer is created, the original bytes are not preserved. #[track_caller] - pub(crate) fn realloc(&mut self, size: usize) -> Result<(), CudaError> { + pub(crate) fn realloc(&mut self, size: usize) -> Result<(), GpuError> { if size <= self.size { return Ok(()); } - // CUDA doesn't provide a realloc, so just make a new `DevicePointer` + // CUDA/HIP don't provide a realloc, so just make a new `DevicePointer` // and swap it with the old one; the old buffer will be dropped. let mut new = Self::malloc(size)?; std::mem::swap(self, &mut new); @@ -212,17 +241,17 @@ impl DevicePointer { /// Copy a slice of data to the device. Any type is allowed, and the returned /// pointer is to the device memory. #[track_caller] - pub(crate) fn copy_to_device(v: &[T]) -> Result, CudaError> { + pub(crate) fn copy_to_device(v: &[T]) -> Result, GpuError> { let size = std::mem::size_of_val(v); unsafe { let mut d_ptr = Self::malloc(size)?; - cuda_runtime_sys::cudaMemcpy( + gpuMemcpy( d_ptr.get_mut().cast(), v.as_ptr().cast(), size, - cuda_runtime_sys::cudaMemcpyKind::cudaMemcpyHostToDevice, + gpuMemcpyHostToDevice, ); - check_for_errors(CudaCall::CopyToDevice)?; + check_for_errors(GpuCall::CopyToDevice)?; Ok(d_ptr) } } @@ -231,10 +260,10 @@ impl DevicePointer { /// bytes in the `DevicePointer` and `v`. The contents of `v` are /// overwritten. #[track_caller] - pub fn copy_from_device(&self, v: &mut [T]) -> Result<(), CudaError> { + pub fn copy_from_device(&self, v: &mut [T]) -> Result<(), GpuError> { let location = Location::caller(); if self.ptr.is_null() { - return Err(CudaError::CopyFromDevice { + return Err(GpuError::CopyFromDevice { msg: "Attempted to copy data from a null device pointer".into(), file: location.file(), line: location.line(), @@ -243,7 +272,7 @@ impl DevicePointer { let size = std::mem::size_of_val(v); if size != self.size { - return Err(CudaError::CopyFromDevice { + return Err(GpuError::CopyFromDevice { msg: format!( "Device buffer size {} is not equal to provided buffer size {size} (length {})", self.size, @@ -256,13 +285,13 @@ impl DevicePointer { } unsafe { - cuda_runtime_sys::cudaMemcpy( + gpuMemcpy( v.as_mut_ptr().cast(), self.ptr.cast(), size, - cuda_runtime_sys::cudaMemcpyKind::cudaMemcpyDeviceToHost, + gpuMemcpyDeviceToHost, ); - check_for_errors(CudaCall::CopyFromDevice) + check_for_errors(GpuCall::CopyFromDevice) } } @@ -271,7 +300,7 @@ impl DevicePointer { /// what is already allocated against the pointer, then the buffer is freed /// and another is created to fit `v` (i.e. re-alloc). #[track_caller] - pub(crate) fn overwrite(&mut self, v: &[T]) -> Result<(), CudaError> { + pub(crate) fn overwrite(&mut self, v: &[T]) -> Result<(), GpuError> { // Nothing to do if the collection is empty. if v.is_empty() { return Ok(()); @@ -280,22 +309,27 @@ impl DevicePointer { let size = std::mem::size_of_val(v); self.realloc(size)?; unsafe { - cuda_runtime_sys::cudaMemcpy( + gpuMemcpy( self.get_mut() as *mut c_void, v.as_ptr().cast(), size, - cuda_runtime_sys::cudaMemcpyKind::cudaMemcpyHostToDevice, + gpuMemcpyHostToDevice, ); - check_for_errors(CudaCall::CopyToDevice) + check_for_errors(GpuCall::CopyToDevice) } } /// Clear all of the bytes in the buffer by writing zeros. #[cfg(test)] pub(crate) fn clear(&mut self) { + #[cfg(feature = "cuda")] + use cuda_runtime_sys::cudaMemset as gpuMemset; + #[cfg(feature = "hip")] + use hip_sys::hiprt::hipMemset as gpuMemset; + unsafe { if self.size > 0 { - cuda_runtime_sys::cudaMemset(self.get_mut().cast(), 0, self.size); + gpuMemset(self.get_mut().cast(), 0, self.size); } } } @@ -305,10 +339,10 @@ impl DevicePointer { /// Copy a slice of data from the device. There must be an equal number of /// bytes in the `DevicePointer` and `v`. #[track_caller] - pub fn copy_from_device_new(&self) -> Result, CudaError> { + pub fn copy_from_device_new(&self) -> Result, GpuError> { if self.ptr.is_null() { let location = Location::caller(); - return Err(CudaError::CopyFromDevice { + return Err(GpuError::CopyFromDevice { msg: "Attempted to copy data from a null device pointer".into(), file: location.file(), line: location.line(), @@ -319,13 +353,13 @@ impl DevicePointer { v.resize_with(self.size / std::mem::size_of::(), || T::default()); unsafe { - cuda_runtime_sys::cudaMemcpy( + gpuMemcpy( v.as_mut_ptr().cast(), self.ptr.cast(), self.size, - cuda_runtime_sys::cudaMemcpyKind::cudaMemcpyDeviceToHost, + gpuMemcpyDeviceToHost, ); - check_for_errors(CudaCall::CopyFromDevice)?; + check_for_errors(GpuCall::CopyFromDevice)?; } Ok(v) @@ -342,7 +376,8 @@ impl Default for DevicePointer { } #[derive(Error, Debug)] -pub enum CudaError { +pub enum GpuError { + #[cfg(feature = "cuda")] #[error("{file}:{line}: cudaMemcpy to device failed: {msg}")] CopyToDevice { msg: Box, @@ -350,6 +385,15 @@ pub enum CudaError { line: u32, }, + #[cfg(feature = "hip")] + #[error("{file}:{line}: hipMemcpy to device failed: {msg}")] + CopyToDevice { + msg: Box, + file: &'static str, + line: u32, + }, + + #[cfg(feature = "cuda")] #[error("{file}:{line}: cudaMemcpy from device failed: {msg}")] CopyFromDevice { msg: Box, @@ -357,6 +401,15 @@ pub enum CudaError { line: u32, }, + #[cfg(feature = "hip")] + #[error("{file}:{line}: hipMemcpy from device failed: {msg}")] + CopyFromDevice { + msg: Box, + file: &'static str, + line: u32, + }, + + #[cfg(feature = "cuda")] #[error("{file}:{line}: cudaMalloc error: {msg}")] Malloc { msg: Box, @@ -364,6 +417,15 @@ pub enum CudaError { line: u32, }, + #[cfg(feature = "hip")] + #[error("{file}:{line}: hipMalloc error: {msg}")] + Malloc { + msg: Box, + file: &'static str, + line: u32, + }, + + #[cfg(feature = "cuda")] #[error("{file}:{line}: CUDA kernel error: {msg}")] Kernel { msg: Box, @@ -371,6 +433,23 @@ pub enum CudaError { line: u32, }, + #[cfg(feature = "hip")] + #[error("{file}:{line}: HIP kernel error: {msg}")] + Kernel { + msg: Box, + file: &'static str, + line: u32, + }, + + #[cfg(feature = "cuda")] + #[error("{file}:{line}: {msg}")] + Generic { + msg: Box, + file: &'static str, + line: u32, + }, + + #[cfg(feature = "hip")] #[error("{file}:{line}: {msg}")] Generic { msg: Box, @@ -379,7 +458,7 @@ pub enum CudaError { }, } -// Suppress warnings for unused CUDA shapelet consts. +// Suppress warnings for unused GPU shapelet consts. mod unused { #[allow(unused)] fn unused() { diff --git a/src/cuda/model.cu b/src/gpu/model.cu similarity index 95% rename from src/cuda/model.cu rename to src/gpu/model.cu index 322f89ec..c6f39934 100644 --- a/src/cuda/model.cu +++ b/src/gpu/model.cu @@ -4,8 +4,6 @@ #include -#include - #include "common.cuh" #include "model.h" #include "types.h" @@ -64,10 +62,7 @@ inline __device__ COMPLEX get_shapelet_envelope(const GaussianParams g_params, c int x_pos_int = x_pos < 0 ? 0 : (int)FLOOR(x_pos); int y_pos_int = y_pos < 0 ? 0 : (int)FLOOR(y_pos); - COMPLEX envelope = COMPLEX{ - .x = 0.0, - .y = 0.0, - }; + COMPLEX envelope = MAKE_COMPLEX(0.0, 0.0); for (int i_coeff = 0; i_coeff < num_coeffs; i_coeff++) { const ShapeletCoeff coeff = coeffs[i_coeff]; @@ -92,10 +87,7 @@ inline __device__ COMPLEX get_shapelet_envelope(const GaussianParams g_params, c // // The following is my attempt at doing this efficiently. int i_power_index = (int)((coeff.n1 + coeff.n2) % 4); - COMPLEX i_power = COMPLEX{ - .x = I_POWERS_REAL[i_power_index], - .y = I_POWERS_IMAG[i_power_index], - }; + COMPLEX i_power = MAKE_COMPLEX(I_POWERS_REAL[i_power_index], I_POWERS_IMAG[i_power_index]); envelope += i_power * rest; } @@ -395,16 +387,16 @@ extern "C" const char *model_points(const Points *comps, const Addresses *a, con a->d_tile_map, a->d_freq_map, a->num_unique_beam_freqs, a->d_tile_index_to_unflagged_tile_index_map, d_vis_fb); - cudaError_t error_id; + gpuError_t error_id; #ifdef DEBUG - error_id = cudaDeviceSynchronize(); - if (error_id != cudaSuccess) { - return cudaGetErrorString(error_id); + error_id = gpuDeviceSynchronize(); + if (error_id != gpuSuccess) { + return gpuGetErrorString(error_id); } #endif - error_id = cudaGetLastError(); - if (error_id != cudaSuccess) { - return cudaGetErrorString(error_id); + error_id = gpuGetLastError(); + if (error_id != gpuSuccess) { + return gpuGetErrorString(error_id); } return NULL; @@ -421,16 +413,16 @@ extern "C" const char *model_gaussians(const Gaussians *comps, const Addresses * d_beam_jones, a->d_tile_map, a->d_freq_map, a->num_unique_beam_freqs, a->d_tile_index_to_unflagged_tile_index_map, d_vis_fb); - cudaError_t error_id; + gpuError_t error_id; #ifdef DEBUG - error_id = cudaDeviceSynchronize(); - if (error_id != cudaSuccess) { - return cudaGetErrorString(error_id); + error_id = gpuDeviceSynchronize(); + if (error_id != gpuSuccess) { + return gpuGetErrorString(error_id); } #endif - error_id = cudaGetLastError(); - if (error_id != cudaSuccess) { - return cudaGetErrorString(error_id); + error_id = gpuGetLastError(); + if (error_id != gpuSuccess) { + return gpuGetErrorString(error_id); } return NULL; @@ -447,16 +439,16 @@ extern "C" const char *model_shapelets(const Shapelets *comps, const Addresses * a->num_freqs, a->num_baselines, a->d_freqs, d_uvws, *comps, a->d_shapelet_basis_values, d_beam_jones, a->d_tile_map, a->d_freq_map, a->num_unique_beam_freqs, a->d_tile_index_to_unflagged_tile_index_map, d_vis_fb); - cudaError_t error_id; + gpuError_t error_id; #ifdef DEBUG - error_id = cudaDeviceSynchronize(); - if (error_id != cudaSuccess) { - return cudaGetErrorString(error_id); + error_id = gpuDeviceSynchronize(); + if (error_id != gpuSuccess) { + return gpuGetErrorString(error_id); } #endif - error_id = cudaGetLastError(); - if (error_id != cudaSuccess) { - return cudaGetErrorString(error_id); + error_id = gpuGetLastError(); + if (error_id != gpuSuccess) { + return gpuGetErrorString(error_id); } return NULL; diff --git a/src/cuda/model.h b/src/gpu/model.h similarity index 100% rename from src/cuda/model.h rename to src/gpu/model.h diff --git a/src/cuda/model_double.rs b/src/gpu/model_double.rs similarity index 100% rename from src/cuda/model_double.rs rename to src/gpu/model_double.rs diff --git a/src/cuda/model_single.rs b/src/gpu/model_single.rs similarity index 100% rename from src/cuda/model_single.rs rename to src/gpu/model_single.rs diff --git a/src/cuda/tests.rs b/src/gpu/tests.rs similarity index 78% rename from src/cuda/tests.rs rename to src/gpu/tests.rs index 48386ea0..e46569b7 100644 --- a/src/cuda/tests.rs +++ b/src/gpu/tests.rs @@ -25,12 +25,18 @@ fn copy_to_and_from_device_succeeds() { #[test] #[serial] -fn cuda_malloc_huge_fails() { +fn gpu_malloc_huge_fails() { let size = 1024_usize.pow(4); // 1 TB; - let result: Result, CudaError> = DevicePointer::malloc(size); + let result: Result, GpuError> = DevicePointer::malloc(size); assert!(result.is_err()); - let err = result.unwrap_err(); - assert!(err.to_string().ends_with("cudaMalloc error: out of memory")); + let err = result.unwrap_err().to_string(); + #[cfg(feature = "cuda")] + assert!(err.ends_with("cudaMalloc error: out of memory"), "{err}"); + #[cfg(feature = "hip")] + assert!( + err.contains("hipMalloc error"), + "Error string wasn't expected; got: {err}" + ); } #[test] @@ -43,7 +49,10 @@ fn copy_from_non_existent_pointer_fails() { let result = d_ptr.copy_from_device(&mut dest); assert!(result.is_err()); let err = result.unwrap_err().to_string(); + #[cfg(feature = "cuda")] assert!(err.contains("cudaMemcpy from device failed")); + #[cfg(feature = "hip")] + assert!(err.contains("hipMemcpy from device failed")); assert!(err.contains("Attempted to copy data from a null device pointer")); } diff --git a/src/cuda/types.h b/src/gpu/types.h similarity index 100% rename from src/cuda/types.h rename to src/gpu/types.h diff --git a/src/cuda/types_double.rs b/src/gpu/types_double.rs similarity index 100% rename from src/cuda/types_double.rs rename to src/gpu/types_double.rs diff --git a/src/cuda/types_single.rs b/src/gpu/types_single.rs similarity index 100% rename from src/cuda/types_single.rs rename to src/gpu/types_single.rs diff --git a/src/cuda/update_rust_bindings.sh b/src/gpu/update_rust_bindings.sh similarity index 90% rename from src/cuda/update_rust_bindings.sh rename to src/gpu/update_rust_bindings.sh index 0adb84a5..5070e37d 100755 --- a/src/cuda/update_rust_bindings.sh +++ b/src/gpu/update_rust_bindings.sh @@ -1,7 +1,7 @@ #!/bin/bash -# Update the Rust bindings to CUDA code. This script must be run whenever the C -# headers for CUDA code change. +# Update the Rust bindings to GPU code. This script must be run whenever the C +# headers for GPU code change. # This script requires bindgen. This can be provided by a package manager or # installed with "cargo install bindgen". @@ -10,7 +10,7 @@ SCRIPTPATH="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)" bindgen "${SCRIPTPATH}"/utils.h \ - --allowlist-function "get_cuda_device_info" \ + --allowlist-function "get_gpu_device_info" \ > "${SCRIPTPATH}"/utils_bindings.rs for PRECISION in SINGLE DOUBLE; do diff --git a/src/gpu/utils.cu b/src/gpu/utils.cu new file mode 100644 index 00000000..dc77142a --- /dev/null +++ b/src/gpu/utils.cu @@ -0,0 +1,68 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// "Homegrown" GPU utilities. +// +// As this code contains code derived from an official NVIDIA example +// (https://github.com/NVIDIA/cuda-samples/blob/master/Samples/1_Utilities/deviceQuery/deviceQuery.cpp), +// legally, a copyright, list of conditions and disclaimer must be distributed +// with this code. This should be found in the root directory of the +// mwa_hyperdrive git repo, file LICENSE-NVIDIA. + +// HIP-specific defines. +#if __HIPCC__ +#define gpuDeviceProp hipDeviceProp_t +#define gpuError_t hipError_t +#define gpuDriverGetVersion hipDriverGetVersion +#define gpuGetDeviceProperties hipGetDeviceProperties +#define gpuGetErrorString hipGetErrorString +#define gpuRuntimeGetVersion hipRuntimeGetVersion +#define gpuSetDevice hipSetDevice +#define gpuSuccess hipSuccess + +// CUDA-specific defines. +#elif __CUDACC__ +#define gpuDeviceProp cudaDeviceProp +#define gpuError_t cudaError_t +#define gpuDriverGetVersion cudaDriverGetVersion +#define gpuGetDeviceProperties cudaGetDeviceProperties +#define gpuGetErrorString cudaGetErrorString +#define gpuRuntimeGetVersion cudaRuntimeGetVersion +#define gpuSetDevice cudaSetDevice +#define gpuSuccess cudaSuccess +#endif // __HIPCC__ + +#ifdef __CUDACC__ +#include +#elif __HIPCC__ +#include +#include +#endif + +extern "C" const char *get_gpu_device_info(int device, char name[256], int *device_major, int *device_minor, + size_t *total_global_mem, int *driver_version, int *runtime_version) { + gpuError_t error_id = gpuSetDevice(device); + if (error_id != gpuSuccess) + return gpuGetErrorString(error_id); + + gpuDeviceProp device_prop; + error_id = gpuGetDeviceProperties(&device_prop, device); + if (error_id != gpuSuccess) + return gpuGetErrorString(error_id); + + memcpy(name, device_prop.name, 256); + *device_major = device_prop.major; + *device_minor = device_prop.minor; + *total_global_mem = device_prop.totalGlobalMem; + + error_id = gpuDriverGetVersion(driver_version); + if (error_id != gpuSuccess) + return gpuGetErrorString(error_id); + + error_id = gpuRuntimeGetVersion(runtime_version); + if (error_id != gpuSuccess) + return gpuGetErrorString(error_id); + + return NULL; +} diff --git a/src/cuda/utils.h b/src/gpu/utils.h similarity index 77% rename from src/cuda/utils.h rename to src/gpu/utils.h index 3eb1d9df..862b0971 100644 --- a/src/cuda/utils.h +++ b/src/gpu/utils.h @@ -3,7 +3,7 @@ // file, You can obtain one at http://mozilla.org/MPL/2.0/. /** - * Utilities for CUDA devices. + * Utilities for CUDA/HIP devices. */ #pragma once @@ -25,8 +25,8 @@ extern "C" { * code. This should be found in the root of the mwa_hyperdrive git repo, file * LICENSE-NVIDIA. */ -const char *get_cuda_device_info(int device, char name[256], int *device_major, int *device_minor, - size_t *total_global_mem, int *driver_version, int *runtime_version); +const char *get_gpu_device_info(int device, char name[256], int *device_major, int *device_minor, + size_t *total_global_mem, int *driver_version, int *runtime_version); #ifdef __cplusplus } // extern "C" diff --git a/src/cuda/utils.rs b/src/gpu/utils.rs similarity index 52% rename from src/cuda/utils.rs rename to src/gpu/utils.rs index 55b110a1..c4e31eb6 100644 --- a/src/cuda/utils.rs +++ b/src/gpu/utils.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -//! Utilities for CUDA devices. +//! Utilities for CUDA/HIP devices. //! //! We assume that everything is UTF-8. @@ -13,28 +13,27 @@ use std::{ panic::Location, }; -use super::CudaError; +use super::GpuError; #[derive(Debug, Clone)] -pub(crate) struct CudaDriverInfo { - /// Formatted CUDA driver version, e.g. "11.7". +pub(crate) struct GpuDriverInfo { + /// Formatted CUDA/HIP driver version, e.g. "11.7". pub(crate) driver_version: Box, - /// Formatted CUDA runtime version, e.g. "11.7". + /// Formatted CUDA/HIP runtime version, e.g. "11.7". pub(crate) runtime_version: Box, } #[derive(Debug, Clone)] -pub(crate) struct CudaDeviceInfo { +pub(crate) struct GpuDeviceInfo { pub(crate) name: Box, pub(crate) capability: Box, /// \[MebiBytes (MiB)\] pub(crate) total_global_mem: usize, } -/// Get CUDA device and driver information. At present, this function only +/// Get CUDA/HIP device and driver information. At present, this function only /// returns information on "device 0". -#[track_caller] -pub(crate) fn get_device_info() -> Result<(CudaDeviceInfo, CudaDriverInfo), CudaError> { +pub(crate) fn get_device_info() -> Result<(GpuDeviceInfo, GpuDriverInfo), GpuError> { unsafe { // TODO: Always assume we're using device 0 for now. let device = 0; @@ -44,7 +43,7 @@ pub(crate) fn get_device_info() -> Result<(CudaDeviceInfo, CudaDriverInfo), Cuda let mut total_global_mem = 0; let mut driver_version = 0; let mut runtime_version = 0; - let error_message_ptr = get_cuda_device_info( + let error_message_ptr = get_gpu_device_info( device, name, &mut device_major, @@ -54,38 +53,60 @@ pub(crate) fn get_device_info() -> Result<(CudaDeviceInfo, CudaDriverInfo), Cuda &mut runtime_version, ); if !error_message_ptr.is_null() { - // Get the CUDA error message behind the pointer. - let error_message = CStr::from_ptr(error_message_ptr) - .to_str() - .unwrap_or(""); + // Get the CUDA/HIP error message behind the pointer. + let error_message = CStr::from_ptr(error_message_ptr).to_str(); + #[cfg(feature = "cuda")] + let error_message = error_message.unwrap_or(""); + #[cfg(feature = "hip")] + let error_message = error_message.unwrap_or(""); let location = Location::caller(); - return Err(CudaError::Generic { + return Err(GpuError::Generic { msg: error_message.into(), file: location.file(), line: location.line(), }); } - let device_info = CudaDeviceInfo { + let device_info = GpuDeviceInfo { name: CString::from_raw(name) .to_str() - .expect("CUDA device name isn't UTF-8") + .expect("GPU device name isn't UTF-8") .to_string() .into_boxed_str(), capability: format!("{device_major}.{device_minor}").into_boxed_str(), total_global_mem: total_global_mem / 1048576, }; - let driver_version = format!("{}.{}", driver_version / 1000, (driver_version / 10) % 100); - let runtime_version = format!( - "{}.{}", - runtime_version / 1000, - (runtime_version / 10) % 100 - ); + #[cfg(feature = "cuda")] + let (driver_version, runtime_version) = { + let d = format!("{}.{}", driver_version / 1000, (driver_version / 10) % 100); + let r = format!( + "{}.{}", + runtime_version / 1000, + (runtime_version / 10) % 100 + ); + (d, r) + }; + #[cfg(feature = "hip")] + let (driver_version, runtime_version) = { + // This isn't documented, but is the only thing that makes sense to + // me. + let d = format!( + "{}.{}", + driver_version / 10_000_000, + (driver_version / 10_000) % 100 + ); + let r = format!( + "{}.{}", + runtime_version / 10_000_000, + (runtime_version / 10_000) % 100 + ); + (d, r) + }; Ok(( device_info, - CudaDriverInfo { + GpuDriverInfo { driver_version: driver_version.into_boxed_str(), runtime_version: runtime_version.into_boxed_str(), }, diff --git a/src/cuda/utils_bindings.rs b/src/gpu/utils_bindings.rs similarity index 96% rename from src/cuda/utils_bindings.rs rename to src/gpu/utils_bindings.rs index 62f3c693..070f24e3 100644 --- a/src/cuda/utils_bindings.rs +++ b/src/gpu/utils_bindings.rs @@ -2,7 +2,7 @@ extern "C" { #[doc = " A \"watered-down\" version of the CUDA example \"deviceQuery\".\n\n See the full example at:\nhttps://github.com/NVIDIA/cuda-samples/blob/master/Samples/1_Utilities/deviceQuery/deviceQuery.cpp\n\n As this code contains code derived from an official NVIDIA example, legally,\n a copyright, list of conditions and disclaimer must be distributed with this\n code. This should be found in the root of the mwa_hyperdrive git repo, file\n LICENSE-NVIDIA."] - pub fn get_cuda_device_info( + pub fn get_gpu_device_info( device: ::std::os::raw::c_int, name: *mut ::std::os::raw::c_char, device_major: *mut ::std::os::raw::c_int, diff --git a/src/lib.rs b/src/lib.rs index b5b99319..375e34c0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,8 +24,8 @@ mod solutions; pub mod srclist; mod unit_parsing; -#[cfg(feature = "cuda")] -mod cuda; +#[cfg(any(feature = "cuda", feature = "hip"))] +mod gpu; #[cfg(test)] mod tests; @@ -40,8 +40,8 @@ lazy_static::lazy_static! { /// This should only ever be changed from its default by CLI code. static ref MODEL_DEVICE: AtomicCell = { cfg_if::cfg_if! { - if #[cfg(feature = "cuda")] { - AtomicCell::new(ModelDevice::Cuda) + if #[cfg(any(feature = "cuda", feature = "hip"))] { + AtomicCell::new(ModelDevice::Gpu) } else { AtomicCell::new(ModelDevice::Cpu) } diff --git a/src/model/error.rs b/src/model/error.rs index 810ce048..6a861b5f 100644 --- a/src/model/error.rs +++ b/src/model/error.rs @@ -11,7 +11,7 @@ pub enum ModelError { #[error(transparent)] Beam(#[from] crate::beam::BeamError), - #[cfg(feature = "cuda")] + #[cfg(any(feature = "cuda", feature = "hip"))] #[error(transparent)] - Cuda(#[from] crate::cuda::CudaError), + Gpu(#[from] crate::gpu::GpuError), } diff --git a/src/model/cuda.rs b/src/model/gpu.rs similarity index 75% rename from src/model/cuda.rs rename to src/model/gpu.rs index 0676103f..f6da1295 100644 --- a/src/model/cuda.rs +++ b/src/model/gpu.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -//! Code to generate sky-model visibilities with CUDA. +//! Code to generate sky-model visibilities with CUDA/HIP. use std::{borrow::Cow, collections::HashSet}; @@ -17,9 +17,9 @@ use ndarray::prelude::*; use super::{mask_pols, shapelets, ModelError, SkyModeller}; use crate::{ - beam::{Beam, BeamCUDA}, + beam::{Beam, BeamGpu}, context::Polarisations, - cuda::{self, cuda_kernel_call, CudaError, CudaFloat, CudaJones, DevicePointer}, + gpu::{self, gpu_kernel_call, DevicePointer, GpuError, GpuFloat, GpuJones}, srclist::{ get_instrumental_flux_densities, ComponentType, FluxDensityType, ShapeletCoeff, SourceList, }, @@ -28,9 +28,9 @@ use crate::{ /// The first axis of `*_list_fds` is unflagged fine channel frequency, the /// second is the source component. The length of `hadecs`, `lmns`, /// `*_list_fds`'s second axis are the same. -pub struct SkyModellerCuda<'a> { +pub struct SkyModellerGpu<'a> { /// The trait object to use for beam calculations. - cuda_beam: Box, + gpu_beam: Box, /// The phase centre used for all modelling. phase_centre: RADec, @@ -41,7 +41,7 @@ pub struct SkyModellerCuda<'a> { array_latitude: f64, /// The UT1 - UTC offset. If this is 0, effectively UT1 == UTC, which is a /// wrong assumption by up to 0.9s. We assume the this value does not change - /// over the timestamps given to this `SkyModellerCuda`. + /// over the timestamps given to this `SkyModellerGpu`. dut1: Duration, /// Shift baselines, LSTs and array latitudes back to J2000. precess: bool, @@ -55,85 +55,85 @@ pub struct SkyModellerCuda<'a> { pols: Polarisations, /// A simple map from an absolute tile index into an unflagged tile index. - /// This is important because CUDA will use tile indices from 0 to the + /// This is important because CUDA/HIP will use tile indices from 0 to the /// length of `unflagged_tile_xyzs`, but the beam code has dipole delays and /// dipole gains available for *all* tiles. So if tile 32 is flagged, any - /// CUDA thread with a tile index of 32 would naively get the flagged beam - /// info. This map would make tile index go to the next unflagged tile, + /// CUDA/HIP thread with a tile index of 32 would naively get the flagged + /// beam info. This map would make tile index go to the next unflagged tile, /// perhaps 33. tile_index_to_unflagged_tile_index_map: DevicePointer, - d_freqs: DevicePointer, - d_shapelet_basis_values: DevicePointer, + d_freqs: DevicePointer, + d_shapelet_basis_values: DevicePointer, point_power_law_radecs: Vec, - point_power_law_lmns: DevicePointer, + point_power_law_lmns: DevicePointer, /// Instrumental flux densities calculated at 150 MHz. - point_power_law_fds: DevicePointer, + point_power_law_fds: DevicePointer, /// Spectral indices. - point_power_law_sis: DevicePointer, + point_power_law_sis: DevicePointer, point_curved_power_law_radecs: Vec, - point_curved_power_law_lmns: DevicePointer, - pub(super) point_curved_power_law_fds: DevicePointer, - pub(super) point_curved_power_law_sis: DevicePointer, - point_curved_power_law_qs: DevicePointer, + point_curved_power_law_lmns: DevicePointer, + pub(super) point_curved_power_law_fds: DevicePointer, + pub(super) point_curved_power_law_sis: DevicePointer, + point_curved_power_law_qs: DevicePointer, point_list_radecs: Vec, - point_list_lmns: DevicePointer, + point_list_lmns: DevicePointer, /// Instrumental (i.e. XX, XY, YX, XX). - point_list_fds: DevicePointer, + point_list_fds: DevicePointer, gaussian_power_law_radecs: Vec, - gaussian_power_law_lmns: DevicePointer, + gaussian_power_law_lmns: DevicePointer, /// Instrumental flux densities calculated at 150 MHz. - gaussian_power_law_fds: DevicePointer, + gaussian_power_law_fds: DevicePointer, /// Spectral indices. - gaussian_power_law_sis: DevicePointer, - gaussian_power_law_gps: DevicePointer, + gaussian_power_law_sis: DevicePointer, + gaussian_power_law_gps: DevicePointer, gaussian_curved_power_law_radecs: Vec, - gaussian_curved_power_law_lmns: DevicePointer, - gaussian_curved_power_law_fds: DevicePointer, - gaussian_curved_power_law_sis: DevicePointer, - gaussian_curved_power_law_qs: DevicePointer, - gaussian_curved_power_law_gps: DevicePointer, + gaussian_curved_power_law_lmns: DevicePointer, + gaussian_curved_power_law_fds: DevicePointer, + gaussian_curved_power_law_sis: DevicePointer, + gaussian_curved_power_law_qs: DevicePointer, + gaussian_curved_power_law_gps: DevicePointer, gaussian_list_radecs: Vec, - gaussian_list_lmns: DevicePointer, + gaussian_list_lmns: DevicePointer, /// Instrumental (i.e. XX, XY, YX, XX). - gaussian_list_fds: DevicePointer, - gaussian_list_gps: DevicePointer, + gaussian_list_fds: DevicePointer, + gaussian_list_gps: DevicePointer, shapelet_power_law_radecs: Vec, - shapelet_power_law_lmns: DevicePointer, + shapelet_power_law_lmns: DevicePointer, /// Instrumental flux densities calculated at 150 MHz. - shapelet_power_law_fds: DevicePointer, + shapelet_power_law_fds: DevicePointer, /// Spectral indices. - shapelet_power_law_sis: DevicePointer, - shapelet_power_law_gps: DevicePointer, - shapelet_power_law_coeffs: DevicePointer, + shapelet_power_law_sis: DevicePointer, + shapelet_power_law_gps: DevicePointer, + shapelet_power_law_coeffs: DevicePointer, shapelet_power_law_coeff_lens: DevicePointer, shapelet_curved_power_law_radecs: Vec, - shapelet_curved_power_law_lmns: DevicePointer, - shapelet_curved_power_law_fds: DevicePointer, - shapelet_curved_power_law_sis: DevicePointer, - shapelet_curved_power_law_qs: DevicePointer, - shapelet_curved_power_law_gps: DevicePointer, - shapelet_curved_power_law_coeffs: DevicePointer, + shapelet_curved_power_law_lmns: DevicePointer, + shapelet_curved_power_law_fds: DevicePointer, + shapelet_curved_power_law_sis: DevicePointer, + shapelet_curved_power_law_qs: DevicePointer, + shapelet_curved_power_law_gps: DevicePointer, + shapelet_curved_power_law_coeffs: DevicePointer, shapelet_curved_power_law_coeff_lens: DevicePointer, shapelet_list_radecs: Vec, - shapelet_list_lmns: DevicePointer, + shapelet_list_lmns: DevicePointer, /// Instrumental (i.e. XX, XY, YX, XX). - shapelet_list_fds: DevicePointer, - shapelet_list_gps: DevicePointer, - shapelet_list_coeffs: DevicePointer, + shapelet_list_fds: DevicePointer, + shapelet_list_gps: DevicePointer, + shapelet_list_coeffs: DevicePointer, shapelet_list_coeff_lens: DevicePointer, } -impl<'a> SkyModellerCuda<'a> { +impl<'a> SkyModellerGpu<'a> { /// Given a source list, split the components into each component type (e.g. /// points, shapelets) and by each flux density type (e.g. list, power law), /// then copy them to a GPU ready for modelling. Where possible, list flux @@ -153,71 +153,71 @@ impl<'a> SkyModellerCuda<'a> { array_latitude_rad: f64, dut1: Duration, apply_precession: bool, - ) -> Result, ModelError> { + ) -> Result, ModelError> { let mut point_power_law_radecs: Vec = vec![]; - let mut point_power_law_lmns: Vec = vec![]; + let mut point_power_law_lmns: Vec = vec![]; let mut point_power_law_fds: Vec<_> = vec![]; let mut point_power_law_sis: Vec<_> = vec![]; let mut point_curved_power_law_radecs: Vec = vec![]; - let mut point_curved_power_law_lmns: Vec = vec![]; + let mut point_curved_power_law_lmns: Vec = vec![]; let mut point_curved_power_law_fds: Vec<_> = vec![]; let mut point_curved_power_law_sis: Vec<_> = vec![]; let mut point_curved_power_law_qs: Vec<_> = vec![]; let mut point_list_radecs: Vec = vec![]; - let mut point_list_lmns: Vec = vec![]; + let mut point_list_lmns: Vec = vec![]; let mut point_list_fds: Vec<&FluxDensityType> = vec![]; let mut gaussian_power_law_radecs: Vec = vec![]; - let mut gaussian_power_law_lmns: Vec = vec![]; + let mut gaussian_power_law_lmns: Vec = vec![]; let mut gaussian_power_law_fds: Vec<_> = vec![]; let mut gaussian_power_law_sis: Vec<_> = vec![]; - let mut gaussian_power_law_gps: Vec = vec![]; + let mut gaussian_power_law_gps: Vec = vec![]; let mut gaussian_curved_power_law_radecs: Vec = vec![]; - let mut gaussian_curved_power_law_lmns: Vec = vec![]; + let mut gaussian_curved_power_law_lmns: Vec = vec![]; let mut gaussian_curved_power_law_fds: Vec<_> = vec![]; let mut gaussian_curved_power_law_sis: Vec<_> = vec![]; let mut gaussian_curved_power_law_qs: Vec<_> = vec![]; - let mut gaussian_curved_power_law_gps: Vec = vec![]; + let mut gaussian_curved_power_law_gps: Vec = vec![]; let mut gaussian_list_radecs: Vec = vec![]; - let mut gaussian_list_lmns: Vec = vec![]; + let mut gaussian_list_lmns: Vec = vec![]; let mut gaussian_list_fds: Vec<&FluxDensityType> = vec![]; - let mut gaussian_list_gps: Vec = vec![]; + let mut gaussian_list_gps: Vec = vec![]; let mut shapelet_power_law_radecs: Vec = vec![]; - let mut shapelet_power_law_lmns: Vec = vec![]; + let mut shapelet_power_law_lmns: Vec = vec![]; let mut shapelet_power_law_fds: Vec<_> = vec![]; let mut shapelet_power_law_sis: Vec<_> = vec![]; - let mut shapelet_power_law_gps: Vec = vec![]; + let mut shapelet_power_law_gps: Vec = vec![]; let mut shapelet_power_law_coeffs: Vec<&[ShapeletCoeff]> = vec![]; let mut shapelet_curved_power_law_radecs: Vec = vec![]; - let mut shapelet_curved_power_law_lmns: Vec = vec![]; + let mut shapelet_curved_power_law_lmns: Vec = vec![]; let mut shapelet_curved_power_law_fds: Vec<_> = vec![]; let mut shapelet_curved_power_law_sis: Vec<_> = vec![]; let mut shapelet_curved_power_law_qs: Vec<_> = vec![]; - let mut shapelet_curved_power_law_gps: Vec = vec![]; + let mut shapelet_curved_power_law_gps: Vec = vec![]; let mut shapelet_curved_power_law_coeffs: Vec<&[ShapeletCoeff]> = vec![]; let mut shapelet_list_radecs: Vec = vec![]; - let mut shapelet_list_lmns: Vec = vec![]; + let mut shapelet_list_lmns: Vec = vec![]; let mut shapelet_list_fds: Vec<&FluxDensityType> = vec![]; - let mut shapelet_list_gps: Vec = vec![]; + let mut shapelet_list_gps: Vec = vec![]; let mut shapelet_list_coeffs: Vec<&[ShapeletCoeff]> = vec![]; - let jones_to_cuda_jones = |j: Jones| -> CudaJones { - CudaJones { - j00_re: j[0].re as CudaFloat, - j00_im: j[0].im as CudaFloat, - j01_re: j[1].re as CudaFloat, - j01_im: j[1].im as CudaFloat, - j10_re: j[2].re as CudaFloat, - j10_im: j[2].im as CudaFloat, - j11_re: j[3].re as CudaFloat, - j11_im: j[3].im as CudaFloat, + let jones_to_gpu_jones = |j: Jones| -> GpuJones { + GpuJones { + j00_re: j[0].re as GpuFloat, + j00_im: j[0].im as GpuFloat, + j01_re: j[1].re as GpuFloat, + j01_im: j[1].im as GpuFloat, + j10_re: j[2].re as GpuFloat, + j10_im: j[2].im as GpuFloat, + j11_re: j[3].re as GpuFloat, + j11_im: j[3].im as GpuFloat, } }; @@ -235,39 +235,39 @@ impl<'a> SkyModellerCuda<'a> { { let radec = comp.radec; let LmnRime { l, m, n } = comp.radec.to_lmn(phase_centre).prepare_for_rime(); - let lmn = cuda::LmnRime { - l: l as CudaFloat, - m: m as CudaFloat, - n: n as CudaFloat, + let lmn = gpu::LmnRime { + l: l as GpuFloat, + m: m as GpuFloat, + n: n as GpuFloat, }; match &comp.flux_type { FluxDensityType::PowerLaw { si, fd: _ } => { // Rather than using this PL's reference freq, use a pre- - // defined one, so the CUDA code doesn't need to keep track - // of all reference freqs. - let fd_at_150mhz = comp.estimate_at_freq(cuda::POWER_LAW_FD_REF_FREQ as _); + // defined one, so the the GPU code doesn't need to keep + // track of all reference freqs. + let fd_at_150mhz = comp.estimate_at_freq(gpu::POWER_LAW_FD_REF_FREQ as _); let inst_fd: Jones = fd_at_150mhz.to_inst_stokes(); - let cuda_inst_fd = jones_to_cuda_jones(inst_fd); + let gpu_inst_fd = jones_to_gpu_jones(inst_fd); match &comp.comp_type { ComponentType::Point => { point_power_law_radecs.push(radec); point_power_law_lmns.push(lmn); - point_power_law_fds.push(cuda_inst_fd); - point_power_law_sis.push(*si as CudaFloat); + point_power_law_fds.push(gpu_inst_fd); + point_power_law_sis.push(*si as GpuFloat); } ComponentType::Gaussian { maj, min, pa } => { - let gp = cuda::GaussianParams { - maj: *maj as CudaFloat, - min: *min as CudaFloat, - pa: *pa as CudaFloat, + let gp = gpu::GaussianParams { + maj: *maj as GpuFloat, + min: *min as GpuFloat, + pa: *pa as GpuFloat, }; gaussian_power_law_radecs.push(radec); gaussian_power_law_lmns.push(lmn); gaussian_power_law_gps.push(gp); - gaussian_power_law_fds.push(cuda_inst_fd); - gaussian_power_law_sis.push(*si as CudaFloat); + gaussian_power_law_fds.push(gpu_inst_fd); + gaussian_power_law_sis.push(*si as GpuFloat); } ComponentType::Shapelet { @@ -276,34 +276,34 @@ impl<'a> SkyModellerCuda<'a> { pa, coeffs, } => { - let gp = cuda::GaussianParams { - maj: *maj as CudaFloat, - min: *min as CudaFloat, - pa: *pa as CudaFloat, + let gp = gpu::GaussianParams { + maj: *maj as GpuFloat, + min: *min as GpuFloat, + pa: *pa as GpuFloat, }; shapelet_power_law_radecs.push(radec); shapelet_power_law_lmns.push(lmn); shapelet_power_law_gps.push(gp); shapelet_power_law_coeffs.push(coeffs); - shapelet_power_law_fds.push(cuda_inst_fd); - shapelet_power_law_sis.push(*si as CudaFloat); + shapelet_power_law_fds.push(gpu_inst_fd); + shapelet_power_law_sis.push(*si as GpuFloat); } }; } FluxDensityType::CurvedPowerLaw { si, fd, q } => { - let fd_at_150mhz = comp.estimate_at_freq(cuda::POWER_LAW_FD_REF_FREQ as _); + let fd_at_150mhz = comp.estimate_at_freq(gpu::POWER_LAW_FD_REF_FREQ as _); let inst_fd: Jones = fd_at_150mhz.to_inst_stokes(); - let cuda_inst_fd = jones_to_cuda_jones(inst_fd); + let gpu_inst_fd = jones_to_gpu_jones(inst_fd); // A new SI is needed when changing the reference freq. // Thanks Jack. #[allow(clippy::unnecessary_cast)] - let si = if fd.freq == cuda::POWER_LAW_FD_REF_FREQ as f64 { + let si = if fd.freq == gpu::POWER_LAW_FD_REF_FREQ as f64 { *si } else { #[allow(clippy::unnecessary_cast)] - let logratio = (fd.freq / cuda::POWER_LAW_FD_REF_FREQ as f64).ln(); + let logratio = (fd.freq / gpu::POWER_LAW_FD_REF_FREQ as f64).ln(); ((fd.i / fd_at_150mhz.i).ln() - q * logratio.powi(2)) / logratio }; @@ -311,23 +311,23 @@ impl<'a> SkyModellerCuda<'a> { ComponentType::Point => { point_curved_power_law_radecs.push(radec); point_curved_power_law_lmns.push(lmn); - point_curved_power_law_fds.push(cuda_inst_fd); - point_curved_power_law_sis.push(si as CudaFloat); - point_curved_power_law_qs.push(*q as CudaFloat); + point_curved_power_law_fds.push(gpu_inst_fd); + point_curved_power_law_sis.push(si as GpuFloat); + point_curved_power_law_qs.push(*q as GpuFloat); } ComponentType::Gaussian { maj, min, pa } => { - let gp = cuda::GaussianParams { - maj: *maj as CudaFloat, - min: *min as CudaFloat, - pa: *pa as CudaFloat, + let gp = gpu::GaussianParams { + maj: *maj as GpuFloat, + min: *min as GpuFloat, + pa: *pa as GpuFloat, }; gaussian_curved_power_law_radecs.push(radec); gaussian_curved_power_law_lmns.push(lmn); gaussian_curved_power_law_gps.push(gp); - gaussian_curved_power_law_fds.push(cuda_inst_fd); - gaussian_curved_power_law_sis.push(si as CudaFloat); - gaussian_curved_power_law_qs.push(*q as CudaFloat); + gaussian_curved_power_law_fds.push(gpu_inst_fd); + gaussian_curved_power_law_sis.push(si as GpuFloat); + gaussian_curved_power_law_qs.push(*q as GpuFloat); } ComponentType::Shapelet { @@ -336,18 +336,18 @@ impl<'a> SkyModellerCuda<'a> { pa, coeffs, } => { - let gp = cuda::GaussianParams { - maj: *maj as CudaFloat, - min: *min as CudaFloat, - pa: *pa as CudaFloat, + let gp = gpu::GaussianParams { + maj: *maj as GpuFloat, + min: *min as GpuFloat, + pa: *pa as GpuFloat, }; shapelet_curved_power_law_radecs.push(radec); shapelet_curved_power_law_lmns.push(lmn); shapelet_curved_power_law_gps.push(gp); shapelet_curved_power_law_coeffs.push(coeffs); - shapelet_curved_power_law_fds.push(cuda_inst_fd); - shapelet_curved_power_law_sis.push(si as CudaFloat); - shapelet_curved_power_law_qs.push(*q as CudaFloat); + shapelet_curved_power_law_fds.push(gpu_inst_fd); + shapelet_curved_power_law_sis.push(si as GpuFloat); + shapelet_curved_power_law_qs.push(*q as GpuFloat); } }; } @@ -360,10 +360,10 @@ impl<'a> SkyModellerCuda<'a> { } ComponentType::Gaussian { maj, min, pa } => { - let gp = cuda::GaussianParams { - maj: *maj as CudaFloat, - min: *min as CudaFloat, - pa: *pa as CudaFloat, + let gp = gpu::GaussianParams { + maj: *maj as GpuFloat, + min: *min as GpuFloat, + pa: *pa as GpuFloat, }; gaussian_list_radecs.push(radec); gaussian_list_lmns.push(lmn); @@ -377,10 +377,10 @@ impl<'a> SkyModellerCuda<'a> { pa, coeffs, } => { - let gp = cuda::GaussianParams { - maj: *maj as CudaFloat, - min: *min as CudaFloat, - pa: *pa as CudaFloat, + let gp = gpu::GaussianParams { + maj: *maj as GpuFloat, + min: *min as GpuFloat, + pa: *pa as GpuFloat, }; shapelet_list_radecs.push(radec); shapelet_list_lmns.push(lmn); @@ -394,13 +394,13 @@ impl<'a> SkyModellerCuda<'a> { let point_list_fds = get_instrumental_flux_densities(&point_list_fds, unflagged_fine_chan_freqs) - .mapv(jones_to_cuda_jones); + .mapv(jones_to_gpu_jones); let gaussian_list_fds = get_instrumental_flux_densities(&gaussian_list_fds, unflagged_fine_chan_freqs) - .mapv(jones_to_cuda_jones); + .mapv(jones_to_gpu_jones); let shapelet_list_fds = get_instrumental_flux_densities(&shapelet_list_fds, unflagged_fine_chan_freqs) - .mapv(jones_to_cuda_jones); + .mapv(jones_to_gpu_jones); let (shapelet_power_law_coeffs, shapelet_power_law_coeff_lens) = get_flattened_coeffs(shapelet_power_law_coeffs); @@ -409,16 +409,16 @@ impl<'a> SkyModellerCuda<'a> { let (shapelet_list_coeffs, shapelet_list_coeff_lens) = get_flattened_coeffs(shapelet_list_coeffs); - // Variables for CUDA. They're made flexible in their types for - // whichever precision is being used in the CUDA code. + // Variables for CUDA/HIP. They're made flexible in their types for + // whichever precision is being used. let (unflagged_fine_chan_freqs_ints, unflagged_fine_chan_freqs_floats): (Vec<_>, Vec<_>) = unflagged_fine_chan_freqs .iter() - .map(|&f| (f as u32, f as CudaFloat)) + .map(|&f| (f as u32, f as GpuFloat)) .unzip(); - let shapelet_basis_values: Vec = shapelets::SHAPELET_BASIS_VALUES + let shapelet_basis_values: Vec = shapelets::SHAPELET_BASIS_VALUES .iter() - .map(|&f| f as CudaFloat) + .map(|&f| f as GpuFloat) .collect(); let num_baselines = (unflagged_tile_xyzs.len() * (unflagged_tile_xyzs.len() - 1)) / 2; @@ -441,8 +441,8 @@ impl<'a> SkyModellerCuda<'a> { let d_tile_index_to_unflagged_tile_index_map = DevicePointer::copy_to_device(&tile_index_to_unflagged_tile_index_map)?; - Ok(SkyModellerCuda { - cuda_beam: beam.prepare_cuda_beam(&unflagged_fine_chan_freqs_ints)?, + Ok(SkyModellerGpu { + gpu_beam: beam.prepare_gpu_beam(&unflagged_fine_chan_freqs_ints)?, phase_centre, array_longitude: array_longitude_rad, @@ -557,8 +557,8 @@ impl<'a> SkyModellerCuda<'a> { /// This function is mostly used for testing. For a single timestep, over /// the already-provided baselines and frequencies, generate visibilities /// for each specified sky-model point-source component. The - /// `SkyModellerCuda` object *must* already have its UVW coordinates set; - /// see [`SkyModellerCuda::set_uvws`]. + /// `SkyModellerGpu` object *must* already have its UVW coordinates set; see + /// [`SkyModellerGpu::set_uvws`]. /// /// `lst_rad`: The local sidereal time in \[radians\]. /// @@ -568,8 +568,8 @@ impl<'a> SkyModellerCuda<'a> { &self, lst_rad: f64, array_latitude_rad: f64, - d_uvws: &DevicePointer, - d_beam_jones: &mut DevicePointer, + d_uvws: &DevicePointer, + d_beam_jones: &mut DevicePointer, d_vis_fb: &mut DevicePointer>, ) -> Result<(), ModelError> { if self.point_power_law_radecs.is_empty() @@ -580,23 +580,23 @@ impl<'a> SkyModellerCuda<'a> { } { - let (azs, zas): (Vec, Vec) = self + let (azs, zas): (Vec, Vec) = self .point_power_law_radecs .iter() .chain(self.point_curved_power_law_radecs.iter()) .chain(self.point_list_radecs.iter()) .map(|radec| { let azel = radec.to_hadec(lst_rad).to_azel(array_latitude_rad); - (azel.az as CudaFloat, azel.za() as CudaFloat) + (azel.az as GpuFloat, azel.za() as GpuFloat) }) .unzip(); d_beam_jones.realloc( - self.cuda_beam.get_num_unique_tiles() as usize - * self.cuda_beam.get_num_unique_freqs() as usize + self.gpu_beam.get_num_unique_tiles() as usize + * self.gpu_beam.get_num_unique_freqs() as usize * azs.len() - * std::mem::size_of::(), + * std::mem::size_of::(), )?; - self.cuda_beam.calc_jones_pair( + self.gpu_beam.calc_jones_pair( &azs, &zas, array_latitude_rad, @@ -604,9 +604,9 @@ impl<'a> SkyModellerCuda<'a> { )?; } - cuda_kernel_call!( - cuda::model_points, - &cuda::Points { + gpu_kernel_call!( + gpu::model_points, + &gpu::Points { num_power_laws: self .point_power_law_radecs .len() @@ -644,8 +644,8 @@ impl<'a> SkyModellerCuda<'a> { /// This function is mostly used for testing. For a single timestep, over /// the already-provided baselines and frequencies, generate visibilities /// for each specified sky-model Gaussian-source component. The - /// `SkyModellerCuda` object *must* already have its UVW coordinates set; - /// see [`SkyModellerCuda::set_uvws`]. + /// `SkyModellerGpu` object *must* already have its UVW coordinates set; see + /// [`SkyModellerGpu::set_uvws`]. /// /// `lst_rad`: The local sidereal time in \[radians\]. /// @@ -655,8 +655,8 @@ impl<'a> SkyModellerCuda<'a> { &self, lst_rad: f64, array_latitude_rad: f64, - d_uvws: &DevicePointer, - d_beam_jones: &mut DevicePointer, + d_uvws: &DevicePointer, + d_beam_jones: &mut DevicePointer, d_vis_fb: &mut DevicePointer>, ) -> Result<(), ModelError> { if self.gaussian_power_law_radecs.is_empty() @@ -667,23 +667,23 @@ impl<'a> SkyModellerCuda<'a> { } { - let (azs, zas): (Vec, Vec) = self + let (azs, zas): (Vec, Vec) = self .gaussian_power_law_radecs .iter() .chain(self.gaussian_curved_power_law_radecs.iter()) .chain(self.gaussian_list_radecs.iter()) .map(|radec| { let azel = radec.to_hadec(lst_rad).to_azel(array_latitude_rad); - (azel.az as CudaFloat, azel.za() as CudaFloat) + (azel.az as GpuFloat, azel.za() as GpuFloat) }) .unzip(); d_beam_jones.realloc( - self.cuda_beam.get_num_unique_tiles() as usize - * self.cuda_beam.get_num_unique_freqs() as usize + self.gpu_beam.get_num_unique_tiles() as usize + * self.gpu_beam.get_num_unique_freqs() as usize * azs.len() - * std::mem::size_of::(), + * std::mem::size_of::(), )?; - self.cuda_beam.calc_jones_pair( + self.gpu_beam.calc_jones_pair( &azs, &zas, array_latitude_rad, @@ -691,9 +691,9 @@ impl<'a> SkyModellerCuda<'a> { )?; } - cuda_kernel_call!( - cuda::model_gaussians, - &cuda::Gaussians { + gpu_kernel_call!( + gpu::model_gaussians, + &gpu::Gaussians { num_power_laws: self .gaussian_power_law_radecs .len() @@ -734,8 +734,8 @@ impl<'a> SkyModellerCuda<'a> { /// This function is mostly used for testing. For a single timestep, over /// the already-provided baselines and frequencies, generate visibilities /// for each specified sky-model Gaussian-source component. The - /// `SkyModellerCuda` object *must* already have its UVW coordinates set; - /// see [`SkyModellerCuda::set_uvws`]. + /// `SkyModellerGpu` object *must* already have its UVW coordinates set; see + /// [`SkyModellerGpu::set_uvws`]. /// /// `lst_rad`: The local sidereal time in \[radians\]. /// @@ -745,8 +745,8 @@ impl<'a> SkyModellerCuda<'a> { &self, lst_rad: f64, array_latitude_rad: f64, - d_uvws: &DevicePointer, - d_beam_jones: &mut DevicePointer, + d_uvws: &DevicePointer, + d_beam_jones: &mut DevicePointer, d_vis_fb: &mut DevicePointer>, ) -> Result<(), ModelError> { if self.shapelet_power_law_radecs.is_empty() @@ -757,23 +757,23 @@ impl<'a> SkyModellerCuda<'a> { } { - let (azs, zas): (Vec, Vec) = self + let (azs, zas): (Vec, Vec) = self .shapelet_power_law_radecs .iter() .chain(self.shapelet_curved_power_law_radecs.iter()) .chain(self.shapelet_list_radecs.iter()) .map(|radec| { let azel = radec.to_hadec(lst_rad).to_azel(array_latitude_rad); - (azel.az as CudaFloat, azel.za() as CudaFloat) + (azel.az as GpuFloat, azel.za() as GpuFloat) }) .unzip(); d_beam_jones.realloc( - self.cuda_beam.get_num_unique_tiles() as usize - * self.cuda_beam.get_num_unique_freqs() as usize + self.gpu_beam.get_num_unique_tiles() as usize + * self.gpu_beam.get_num_unique_freqs() as usize * azs.len() - * std::mem::size_of::(), + * std::mem::size_of::(), )?; - self.cuda_beam.calc_jones_pair( + self.gpu_beam.calc_jones_pair( &azs, &zas, array_latitude_rad, @@ -788,9 +788,9 @@ impl<'a> SkyModellerCuda<'a> { DevicePointer::copy_to_device(uvs.curved_power_law.as_slice().expect("is contiguous"))?; let list_uvs = DevicePointer::copy_to_device(uvs.list.as_slice().expect("is contiguous"))?; - cuda_kernel_call!( - cuda::model_shapelets, - &cuda::Shapelets { + gpu_kernel_call!( + gpu::model_shapelets, + &gpu::Shapelets { num_power_laws: self .shapelet_power_law_radecs .len() @@ -840,15 +840,15 @@ impl<'a> SkyModellerCuda<'a> { } /// This is a "specialised" version of [`SkyModeller::model_timestep_with`]; - /// it accepts CUDA buffers directly, saving some allocations. Unlike the + /// it accepts GPU buffers directly, saving some allocations. Unlike the /// aforementioned function, the incoming visibilities *are not* cleared; /// visibilities are accumulated. fn model_timestep_with( &self, lst_rad: f64, array_latitude_rad: f64, - d_uvws: &DevicePointer, - d_beam_jones: &mut DevicePointer, + d_uvws: &DevicePointer, + d_beam_jones: &mut DevicePointer, d_vis_fb: &mut DevicePointer>, ) -> Result<(), ModelError> { unsafe { @@ -867,8 +867,8 @@ impl<'a> SkyModellerCuda<'a> { fn get_lst_uvws_latitude( &self, timestamp: Epoch, - d_uvws: &mut DevicePointer, - ) -> Result<(f64, Vec, f64), CudaError> { + d_uvws: &mut DevicePointer, + ) -> Result<(f64, Vec, f64), GpuError> { let (lst, xyzs, latitude) = if self.precess { let precession_info = precess_time( self.array_longitude, @@ -905,30 +905,30 @@ impl<'a> SkyModellerCuda<'a> { }; let uvws = xyzs_to_cross_uvws(&xyzs, self.phase_centre.to_hadec(lst)); - let cuda_uvws: Vec = uvws + let gpu_uvws: Vec = uvws .iter() - .map(|&uvw| cuda::UVW { - u: uvw.u as CudaFloat, - v: uvw.v as CudaFloat, - w: uvw.w as CudaFloat, + .map(|&uvw| gpu::UVW { + u: uvw.u as GpuFloat, + v: uvw.v as GpuFloat, + w: uvw.w as GpuFloat, }) .collect(); - d_uvws.overwrite(&cuda_uvws)?; + d_uvws.overwrite(&gpu_uvws)?; Ok((lst, uvws, latitude)) } - /// Get a populated [`cuda::Addresses`]. This should never outlive `self`. - fn get_addresses(&self) -> cuda::Addresses { - cuda::Addresses { + /// Get a populated [`gpu::Addresses`]. This should never outlive `self`. + fn get_addresses(&self) -> gpu::Addresses { + gpu::Addresses { num_freqs: self.num_freqs, num_vis: self.num_baselines * self.num_freqs, num_baselines: self.num_baselines, d_freqs: self.d_freqs.get(), d_shapelet_basis_values: self.d_shapelet_basis_values.get(), - num_unique_beam_freqs: self.cuda_beam.get_num_unique_freqs(), - d_tile_map: self.cuda_beam.get_tile_map(), - d_freq_map: self.cuda_beam.get_freq_map(), + num_unique_beam_freqs: self.gpu_beam.get_num_unique_freqs(), + d_tile_map: self.gpu_beam.get_tile_map(), + d_freq_map: self.gpu_beam.get_freq_map(), d_tile_index_to_unflagged_tile_index_map: self .tile_index_to_unflagged_tile_index_map .get(), @@ -937,7 +937,7 @@ impl<'a> SkyModellerCuda<'a> { /// Shapelets need their own special kind of UVW coordinates. Each shapelet /// component's position is treated as the phase centre. This function uses - /// the FFI type [`cuda::ShapeletUV`]; the W isn't actually used in + /// the FFI type [`gpu::ShapeletUV`]; the W isn't actually used in /// computation, and omitting it is hopefully a little more efficient. /// /// The returned arrays have baseline as the first axis and component as the @@ -963,7 +963,7 @@ impl<'a> SkyModellerCuda<'a> { } } -impl<'a> SkyModeller<'a> for SkyModellerCuda<'a> { +impl<'a> SkyModeller<'a> for SkyModellerGpu<'a> { fn model_timestep( &self, timestamp: Epoch, @@ -1006,36 +1006,36 @@ impl<'a> SkyModeller<'a> for SkyModellerCuda<'a> { } } -/// The return type of [SkyModellerCuda::get_shapelet_uvs]. These arrays have +/// The return type of [SkyModellerGpu::get_shapelet_uvs]. These arrays have /// baseline as the first axis and component as the second. pub(super) struct ShapeletUVs { - power_law: Array2, - curved_power_law: Array2, - pub(super) list: Array2, + power_law: Array2, + curved_power_law: Array2, + pub(super) list: Array2, } fn get_shapelet_uvs_inner( radecs: &[RADec], lst_rad: f64, tile_xyzs: &[XyzGeodetic], -) -> Array2 { +) -> Array2 { let n = tile_xyzs.len(); let num_baselines = (n * (n - 1)) / 2; - let mut shapelet_uvs: Array2 = Array2::from_elem( + let mut shapelet_uvs: Array2 = Array2::from_elem( (num_baselines, radecs.len()), - cuda::ShapeletUV { u: 0.0, v: 0.0 }, + gpu::ShapeletUV { u: 0.0, v: 0.0 }, ); shapelet_uvs .axis_iter_mut(Axis(1)) .zip(radecs.iter()) .for_each(|(mut baseline_uv, radec)| { let hadec = radec.to_hadec(lst_rad); - let shapelet_uvs: Vec = xyzs_to_cross_uvws(tile_xyzs, hadec) + let shapelet_uvs: Vec = xyzs_to_cross_uvws(tile_xyzs, hadec) .into_iter() - .map(|uvw| cuda::ShapeletUV { - u: uvw.u as CudaFloat, - v: uvw.v as CudaFloat, + .map(|uvw| gpu::ShapeletUV { + u: uvw.u as GpuFloat, + v: uvw.v as GpuFloat, }) .collect(); baseline_uv.assign(&Array1::from(shapelet_uvs)); @@ -1049,8 +1049,8 @@ fn get_shapelet_uvs_inner( /// array-of-arrays). fn get_flattened_coeffs( shapelet_coeffs: Vec<&[ShapeletCoeff]>, -) -> (Vec, Vec) { - let mut coeffs: Vec = vec![]; +) -> (Vec, Vec) { + let mut coeffs: Vec = vec![]; let mut coeff_lengths = Vec::with_capacity(coeffs.len()); for coeffs_for_comp in shapelet_coeffs { @@ -1061,10 +1061,10 @@ fn get_flattened_coeffs( .expect("not bigger than i32::MAX"), ); for &ShapeletCoeff { n1, n2, value } in coeffs_for_comp { - coeffs.push(cuda::ShapeletCoeff { + coeffs.push(gpu::ShapeletCoeff { n1, n2, - value: value as CudaFloat, + value: value as GpuFloat, }) } } diff --git a/src/model/mod.rs b/src/model/mod.rs index f414aed6..2125bf32 100644 --- a/src/model/mod.rs +++ b/src/model/mod.rs @@ -5,17 +5,17 @@ //! Code to generate sky-model visibilities. mod cpu; -#[cfg(feature = "cuda")] -mod cuda; mod error; +#[cfg(any(feature = "cuda", feature = "hip"))] +mod gpu; pub(crate) mod shapelets; #[cfg(test)] mod tests; pub use cpu::SkyModellerCpu; -#[cfg(feature = "cuda")] -pub use cuda::SkyModellerCuda; pub(crate) use error::ModelError; +#[cfg(any(feature = "cuda", feature = "hip"))] +pub use gpu::SkyModellerGpu; use std::collections::HashSet; @@ -31,10 +31,10 @@ pub enum ModelDevice { /// when modelling. Cpu, - /// A CUDA-capable device is used for modelling. The precision depends on - /// the compile features used. - #[cfg(feature = "cuda")] - Cuda, + /// A CUDA- or HIP-capable device is used for modelling. The precision + /// depends on the compile features used. + #[cfg(any(feature = "cuda", feature = "hip"))] + Gpu, } impl ModelDevice { @@ -42,11 +42,11 @@ impl ModelDevice { match self { ModelDevice::Cpu => "double", - #[cfg(feature = "cuda-single")] - ModelDevice::Cuda => "single", + #[cfg(feature = "gpu-single")] + ModelDevice::Gpu => "single", - #[cfg(all(feature = "cuda", not(feature = "cuda-single")))] - ModelDevice::Cuda => "double", + #[cfg(all(any(feature = "cuda", feature = "hip"), not(feature = "gpu-single")))] + ModelDevice::Gpu => "double", } } @@ -56,11 +56,15 @@ impl ModelDevice { match self { ModelDevice::Cpu => Ok(get_cpu_info()), - #[cfg(feature = "cuda")] - ModelDevice::Cuda => { - let (device_info, driver_info) = crate::cuda::get_device_info()?; + #[cfg(any(feature = "cuda", feature = "hip"))] + ModelDevice::Gpu => { + let (device_info, driver_info) = crate::gpu::get_device_info()?; + #[cfg(feature = "cuda")] + let device_type = "CUDA"; + #[cfg(feature = "hip")] + let device_type = "HIP"; Ok(format!( - "{} (capability {}, {} MiB), CUDA driver {}, runtime {}", + "{} (capability {}, {} MiB), {device_type} driver {}, runtime {}", device_info.name, device_info.capability, device_info.total_global_mem, @@ -74,9 +78,9 @@ impl ModelDevice { #[derive(thiserror::Error, Debug)] pub(crate) enum DeviceError { - #[cfg(feature = "cuda")] + #[cfg(any(feature = "cuda", feature = "hip"))] #[error(transparent)] - Cuda(#[from] crate::cuda::CudaError), + Gpu(#[from] crate::gpu::GpuError), } /// Get a formatted string with information on the device used for modelling. @@ -127,7 +131,8 @@ pub trait SkyModeller<'a> { /// # Errors /// /// This function will return an error if there was a problem with - /// beam-response calculation or a CUDA error (if using CUDA functionality). + /// beam-response calculation or a CUDA/HIP error (if using CUDA/HIP + /// functionality). fn model_timestep( &self, timestamp: Epoch, @@ -151,7 +156,8 @@ pub trait SkyModeller<'a> { /// # Errors /// /// This function will return an error if there was a problem with - /// beam-response calculation or a CUDA error (if using CUDA functionality). + /// beam-response calculation or a CUDA/HIP error (if using CUDA/HIP + /// functionality). fn model_timestep_with( &self, timestamp: Epoch, @@ -160,14 +166,13 @@ pub trait SkyModeller<'a> { } /// Create a [`SkyModeller`] trait object that generates sky-model visibilities -/// on either the CPU or a CUDA-compatible GPU. This function conveniently -/// provides either a [`SkyModellerCpu`] or [`SkyModellerCuda`] depending on how -/// `hyperdrive` was compiled and the `use_cpu_for_modelling` flag. +/// on the CPU, a CUDA-compatible GPU or a HIP-compatible GPU, depending on the +/// value of [`MODEL_DEVICE`]. /// /// # Errors /// -/// This function will return an error if CUDA mallocs and copies can't be -/// executed, or if there was a problem in setting up a `BeamCUDA`. +/// This function will return an error if GPU mallocs and copies can't be +/// executed, or if there was a problem in setting up a `BeamGpu`. #[allow(clippy::too_many_arguments)] pub fn new_sky_modeller<'a>( beam: &'a dyn Beam, @@ -197,9 +202,9 @@ pub fn new_sky_modeller<'a>( apply_precession, ))), - #[cfg(feature = "cuda")] - ModelDevice::Cuda => { - let modeller = SkyModellerCuda::new( + #[cfg(any(feature = "cuda", feature = "hip"))] + ModelDevice::Gpu => { + let modeller = SkyModellerGpu::new( beam, source_list, pols, diff --git a/src/model/tests/cpu.rs b/src/model/tests/cpu.rs index 32f4938a..69d290e9 100644 --- a/src/model/tests/cpu.rs +++ b/src/model/tests/cpu.rs @@ -361,8 +361,8 @@ fn shapelet_multiple_components() { .components .shapelets .get_shapelet_uvws(0.0, &obs.xyzs); - // Set the w terms to 0, because they 0 on the CUDA side, and this way the - // CPU and CUDA tests can use the same test values. + // Set the w terms to 0, because they are 0 on the GPU side, and this way + // the CPU and GPU tests can use the same test values. shapelet_uvws.iter_mut().for_each(|uvw| uvw.w = 0.0); test_multiple_shapelet_components(visibilities.view(), shapelet_uvws.view(), 0.0, 0.0); diff --git a/src/model/tests/cuda.rs b/src/model/tests/gpu.rs similarity index 94% rename from src/model/tests/cuda.rs rename to src/model/tests/gpu.rs index 892fc0e2..11386a75 100644 --- a/src/model/tests/cuda.rs +++ b/src/model/tests/gpu.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. -//! Tests on generating sky-model visibilities with CUDA. +//! Tests on generating sky-model visibilities with a GPU. //! //! These tests use the same expected values as the CPU tests. @@ -10,7 +10,7 @@ use ndarray::prelude::*; use super::*; use crate::{ - cuda::{self, CudaFloat, DevicePointer}, + gpu::{self, DevicePointer, GpuFloat}, srclist::Source, }; @@ -40,9 +40,9 @@ macro_rules! test_modelling { d_vis_fb .copy_from_device(visibilities.as_slice_mut().unwrap()) .unwrap(); - #[cfg(not(feature = "cuda-single"))] + #[cfg(not(feature = "gpu-single"))] let epsilon = if $no_beam { 0.0 } else { 1e-15 }; - #[cfg(feature = "cuda-single")] + #[cfg(feature = "gpu-single")] let epsilon = if $no_beam { 6e-8 } else { 2e-3 }; $list_test_fn(visibilities.view(), epsilon); d_vis_fb.clear(); @@ -88,7 +88,7 @@ macro_rules! test_modelling { fn point_zenith_gpu() { test_modelling!( true, - SkyModellerCuda::model_points, + SkyModellerGpu::model_points, &POINT_ZENITH_LIST, &POINT_ZENITH_POWER_LAW, &POINT_ZENITH_CURVED_POWER_LAW, @@ -102,7 +102,7 @@ fn point_zenith_gpu() { fn point_off_zenith_gpu() { test_modelling!( true, - SkyModellerCuda::model_points, + SkyModellerGpu::model_points, &POINT_OFF_ZENITH_LIST, &POINT_OFF_ZENITH_POWER_LAW, &POINT_OFF_ZENITH_CURVED_POWER_LAW, @@ -116,7 +116,7 @@ fn point_off_zenith_gpu() { fn gaussian_zenith_gpu() { test_modelling!( true, - SkyModellerCuda::model_gaussians, + SkyModellerGpu::model_gaussians, &GAUSSIAN_ZENITH_LIST, &GAUSSIAN_ZENITH_POWER_LAW, &GAUSSIAN_ZENITH_CURVED_POWER_LAW, @@ -130,7 +130,7 @@ fn gaussian_zenith_gpu() { fn gaussian_off_zenith_gpu() { test_modelling!( true, - SkyModellerCuda::model_gaussians, + SkyModellerGpu::model_gaussians, &GAUSSIAN_OFF_ZENITH_LIST, &GAUSSIAN_OFF_ZENITH_POWER_LAW, &GAUSSIAN_OFF_ZENITH_CURVED_POWER_LAW, @@ -144,7 +144,7 @@ fn gaussian_off_zenith_gpu() { fn shapelet_zenith_gpu() { test_modelling!( true, - SkyModellerCuda::model_shapelets, + SkyModellerGpu::model_shapelets, &SHAPELET_ZENITH_LIST, &SHAPELET_ZENITH_POWER_LAW, &SHAPELET_ZENITH_CURVED_POWER_LAW, @@ -158,7 +158,7 @@ fn shapelet_zenith_gpu() { fn shapelet_off_zenith_gpu() { test_modelling!( true, - SkyModellerCuda::model_shapelets, + SkyModellerGpu::model_shapelets, &SHAPELET_OFF_ZENITH_LIST, &SHAPELET_OFF_ZENITH_POWER_LAW, &SHAPELET_OFF_ZENITH_CURVED_POWER_LAW, @@ -172,7 +172,7 @@ fn shapelet_off_zenith_gpu() { fn point_zenith_gpu_fee() { test_modelling!( false, - SkyModellerCuda::model_points, + SkyModellerGpu::model_points, &POINT_ZENITH_LIST, &POINT_ZENITH_POWER_LAW, &POINT_ZENITH_CURVED_POWER_LAW, @@ -186,7 +186,7 @@ fn point_zenith_gpu_fee() { fn point_off_zenith_gpu_fee() { test_modelling!( false, - SkyModellerCuda::model_points, + SkyModellerGpu::model_points, &POINT_OFF_ZENITH_LIST, &POINT_OFF_ZENITH_POWER_LAW, &POINT_OFF_ZENITH_CURVED_POWER_LAW, @@ -200,7 +200,7 @@ fn point_off_zenith_gpu_fee() { fn gaussian_zenith_gpu_fee() { test_modelling!( false, - SkyModellerCuda::model_gaussians, + SkyModellerGpu::model_gaussians, &GAUSSIAN_ZENITH_LIST, &GAUSSIAN_ZENITH_POWER_LAW, &GAUSSIAN_ZENITH_CURVED_POWER_LAW, @@ -214,7 +214,7 @@ fn gaussian_zenith_gpu_fee() { fn gaussian_off_zenith_gpu_fee() { test_modelling!( false, - SkyModellerCuda::model_gaussians, + SkyModellerGpu::model_gaussians, &GAUSSIAN_OFF_ZENITH_LIST, &GAUSSIAN_OFF_ZENITH_POWER_LAW, &GAUSSIAN_OFF_ZENITH_CURVED_POWER_LAW, @@ -228,7 +228,7 @@ fn gaussian_off_zenith_gpu_fee() { fn shapelet_zenith_gpu_fee() { test_modelling!( false, - SkyModellerCuda::model_shapelets, + SkyModellerGpu::model_shapelets, &SHAPELET_ZENITH_LIST, &SHAPELET_ZENITH_POWER_LAW, &SHAPELET_ZENITH_CURVED_POWER_LAW, @@ -242,7 +242,7 @@ fn shapelet_zenith_gpu_fee() { fn shapelet_off_zenith_gpu_fee() { test_modelling!( false, - SkyModellerCuda::model_shapelets, + SkyModellerGpu::model_shapelets, &SHAPELET_OFF_ZENITH_LIST, &SHAPELET_OFF_ZENITH_POWER_LAW, &SHAPELET_OFF_ZENITH_CURVED_POWER_LAW, @@ -256,7 +256,7 @@ fn shapelet_off_zenith_gpu_fee() { fn non_trivial_gaussian() { test_modelling!( false, - SkyModellerCuda::model_gaussians, + SkyModellerGpu::model_gaussians, &SourceList::from([( "list".to_string(), Source { @@ -497,9 +497,9 @@ fn gaussian_multiple_components() { .copy_from_device(visibilities.as_slice_mut().unwrap()) .unwrap(); - #[cfg(not(feature = "cuda-single"))] + #[cfg(not(feature = "gpu-single"))] test_multiple_gaussian_components(visibilities.view(), 0.0); - #[cfg(feature = "cuda-single")] + #[cfg(feature = "gpu-single")] test_multiple_gaussian_components(visibilities.view(), 5e-7); } @@ -543,15 +543,15 @@ fn shapelet_multiple_components() { let shapelet_uvs = modeller .get_shapelet_uvs(obs.lst) .list - .map(|&cuda::ShapeletUV { u, v }| UVW { - u: CudaFloat::into(u), - v: CudaFloat::into(v), + .map(|&gpu::ShapeletUV { u, v }| UVW { + u: GpuFloat::into(u), + v: GpuFloat::into(v), w: 0.0, }); - #[cfg(not(feature = "cuda-single"))] + #[cfg(not(feature = "gpu-single"))] test_multiple_shapelet_components(visibilities.view(), shapelet_uvs.view(), 0.0, 0.0); - #[cfg(feature = "cuda-single")] + #[cfg(feature = "gpu-single")] test_multiple_shapelet_components(visibilities.view(), shapelet_uvs.view(), 5e-7, 3e-8); } @@ -584,7 +584,7 @@ fn test_curved_power_law_changing_ref_freq() { obs.freqs.clear(); obs.freqs.push(150e6); let (modeller, _) = obs.get_gpu_modeller(&srclist); - let mut modeller_fds = [crate::cuda::CudaJones::default(); 1]; + let mut modeller_fds = [crate::gpu::GpuJones::default(); 1]; let mut modeller_sis = [0.0; 1]; modeller .point_curved_power_law_fds diff --git a/src/model/tests/mod.rs b/src/model/tests/mod.rs index f63303b7..fd21645f 100644 --- a/src/model/tests/mod.rs +++ b/src/model/tests/mod.rs @@ -11,8 +11,8 @@ //! highlighted. mod cpu; -#[cfg(feature = "cuda")] -mod cuda; +#[cfg(any(feature = "cuda", feature = "hip"))] +mod gpu; use approx::assert_abs_diff_eq; use marlu::{ @@ -25,10 +25,10 @@ use num_complex::Complex; use vec1::vec1; use super::*; -#[cfg(feature = "cuda")] -use crate::cuda::DevicePointer; -#[cfg(feature = "cuda")] -use crate::model::cuda::SkyModellerCuda; +#[cfg(any(feature = "cuda", feature = "hip"))] +use crate::gpu::DevicePointer; +#[cfg(any(feature = "cuda", feature = "hip"))] +use crate::model::gpu::SkyModellerGpu; use crate::{ beam::{create_beam_object, Delays}, srclist::{ @@ -313,13 +313,13 @@ impl ObsParams { ) } - #[cfg(feature = "cuda")] + #[cfg(any(feature = "cuda", feature = "hip"))] #[track_caller] fn get_gpu_modeller( &self, srclist: &SourceList, - ) -> (SkyModellerCuda, DevicePointer) { - let m = SkyModellerCuda::new( + ) -> (SkyModellerGpu, DevicePointer) { + let m = SkyModellerGpu::new( &*self.beam, srclist, Polarisations::default(), @@ -333,16 +333,16 @@ impl ObsParams { true, ) .unwrap(); - let cuda_uvws = self + let gpu_uvws = self .uvws .iter() - .map(|&uvw| crate::cuda::UVW { - u: uvw.u as crate::cuda::CudaFloat, - v: uvw.v as crate::cuda::CudaFloat, - w: uvw.w as crate::cuda::CudaFloat, + .map(|&uvw| crate::gpu::UVW { + u: uvw.u as crate::gpu::GpuFloat, + v: uvw.v as crate::gpu::GpuFloat, + w: uvw.w as crate::gpu::GpuFloat, }) .collect::>(); - let d_uvws = DevicePointer::copy_to_device(&cuda_uvws).unwrap(); + let d_uvws = DevicePointer::copy_to_device(&gpu_uvws).unwrap(); (m, d_uvws) } } diff --git a/src/params/vis_subtract.rs b/src/params/vis_subtract.rs index 56d4be13..30b05665 100644 --- a/src/params/vis_subtract.rs +++ b/src/params/vis_subtract.rs @@ -345,7 +345,7 @@ pub(crate) enum VisSubtractError { #[error(transparent)] IO(#[from] std::io::Error), - #[cfg(feature = "cuda")] + #[cfg(any(feature = "cuda", feature = "hip"))] #[error(transparent)] - Cuda(#[from] crate::cuda::CudaError), + Gpu(#[from] crate::gpu::GpuError), } diff --git a/src/solutions/mod.rs b/src/solutions/mod.rs index c0b57c69..f9d1e1f5 100644 --- a/src/solutions/mod.rs +++ b/src/solutions/mod.rs @@ -133,8 +133,7 @@ pub struct CalibrationSolutions { /// specified in lambdas to metres \[Hz\]. pub freq_centroid: Option, - /// What was used to model the visibilities? This is currently either - /// "CPU" or "CUDA GPU". + /// What was used to model the visibilities? pub modeller: Option, }