From ce5ff4a65e57265ddb80ba31091add167ab24ade Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg
Date: Mon, 9 Oct 2023 14:03:15 -0700
Subject: [PATCH 1/6] Add Device: CPU to all examples for reference (use default backend)

---
 examples/cavity/cavity_impedance.json | 1 +
 examples/cavity/cavity_pec.json       | 1 +
 examples/coaxial/coaxial_matched.json | 1 +
 examples/coaxial/coaxial_open.json    | 1 +
 examples/coaxial/coaxial_short.json   | 1 +
 examples/cpw/cpw_lumped_adaptive.json | 1 +
 examples/cpw/cpw_lumped_uniform.json  | 1 +
 examples/cpw/cpw_wave_adaptive.json   | 1 +
 examples/cpw/cpw_wave_uniform.json    | 1 +
 examples/rings/rings.json             | 1 +
 examples/spheres/spheres.json         | 1 +
 11 files changed, 11 insertions(+)

diff --git a/examples/cavity/cavity_impedance.json b/examples/cavity/cavity_impedance.json
index 9f6e5e133..c4e6928c0 100644
--- a/examples/cavity/cavity_impedance.json
+++ b/examples/cavity/cavity_impedance.json
@@ -49,6 +49,7 @@
   "Solver":
   {
     "Order": 4,
+    "Device": "CPU",
     "Eigenmode":
     {
       "N": 15,
diff --git a/examples/cavity/cavity_pec.json b/examples/cavity/cavity_pec.json
index 489244959..35309bad1 100644
--- a/examples/cavity/cavity_pec.json
+++ b/examples/cavity/cavity_pec.json
@@ -46,6 +46,7 @@
   "Solver":
   {
     "Order": 4,
+    "Device": "CPU",
     "Eigenmode":
     {
       "N": 15,
diff --git a/examples/coaxial/coaxial_matched.json b/examples/coaxial/coaxial_matched.json
index 977db50b5..300b1a110 100644
--- a/examples/coaxial/coaxial_matched.json
+++ b/examples/coaxial/coaxial_matched.json
@@ -48,6 +48,7 @@
   "Solver":
   {
     "Order": 3,
+    "Device": "CPU",
     "Transient":
     {
       "Type": "GeneralizedAlpha",
diff --git a/examples/coaxial/coaxial_open.json b/examples/coaxial/coaxial_open.json
index e1f5df9f2..da77bf3e6 100644
--- a/examples/coaxial/coaxial_open.json
+++ b/examples/coaxial/coaxial_open.json
@@ -46,6 +46,7 @@
   "Solver":
   {
     "Order": 3,
+    "Device": "CPU",
     "Transient":
     {
       "Type": "GeneralizedAlpha",
diff --git a/examples/coaxial/coaxial_short.json b/examples/coaxial/coaxial_short.json
index 30feaffcc..b8522890f 100644
--- a/examples/coaxial/coaxial_short.json
+++ b/examples/coaxial/coaxial_short.json
@@ -42,6 +42,7 @@
   "Solver":
   {
     "Order": 3,
+    "Device": "CPU",
     "Transient":
     {
       "Type": "GeneralizedAlpha",
diff --git a/examples/cpw/cpw_lumped_adaptive.json b/examples/cpw/cpw_lumped_adaptive.json
index 37815c6a3..b5b44ad9b 100644
--- a/examples/cpw/cpw_lumped_adaptive.json
+++ b/examples/cpw/cpw_lumped_adaptive.json
@@ -164,6 +164,7 @@
   "Solver":
   {
     "Order": 2,
+    "Device": "CPU",
     "Driven":
     {
       "MinFreq": 2.0, // GHz
diff --git a/examples/cpw/cpw_lumped_uniform.json b/examples/cpw/cpw_lumped_uniform.json
index 19d84c718..180e613ae 100644
--- a/examples/cpw/cpw_lumped_uniform.json
+++ b/examples/cpw/cpw_lumped_uniform.json
@@ -164,6 +164,7 @@
   "Solver":
   {
     "Order": 2,
+    "Device": "CPU",
     "Driven":
     {
       "MinFreq": 2.0, // GHz
diff --git a/examples/cpw/cpw_wave_adaptive.json b/examples/cpw/cpw_wave_adaptive.json
index c40d6cefd..a1cc9973b 100644
--- a/examples/cpw/cpw_wave_adaptive.json
+++ b/examples/cpw/cpw_wave_adaptive.json
@@ -128,6 +128,7 @@
   "Solver":
   {
     "Order": 2,
+    "Device": "CPU",
     "Driven":
     {
       "MinFreq": 2.0, // GHz
diff --git a/examples/cpw/cpw_wave_uniform.json b/examples/cpw/cpw_wave_uniform.json
index 46b9c30e0..5a24760fd 100644
--- a/examples/cpw/cpw_wave_uniform.json
+++ b/examples/cpw/cpw_wave_uniform.json
@@ -128,6 +128,7 @@
   "Solver":
   {
     "Order": 2,
+    "Device": "CPU",
     "Driven":
     {
       "MinFreq": 2.0, // GHz
diff --git a/examples/rings/rings.json b/examples/rings/rings.json
index 7e7ea2c34..668b82101 100644
--- a/examples/rings/rings.json
+++ b/examples/rings/rings.json
@@ -78,6 +78,7 @@
   "Solver":
   {
     "Order": 2,
+    "Device": "CPU",
     "Magnetostatic":
     {
       "Save": 2
diff --git a/examples/spheres/spheres.json b/examples/spheres/spheres.json
index 6a0c93edd..0f672b089 100644
--- a/examples/spheres/spheres.json
+++ b/examples/spheres/spheres.json
@@ -74,6 +74,7 @@
   "Solver":
   {
     "Order": 3,
+    "Device": "CPU",
     "Electrostatic":
     {
       "Save": 2

From d4ecf4ef48c3331b14a75486b5c02c6c49bdb9c2 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg
Date: Mon, 9 Oct 2023 14:42:35 -0700
Subject: [PATCH 2/6] Add documentation for GPU support and partial assembly

---
 CHANGELOG.md                  |  6 ++++++
 README.md                     |  4 ++++
 docs/make.jl                  |  3 ++-
 docs/src/guide/guide.md       |  1 +
 docs/src/guide/parallelism.md | 40 +++++++++++++++++++++++++++++++++++
 docs/src/index.md             |  4 ++++
 docs/src/install.md           |  9 ++++++++
 7 files changed, 66 insertions(+), 1 deletion(-)
 create mode 100644 docs/src/guide/parallelism.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1f18f9b03..dce9a584c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,12 @@ The format of this changelog is based on
   - Added documentation for various timer categories and improved timing breakdown of
     various sections of a simulation.
   - Fixed bug in implementation of numeric wave ports for driven simulations.
+  - Added GPU support for *Palace* via its dependencies, and added the
+    `config["Solver"]["Device"]` and `config["Solver"]["Backend"]` options for runtime
+    configuration of the MFEM device (`"CPU"` or `"GPU"`) and libCEED backend, with suitable
+    defaults for users.
+  - Added a new section to the documentation on
+    [Parallelism and GPU support](https://awslabs.github.io/palace/dev/guide/parallelism/).

 ## [0.12.0] - 2023-12-21

diff --git a/README.md b/README.md
index 1b1f61f84..2b93d537a 100644
--- a/README.md
+++ b/README.md
@@ -39,6 +39,9 @@ the frequency or time domain, using the
     [high-order operator partial assembly](https://mfem.org/performance/), parallel sparse
     direct solvers, and algebraic multigrid (AMG) preconditioners, for fast performance on
     platforms ranging from laptops to HPC systems.
+  - Support for hardware acceleration using NVIDIA or AMD GPUs, including multi-GPU
+    parallelism, using pure CUDA and HIP code as well as [MAGMA](https://icl.utk.edu/magma/)
+    and other libraries.

 ## Getting started

@@ -62,6 +65,7 @@ System requirements:
   - C and Fortran (optional) compilers for dependency builds
   - MPI distribution
   - BLAS, LAPACK libraries
+  - CUDA Toolkit or ROCm installation (optional, for GPU support only)

 ## Documentation

diff --git a/docs/make.jl b/docs/make.jl
index 7db158111..d518706c9 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -23,7 +23,8 @@ makedocs(
             "guide/problem.md",
             "guide/model.md",
             "guide/boundaries.md",
-            "guide/postprocessing.md"
+            "guide/postprocessing.md",
+            "guide/parallelism.md"
         ],
         "Configuration File" => Any[
             "config/config.md",
diff --git a/docs/src/guide/guide.md b/docs/src/guide/guide.md
index d3183c907..f5e7fc879 100644
--- a/docs/src/guide/guide.md
+++ b/docs/src/guide/guide.md
@@ -14,3 +14,4 @@ which can be performed with *Palace* and the various features available in the s
   - [Simulation Models](model.md)
   - [Boundary Conditions](boundaries.md)
   - [Postprocessing and Visualization](postprocessing.md)
+  - [Parallelism and GPU Support](parallelism.md)
diff --git a/docs/src/guide/parallelism.md b/docs/src/guide/parallelism.md
new file mode 100644
index 000000000..81baad27f
--- /dev/null
+++ b/docs/src/guide/parallelism.md
@@ -0,0 +1,40 @@
+```@raw html
+
+
+```
+
+# Parallelism and GPU Support
+
+*Palace* employs multiple types of parallelism in an attempt to maximize performance across
+a wide range of deployment possibilities. The first is MPI-based distributed-memory
+parallelism. This is controlled using the `-np` command line flag as outlined in
+[Running *Palace*](../run.md).
+
+Shared-memory parallelism using OpenMP is also available. To enable this, the
+`-DPALACE_WITH_OPENMP=ON` option should be specified at configure time. At runtime, the
+number of threads is configured with the `-nt` argument to the `palace` executable, or by
+setting the [`OMP_NUM_THREADS`](https://www.openmp.org/spec-html/5.0/openmpse50.html)
+environment variable.
+
+Lastly, *Palace* supports GPU-acceleration using NVIDIA and AMD GPUs, activated with the
+build options `-DPALACE_WITH_CUDA=ON` and `-DPALACE_WITH_HIP=ON`, respectively. At runtime,
+the [`config["Solver"]["Device"]`](../config/solver.md#config%5B%22Solver%22%5D) parameter in the
+configuration file can be set to `"CPU"` (the default) or `"GPU"` in order to configure
+*Palace* and MFEM to use the available GPU(s). The [`config["Solver"]["Backend"]`]
+(../config/solver.md#config["Solver"]) parameter, on the other hand, controls the [libCEED
+backend](https://libceed.org/en/latest/gettingstarted/#backends). Users typically do not
+need to provide a value for this option and can instead rely on *Palace*'s default, which
+selects the most appropriate backend for the given value of [`config["Solver"]["Device"]`]
+(../config/solver.md#config["Solver"]).
+
+In order to take full advantage of the performance benefits made available by GPU-
+acceleration, it is recommended to make use of [operator partial assembly]
+(https://mfem.org/performance/), activated when the value of
+[`config["Solver"]["PartialAssemblyOrder"]`](../config/solver.md#config%5B%22Solver%22%5D) is
+less than [`config["Solver"]["Order"]`](../config/solver.md#config%5B%22Solver%22%5D). This feature
+avoids assembling a global sparse matrix and instead makes use of data structures for
+operators which lend themselves to more efficient asymptotic storage and application costs.
+See also [https://libceed.org/en/latest/intro/](https://libceed.org/en/latest/intro/) for
+more details. Partial assembly in *Palace* supports mixed meshes including both tensor
+product elements (hexahedra and quadrilaterals) as well as non-tensor product elements
+(tetrahedra, prisms, pyramids, and triangles).
diff --git a/docs/src/index.md b/docs/src/index.md
index 306c6505e..931666587 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -42,6 +42,10 @@ the frequency or time domain, using the
     [high-order operator partial assembly](https://mfem.org/performance/), parallel sparse
     direct solvers, and algebraic multigrid (AMG) preconditioners, for fast performance on
     platforms ranging from laptops to HPC systems.
+  - Support for
+    [hardware acceleration using NVIDIA or AMD GPUs](https://libceed.org/en/latest/intro/),
+    including multi-GPU parallelism, using pure CUDA and HIP code as well as
+    [MAGMA](https://icl.utk.edu/magma/) and other libraries.

 ## Contents

diff --git a/docs/src/install.md b/docs/src/install.md
index f947e5296..819e017d9 100644
--- a/docs/src/install.md
+++ b/docs/src/install.md
@@ -56,6 +56,9 @@ A build from source requires the following prerequisites installed on your syste
   - C and Fortran (optional) compilers for dependency builds
   - MPI distribution
   - BLAS, LAPACK libraries (described below in [Math libraries](#Math-libraries))
+  - [CUDA Toolkit](https://www.amd.com/en/graphics/servers-solutions-rocm) or [ROCm]
+    (https://developer.nvidia.com/cuda-toolkit) installation (optional, for GPU support
+    only)

 In addition, builds from source require the following system packages which are typically
 already installed and are available from most package managers (`apt`, `dnf`, `brew`, etc.):
@@ -101,6 +104,9 @@ The *Palace* build respects standard CMake variables, including:
     desired compilers.
   - `CMAKE_CXX_FLAGS`, `CMAKE_C_FLAGS`, and `CMAKE_Fortran_FLAGS` which define the
     corresponding compiler flags.
+  - `CMAKE_CUDA_COMPILER`, `CMAKE_CUDA_FLAGS`, `CMAKE_CUDA_ARCHITECTURES`, and the
+    corresponding `CMAKE_HIP_COMPILER`, `CMAKE_HIP_FLAGS`, and `CMAKE_HIP_ARCHITECTURES` for
+    GPU-accelerated builds with CUDA or HIP.
   - `CMAKE_INSTALL_PREFIX` which specifies the path for installation (if none is provided,
     defaults to ``).
   - `CMAKE_BUILD_TYPE` which defines the build type such as `Release`, `Debug`,
@@ -116,6 +122,9 @@ Additional build options are (with default values in brackets):

   - `PALACE_WITH_64BIT_INT [OFF]` : Build with 64-bit integer support
   - `PALACE_WITH_OPENMP [OFF]` : Use OpenMP for shared-memory parallelism
+  - `PALACE_WITH_CUDA [OFF]` : Use CUDA for NVIDIA GPU support
+  - `PALACE_WITH_HIP [OFF]` : Use HIP for AMD or NVIDIA GPU support
+  - `PALACE_WITH_GPU_AWARE_MPI [OFF]` : Option to set if MPI distribution is GPU aware
   - `PALACE_WITH_SUPERLU [ON]` : Build with SuperLU_DIST sparse direct solver
   - `PALACE_WITH_STRUMPACK [OFF]` : Build with STRUMPACK sparse direct solver
   - `PALACE_WITH_MUMPS [OFF]` : Build with MUMPS sparse direct solver

From ed5a6e83f9f72bcfcecacbdc991a9ba4d9d6efc5 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg
Date: Tue, 24 Oct 2023 15:16:51 -0700
Subject: [PATCH 3/6] Fix a missing link

---
 docs/src/guide/postprocessing.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/guide/postprocessing.md b/docs/src/guide/postprocessing.md
index b67cc60a2..b2151a964 100644
--- a/docs/src/guide/postprocessing.md
+++ b/docs/src/guide/postprocessing.md
@@ -70,7 +70,7 @@ These include:
 ## Boundary postprocessing

 Boundary postprocessing capabilities are enabled by including objects under
-`config["Boundaries"]["Postprocessing"]`](../config/boundaries.md) in the configuration
+[`config["Boundaries"]["Postprocessing"]`](../config/boundaries.md) in the configuration
 file. These include:

   - [`config["Boundaries"]["Postprocessing"]["Capacitance"]`](../config/boundaries.md#boundaries%5B%22Postprocessing%22%5D%5B%22Capacitance%22%5D) :

From 6d797110510179867b79b070531d5315879091e1 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg
Date: Thu, 18 Jan 2024 09:58:19 -0800
Subject: [PATCH 4/6] make format

---
 docs/src/guide/parallelism.md | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/docs/src/guide/parallelism.md b/docs/src/guide/parallelism.md
index 81baad27f..a0a5c57da 100644
--- a/docs/src/guide/parallelism.md
+++ b/docs/src/guide/parallelism.md
@@ -18,23 +18,25 @@ environment variable.

 Lastly, *Palace* supports GPU-acceleration using NVIDIA and AMD GPUs, activated with the
 build options `-DPALACE_WITH_CUDA=ON` and `-DPALACE_WITH_HIP=ON`, respectively. At runtime,
-the [`config["Solver"]["Device"]`](../config/solver.md#config%5B%22Solver%22%5D) parameter in the
-configuration file can be set to `"CPU"` (the default) or `"GPU"` in order to configure
-*Palace* and MFEM to use the available GPU(s). The [`config["Solver"]["Backend"]`]
-(../config/solver.md#config["Solver"]) parameter, on the other hand, controls the [libCEED
-backend](https://libceed.org/en/latest/gettingstarted/#backends). Users typically do not
-need to provide a value for this option and can instead rely on *Palace*'s default, which
-selects the most appropriate backend for the given value of [`config["Solver"]["Device"]`]
-(../config/solver.md#config["Solver"]).
+the [`config["Solver"]["Device"]`](../config/solver.md#config%5B%22Solver%22%5D) parameter
+in the configuration file can be set to `"CPU"` (the default) or `"GPU"` in order to
+configure *Palace* and MFEM to use the available GPU(s). The
+[`config["Solver"]["Backend"]`](../config/solver.md#config%5B%22Solver%22%5D) parameter, on
+the other hand, controls the
+[libCEED backend](https://libceed.org/en/latest/gettingstarted/#backends). Users typically
+do not need to provide a value for this option and can instead rely on *Palace*'s default,
+which selects the most appropriate backend for the given value of
+[`config["Solver"]["Device"]`](../config/solver.md#config%5B%22Solver%22%5D).

 In order to take full advantage of the performance benefits made available by GPU-
 acceleration, it is recommended to make use of [operator partial assembly]
 (https://mfem.org/performance/), activated when the value of
-[`config["Solver"]["PartialAssemblyOrder"]`](../config/solver.md#config%5B%22Solver%22%5D) is
-less than [`config["Solver"]["Order"]`](../config/solver.md#config%5B%22Solver%22%5D). This feature
-avoids assembling a global sparse matrix and instead makes use of data structures for
-operators which lend themselves to more efficient asymptotic storage and application costs.
-See also [https://libceed.org/en/latest/intro/](https://libceed.org/en/latest/intro/) for
-more details. Partial assembly in *Palace* supports mixed meshes including both tensor
-product elements (hexahedra and quadrilaterals) as well as non-tensor product elements
+[`config["Solver"]["PartialAssemblyOrder"]`](../config/solver.md#config%5B%22Solver%22%5D)
+is less than [`config["Solver"]["Order"]`](../config/solver.md#config%5B%22Solver%22%5D).
+This feature avoids assembling a global sparse matrix and instead makes use of data
+structures for operators which lend themselves to more efficient asymptotic storage and
+application costs. See also
+[https://libceed.org/en/latest/intro/](https://libceed.org/en/latest/intro/) for more
+details. Partial assembly in *Palace* supports mixed meshes including both tensor product
+elements (hexahedra and quadrilaterals) as well as non-tensor product elements
 (tetrahedra, prisms, pyramids, and triangles).

From abcf2ef17b55e4a71ff87efca981ec9c50e837c7 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg
Date: Mon, 26 Feb 2024 11:57:42 -0800
Subject: [PATCH 5/6] Address PR feedback: Fix link issues

---
 docs/src/install.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/src/install.md b/docs/src/install.md
index 819e017d9..fc5258bce 100644
--- a/docs/src/install.md
+++ b/docs/src/install.md
@@ -56,8 +56,8 @@ A build from source requires the following prerequisites installed on your syste
   - C and Fortran (optional) compilers for dependency builds
   - MPI distribution
   - BLAS, LAPACK libraries (described below in [Math libraries](#Math-libraries))
-  - [CUDA Toolkit](https://www.amd.com/en/graphics/servers-solutions-rocm) or [ROCm]
-    (https://developer.nvidia.com/cuda-toolkit) installation (optional, for GPU support
+  - [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) or
+    [ROCm](https://rocm.docs.amd.com/en/latest/) installation (optional, for GPU support
     only)

 In addition, builds from source require the following system packages which are typically

From a79d79eef77aea390c8ae12248358f4483bf52f6 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg
Date: Mon, 26 Feb 2024 12:01:24 -0800
Subject: [PATCH 6/6] Fix a missed hyperlink broken by newline

---
 docs/src/guide/parallelism.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/src/guide/parallelism.md b/docs/src/guide/parallelism.md
index a0a5c57da..c04f54b4f 100644
--- a/docs/src/guide/parallelism.md
+++ b/docs/src/guide/parallelism.md
@@ -29,8 +29,8 @@ which selects the most appropriate backend for the given value of
 [`config["Solver"]["Device"]`](../config/solver.md#config%5B%22Solver%22%5D).

 In order to take full advantage of the performance benefits made available by GPU-
-acceleration, it is recommended to make use of [operator partial assembly]
-(https://mfem.org/performance/), activated when the value of
+acceleration, it is recommended to make use of
+[operator partial assembly](https://mfem.org/performance/), activated when the value of
 [`config["Solver"]["PartialAssemblyOrder"]`](../config/solver.md#config%5B%22Solver%22%5D)
 is less than [`config["Solver"]["Order"]`](../config/solver.md#config%5B%22Solver%22%5D).
 This feature avoids assembling a global sparse matrix and instead makes use of data
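For readers skimming the series, a minimal sketch of how the documented options fit together in a `config["Solver"]` block is shown below. The `"GPU"` device value comes straight from the new documentation; the explicit `"Backend"` string is only an illustrative libCEED resource (the guide recommends leaving it unset so *Palace* picks a suitable default), and the surrounding keys mirror the `examples/cavity` configuration touched in the first patch.

```json
{
  "Solver":
  {
    "Order": 4,
    "Device": "GPU",  // "CPU" is the default written into the examples above
    "Backend": "/gpu/cuda/magma",  // illustrative libCEED backend string; normally omitted
    "Eigenmode":
    {
      "N": 15
    }
  }
}
```

Only the `"Solver"` section is sketched here; the rest of the example configuration is unchanged, and runs are launched as before with the `-np` (MPI processes) and, for OpenMP builds, `-nt` (threads) flags described in the new parallelism guide page.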