From ce5ff4a65e57265ddb80ba31091add167ab24ade Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg
Date: Mon, 9 Oct 2023 14:03:15 -0700
Subject: [PATCH 1/6] Add Device: CPU to all examples for reference (use default backend)

---
 examples/cavity/cavity_impedance.json | 1 +
 examples/cavity/cavity_pec.json       | 1 +
 examples/coaxial/coaxial_matched.json | 1 +
 examples/coaxial/coaxial_open.json    | 1 +
 examples/coaxial/coaxial_short.json   | 1 +
 examples/cpw/cpw_lumped_adaptive.json | 1 +
 examples/cpw/cpw_lumped_uniform.json  | 1 +
 examples/cpw/cpw_wave_adaptive.json   | 1 +
 examples/cpw/cpw_wave_uniform.json    | 1 +
 examples/rings/rings.json             | 1 +
 examples/spheres/spheres.json         | 1 +
 11 files changed, 11 insertions(+)

diff --git a/examples/cavity/cavity_impedance.json b/examples/cavity/cavity_impedance.json
index 9f6e5e133..c4e6928c0 100644
--- a/examples/cavity/cavity_impedance.json
+++ b/examples/cavity/cavity_impedance.json
@@ -49,6 +49,7 @@
   "Solver":
   {
     "Order": 4,
+    "Device": "CPU",
     "Eigenmode":
     {
       "N": 15,
diff --git a/examples/cavity/cavity_pec.json b/examples/cavity/cavity_pec.json
index 489244959..35309bad1 100644
--- a/examples/cavity/cavity_pec.json
+++ b/examples/cavity/cavity_pec.json
@@ -46,6 +46,7 @@
   "Solver":
   {
     "Order": 4,
+    "Device": "CPU",
     "Eigenmode":
     {
       "N": 15,
diff --git a/examples/coaxial/coaxial_matched.json b/examples/coaxial/coaxial_matched.json
index 977db50b5..300b1a110 100644
--- a/examples/coaxial/coaxial_matched.json
+++ b/examples/coaxial/coaxial_matched.json
@@ -48,6 +48,7 @@
   "Solver":
   {
     "Order": 3,
+    "Device": "CPU",
     "Transient":
     {
       "Type": "GeneralizedAlpha",
diff --git a/examples/coaxial/coaxial_open.json b/examples/coaxial/coaxial_open.json
index e1f5df9f2..da77bf3e6 100644
--- a/examples/coaxial/coaxial_open.json
+++ b/examples/coaxial/coaxial_open.json
@@ -46,6 +46,7 @@
   "Solver":
   {
     "Order": 3,
+    "Device": "CPU",
     "Transient":
     {
       "Type": "GeneralizedAlpha",
diff --git a/examples/coaxial/coaxial_short.json b/examples/coaxial/coaxial_short.json
index 30feaffcc..b8522890f 100644
--- a/examples/coaxial/coaxial_short.json
+++ b/examples/coaxial/coaxial_short.json
@@ -42,6 +42,7 @@
   "Solver":
   {
     "Order": 3,
+    "Device": "CPU",
     "Transient":
     {
       "Type": "GeneralizedAlpha",
diff --git a/examples/cpw/cpw_lumped_adaptive.json b/examples/cpw/cpw_lumped_adaptive.json
index 37815c6a3..b5b44ad9b 100644
--- a/examples/cpw/cpw_lumped_adaptive.json
+++ b/examples/cpw/cpw_lumped_adaptive.json
@@ -164,6 +164,7 @@
   "Solver":
   {
     "Order": 2,
+    "Device": "CPU",
     "Driven":
     {
       "MinFreq": 2.0, // GHz
diff --git a/examples/cpw/cpw_lumped_uniform.json b/examples/cpw/cpw_lumped_uniform.json
index 19d84c718..180e613ae 100644
--- a/examples/cpw/cpw_lumped_uniform.json
+++ b/examples/cpw/cpw_lumped_uniform.json
@@ -164,6 +164,7 @@
   "Solver":
   {
     "Order": 2,
+    "Device": "CPU",
     "Driven":
     {
       "MinFreq": 2.0, // GHz
diff --git a/examples/cpw/cpw_wave_adaptive.json b/examples/cpw/cpw_wave_adaptive.json
index c40d6cefd..a1cc9973b 100644
--- a/examples/cpw/cpw_wave_adaptive.json
+++ b/examples/cpw/cpw_wave_adaptive.json
@@ -128,6 +128,7 @@
   "Solver":
   {
     "Order": 2,
+    "Device": "CPU",
     "Driven":
     {
       "MinFreq": 2.0, // GHz
diff --git a/examples/cpw/cpw_wave_uniform.json b/examples/cpw/cpw_wave_uniform.json
index 46b9c30e0..5a24760fd 100644
--- a/examples/cpw/cpw_wave_uniform.json
+++ b/examples/cpw/cpw_wave_uniform.json
@@ -128,6 +128,7 @@
   "Solver":
   {
     "Order": 2,
+    "Device": "CPU",
     "Driven":
     {
       "MinFreq": 2.0, // GHz
diff --git a/examples/rings/rings.json b/examples/rings/rings.json
index 7e7ea2c34..668b82101 100644
--- a/examples/rings/rings.json
+++ b/examples/rings/rings.json
@@ -78,6 +78,7 @@
   "Solver":
   {
     "Order": 2,
+    "Device": "CPU",
     "Magnetostatic":
     {
       "Save": 2
diff --git a/examples/spheres/spheres.json b/examples/spheres/spheres.json
index 6a0c93edd..0f672b089 100644
--- a/examples/spheres/spheres.json
+++ b/examples/spheres/spheres.json
@@ -74,6 +74,7 @@
   "Solver":
   {
     "Order": 3,
+    "Device": "CPU",
     "Electrostatic":
     {
       "Save": 2

From d4ecf4ef48c3331b14a75486b5c02c6c49bdb9c2 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg
Date: Mon, 9 Oct 2023 14:42:35 -0700
Subject: [PATCH 2/6] Add documentation for GPU support and partial assembly

---
 CHANGELOG.md                  |  6 ++++++
 README.md                     |  4 ++++
 docs/make.jl                  |  3 ++-
 docs/src/guide/guide.md       |  1 +
 docs/src/guide/parallelism.md | 40 +++++++++++++++++++++++++++++++++++
 docs/src/index.md             |  4 ++++
 docs/src/install.md           |  9 ++++++++
 7 files changed, 66 insertions(+), 1 deletion(-)
 create mode 100644 docs/src/guide/parallelism.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1f18f9b03..dce9a584c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,12 @@ The format of this changelog is based on
   - Added documentation for various timer categories and improved timing breakdown of
     various sections of a simulation.
   - Fixed bug in implementation of numeric wave ports for driven simulations.
+  - Added GPU support for *Palace* via its dependencies, and added the
+    `config["Solver"]["Device"]` and `config["Solver"]["Backend"]` options for runtime
+    configuration of the MFEM device (`"CPU"` or `"GPU"`) and libCEED backend, with suitable
+    defaults for users.
+  - Added a new section to the documentation on
+    [Parallelism and GPU support](https://awslabs.github.io/palace/dev/guide/parallelism/).

 ## [0.12.0] - 2023-12-21

diff --git a/README.md b/README.md
index 1b1f61f84..2b93d537a 100644
--- a/README.md
+++ b/README.md
@@ -39,6 +39,9 @@ the frequency or time domain, using the
     [high-order operator partial assembly](https://mfem.org/performance/), parallel sparse
     direct solvers, and algebraic multigrid (AMG) preconditioners, for fast performance on
     platforms ranging from laptops to HPC systems.
+  - Support for hardware acceleration using NVIDIA or AMD GPUs, including multi-GPU
+    parallelism, using pure CUDA and HIP code as well as [MAGMA](https://icl.utk.edu/magma/)
+    and other libraries.

 ## Getting started

@@ -62,6 +65,7 @@ System requirements:
   - C and Fortran (optional) compilers for dependency builds
   - MPI distribution
   - BLAS, LAPACK libraries
+  - CUDA Toolkit or ROCm installation (optional, for GPU support only)

 ## Documentation

diff --git a/docs/make.jl b/docs/make.jl
index 7db158111..d518706c9 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -23,7 +23,8 @@ makedocs(
             "guide/problem.md",
             "guide/model.md",
             "guide/boundaries.md",
-            "guide/postprocessing.md"
+            "guide/postprocessing.md",
+            "guide/parallelism.md"
         ],
         "Configuration File" => Any[
             "config/config.md",
diff --git a/docs/src/guide/guide.md b/docs/src/guide/guide.md
index d3183c907..f5e7fc879 100644
--- a/docs/src/guide/guide.md
+++ b/docs/src/guide/guide.md
@@ -14,3 +14,4 @@ which can be performed with *Palace* and the various features available in the s
   - [Simulation Models](model.md)
   - [Boundary Conditions](boundaries.md)
   - [Postprocessing and Visualization](postprocessing.md)
+  - [Parallelism and GPU Support](parallelism.md)
diff --git a/docs/src/guide/parallelism.md b/docs/src/guide/parallelism.md
new file mode 100644
index 000000000..81baad27f
--- /dev/null
+++ b/docs/src/guide/parallelism.md
@@ -0,0 +1,40 @@
+```@raw html
+
+
+```
+
+# Parallelism and GPU Support
+
+*Palace* employs multiple types of parallelism in an attempt to maximize performance across
+a wide range of deployment possibilities. The first is MPI-based distributed-memory
+parallelism. This is controlled using the `-np` command line flag as outlined in
+[Running *Palace*](../run.md).
+
+Shared-memory parallelism using OpenMP is also available. To enable this, the
+`-DPALACE_WITH_OPENMP=ON` option should be specified at configure time. At runtime, the
+number of threads is configured with the `-nt` argument to the `palace` executable, or by
+setting the [`OMP_NUM_THREADS`](https://www.openmp.org/spec-html/5.0/openmpse50.html)
+environment variable.
+
+Lastly, *Palace* supports GPU-acceleration using NVIDIA and AMD GPUs, activated with the
+build options `-DPALACE_WITH_CUDA=ON` and `-DPALACE_WITH_HIP=ON`, respectively. At runtime,
+the [`config["Solver"]["Device"]`](../config/solver.md#config%5B%22Solver%22%5D) parameter in the
+configuration file can be set to `"CPU"` (the default) or `"GPU"` in order to configure
+*Palace* and MFEM to use the available GPU(s). The [`config["Solver"]["Backend"]`]
+(../config/solver.md#config["Solver"]) parameter, on the other hand, controls the [libCEED
+backend](https://libceed.org/en/latest/gettingstarted/#backends). Users typically do not
+need to provide a value for this option and can instead rely on *Palace*'s default, which
+selects the most appropriate backend for the given value of [`config["Solver"]["Device"]`]
+(../config/solver.md#config["Solver"]).
+
+In order to take full advantage of the performance benefits made available by GPU-
+acceleration, it is recommended to make use of [operator partial assembly]
+(https://mfem.org/performance/), activated when the value of
+[`config["Solver"]["PartialAssemblyOrder"]`](../config/solver.md#config%5B%22Solver%22%5D) is
+less than [`config["Solver"]["Order"]`](../config/solver.md#config%5B%22Solver%22%5D). This feature
+avoids assembling a global sparse matrix and instead makes use of data structures for
+operators which lend themselves to more efficient asymptotic storage and application costs.
+See also [https://libceed.org/en/latest/intro/](https://libceed.org/en/latest/intro/) for
+more details. Partial assembly in *Palace* supports mixed meshes including both tensor
+product elements (hexahedra and quadrilaterals) as well as non-tensor product elements
+(tetrahedra, prisms, pyramids, and triangles).
diff --git a/docs/src/index.md b/docs/src/index.md
index 306c6505e..931666587 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -42,6 +42,10 @@ the frequency or time domain, using the
     [high-order operator partial assembly](https://mfem.org/performance/), parallel sparse
     direct solvers, and algebraic multigrid (AMG) preconditioners, for fast performance on
     platforms ranging from laptops to HPC systems.
+  - Support for
+    [hardware acceleration using NVIDIA or AMD GPUs](https://libceed.org/en/latest/intro/),
+    including multi-GPU parallelism, using pure CUDA and HIP code as well as
+    [MAGMA](https://icl.utk.edu/magma/) and other libraries.

 ## Contents

diff --git a/docs/src/install.md b/docs/src/install.md
index f947e5296..819e017d9 100644
--- a/docs/src/install.md
+++ b/docs/src/install.md
@@ -56,6 +56,9 @@ A build from source requires the following prerequisites installed on your syste
   - C and Fortran (optional) compilers for dependency builds
   - MPI distribution
   - BLAS, LAPACK libraries (described below in [Math libraries](#Math-libraries))
+  - [CUDA Toolkit](https://www.amd.com/en/graphics/servers-solutions-rocm) or [ROCm]
+    (https://developer.nvidia.com/cuda-toolkit) installation (optional, for GPU support
+    only)

 In addition, builds from source require the following system packages which are typically
 already installed and are available from most package managers (`apt`, `dnf`, `brew`, etc.):
@@ -101,6 +104,9 @@ The *Palace* build respects standard CMake variables, including:
     desired compilers.
   - `CMAKE_CXX_FLAGS`, `CMAKE_C_FLAGS`, and `CMAKE_Fortran_FLAGS` which define the
     corresponding compiler flags.
+  - `CMAKE_CUDA_COMPILER`, `CMAKE_CUDA_FLAGS`, `CMAKE_CUDA_ARCHITECTURES`, and the
+    corresponding `CMAKE_HIP_COMPILER`, `CMAKE_HIP_FLAGS`, and `CMAKE_HIP_ARCHITECTURES` for
+    GPU-accelerated builds with CUDA or HIP.
   - `CMAKE_INSTALL_PREFIX` which specifies the path for installation (if none is provided,
     defaults to ``).
   - `CMAKE_BUILD_TYPE` which defines the build type such as `Release`, `Debug`,
@@ -116,6 +122,9 @@ Additional build options are (with default values in brackets):

   - `PALACE_WITH_64BIT_INT [OFF]` : Build with 64-bit integer support
   - `PALACE_WITH_OPENMP [OFF]` : Use OpenMP for shared-memory parallelism
+  - `PALACE_WITH_CUDA [OFF]` : Use CUDA for NVIDIA GPU support
+  - `PALACE_WITH_HIP [OFF]` : Use HIP for AMD or NVIDIA GPU support
+  - `PALACE_WITH_GPU_AWARE_MPI [OFF]` : Option to set if MPI distribution is GPU aware
   - `PALACE_WITH_SUPERLU [ON]` : Build with SuperLU_DIST sparse direct solver
   - `PALACE_WITH_STRUMPACK [OFF]` : Build with STRUMPACK sparse direct solver
   - `PALACE_WITH_MUMPS [OFF]` : Build with MUMPS sparse direct solver

From ed5a6e83f9f72bcfcecacbdc991a9ba4d9d6efc5 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg
Date: Tue, 24 Oct 2023 15:16:51 -0700
Subject: [PATCH 3/6] Fix a missing link

---
 docs/src/guide/postprocessing.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/guide/postprocessing.md b/docs/src/guide/postprocessing.md
index b67cc60a2..b2151a964 100644
--- a/docs/src/guide/postprocessing.md
+++ b/docs/src/guide/postprocessing.md
@@ -70,7 +70,7 @@ These include:
 ## Boundary postprocessing

 Boundary postprocessing capabilities are enabled by including objects under
-`config["Boundaries"]["Postprocessing"]`](../config/boundaries.md) in the configuration
+[`config["Boundaries"]["Postprocessing"]`](../config/boundaries.md) in the configuration
 file. These include:

   - [`config["Boundaries"]["Postprocessing"]["Capacitance"]`](../config/boundaries.md#boundaries%5B%22Postprocessing%22%5D%5B%22Capacitance%22%5D) :

From 6d797110510179867b79b070531d5315879091e1 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg
Date: Thu, 18 Jan 2024 09:58:19 -0800
Subject: [PATCH 4/6] make format

---
 docs/src/guide/parallelism.md | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/docs/src/guide/parallelism.md b/docs/src/guide/parallelism.md
index 81baad27f..a0a5c57da 100644
--- a/docs/src/guide/parallelism.md
+++ b/docs/src/guide/parallelism.md
@@ -18,23 +18,25 @@ environment variable.

 Lastly, *Palace* supports GPU-acceleration using NVIDIA and AMD GPUs, activated with the
 build options `-DPALACE_WITH_CUDA=ON` and `-DPALACE_WITH_HIP=ON`, respectively. At runtime,
-the [`config["Solver"]["Device"]`](../config/solver.md#config%5B%22Solver%22%5D) parameter in the
-configuration file can be set to `"CPU"` (the default) or `"GPU"` in order to configure
-*Palace* and MFEM to use the available GPU(s). The [`config["Solver"]["Backend"]`]
-(../config/solver.md#config["Solver"]) parameter, on the other hand, controls the [libCEED
-backend](https://libceed.org/en/latest/gettingstarted/#backends). Users typically do not
-need to provide a value for this option and can instead rely on *Palace*'s default, which
-selects the most appropriate backend for the given value of [`config["Solver"]["Device"]`]
-(../config/solver.md#config["Solver"]).
+the [`config["Solver"]["Device"]`](../config/solver.md#config%5B%22Solver%22%5D) parameter
+in the configuration file can be set to `"CPU"` (the default) or `"GPU"` in order to
+configure *Palace* and MFEM to use the available GPU(s). The
+[`config["Solver"]["Backend"]`](../config/solver.md#config%5B%22Solver%22%5D) parameter, on
+the other hand, controls the
+[libCEED backend](https://libceed.org/en/latest/gettingstarted/#backends). Users typically
+do not need to provide a value for this option and can instead rely on *Palace*'s default,
+which selects the most appropriate backend for the given value of
+[`config["Solver"]["Device"]`](../config/solver.md#config%5B%22Solver%22%5D).

 In order to take full advantage of the performance benefits made available by GPU-
 acceleration, it is recommended to make use of [operator partial assembly]
 (https://mfem.org/performance/), activated when the value of
-[`config["Solver"]["PartialAssemblyOrder"]`](../config/solver.md#config%5B%22Solver%22%5D) is
-less than [`config["Solver"]["Order"]`](../config/solver.md#config%5B%22Solver%22%5D). This feature
-avoids assembling a global sparse matrix and instead makes use of data structures for
-operators which lend themselves to more efficient asymptotic storage and application costs.
-See also [https://libceed.org/en/latest/intro/](https://libceed.org/en/latest/intro/) for
-more details. Partial assembly in *Palace* supports mixed meshes including both tensor
-product elements (hexahedra and quadrilaterals) as well as non-tensor product elements
+[`config["Solver"]["PartialAssemblyOrder"]`](../config/solver.md#config%5B%22Solver%22%5D)
+is less than [`config["Solver"]["Order"]`](../config/solver.md#config%5B%22Solver%22%5D).
+This feature avoids assembling a global sparse matrix and instead makes use of data
+structures for operators which lend themselves to more efficient asymptotic storage and
+application costs. See also
+[https://libceed.org/en/latest/intro/](https://libceed.org/en/latest/intro/) for more
+details. Partial assembly in *Palace* supports mixed meshes including both tensor product
+elements (hexahedra and quadrilaterals) as well as non-tensor product elements
 (tetrahedra, prisms, pyramids, and triangles).

From abcf2ef17b55e4a71ff87efca981ec9c50e837c7 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg
Date: Mon, 26 Feb 2024 11:57:42 -0800
Subject: [PATCH 5/6] Address PR feedback: Fix link issues

---
 docs/src/install.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/src/install.md b/docs/src/install.md
index 819e017d9..fc5258bce 100644
--- a/docs/src/install.md
+++ b/docs/src/install.md
@@ -56,8 +56,8 @@ A build from source requires the following prerequisites installed on your syste
   - C and Fortran (optional) compilers for dependency builds
   - MPI distribution
   - BLAS, LAPACK libraries (described below in [Math libraries](#Math-libraries))
-  - [CUDA Toolkit](https://www.amd.com/en/graphics/servers-solutions-rocm) or [ROCm]
-    (https://developer.nvidia.com/cuda-toolkit) installation (optional, for GPU support
+  - [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) or
+    [ROCm](https://rocm.docs.amd.com/en/latest/) installation (optional, for GPU support
     only)

 In addition, builds from source require the following system packages which are typically

From a79d79eef77aea390c8ae12248358f4483bf52f6 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg
Date: Mon, 26 Feb 2024 12:01:24 -0800
Subject: [PATCH 6/6] Fix a missed hyperlink broken by newline

---
 docs/src/guide/parallelism.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/src/guide/parallelism.md b/docs/src/guide/parallelism.md
index a0a5c57da..c04f54b4f 100644
--- a/docs/src/guide/parallelism.md
+++ b/docs/src/guide/parallelism.md
@@ -29,8 +29,8 @@ which selects the most appropriate backend for the given value of
 [`config["Solver"]["Device"]`](../config/solver.md#config%5B%22Solver%22%5D).

 In order to take full advantage of the performance benefits made available by GPU-
-acceleration, it is recommended to make use of [operator partial assembly]
-(https://mfem.org/performance/), activated when the value of
+acceleration, it is recommended to make use of
+[operator partial assembly](https://mfem.org/performance/), activated when the value of
 [`config["Solver"]["PartialAssemblyOrder"]`](../config/solver.md#config%5B%22Solver%22%5D)
 is less than [`config["Solver"]["Order"]`](../config/solver.md#config%5B%22Solver%22%5D).
 This feature avoids assembling a global sparse matrix and instead makes use of data
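For readers skimming the series, a minimal sketch of how the documented options fit together in a `config["Solver"]` block is shown below. The `"GPU"` device value comes straight from the new documentation; the explicit `"Backend"` string is only an illustrative libCEED resource (the guide recommends leaving it unset so *Palace* picks a suitable default), and the surrounding keys mirror the `examples/cavity` configuration touched in the first patch.

```json
{
  "Solver":
  {
    "Order": 4,
    "Device": "GPU",  // "CPU" is the default written into the examples above
    "Backend": "/gpu/cuda/magma",  // illustrative libCEED backend string; normally omitted
    "Eigenmode":
    {
      "N": 15
    }
  }
}
```

Only the `"Solver"` section is sketched here; the rest of the example configuration is unchanged, and runs are launched as before with the `-np` (MPI processes) and, for OpenMP builds, `-nt` (threads) flags described in the new parallelism guide page.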