diff --git a/src/gpu/compute_coupling_cuda.cu b/src/gpu/compute_coupling_cuda.cu
index 926ab2aef..defe340bc 100644
--- a/src/gpu/compute_coupling_cuda.cu
+++ b/src/gpu/compute_coupling_cuda.cu
@@ -35,6 +35,9 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+// coupling direction: elastic wavefield/domain -> acoustic wavefield/domain
+//                                                 (updates acoustic potential_dot_dot wavefield)
+
 extern EXTERN_LANG
 void FC_FUNC_(compute_coupling_ac_el_cuda,
               COMPUTE_COUPLING_AC_EL_CUDA)(long* Mesh_pointer,
@@ -131,6 +134,9 @@ void FC_FUNC_(compute_coupling_ac_el_cuda,
 
 /* ----------------------------------------------------------------------------------------------- */
 
+// coupling direction: acoustic wavefield/domain -> elastic wavefield/domain
+//                                                  (updates elastic acceleration wavefield)
+
 extern EXTERN_LANG
 void FC_FUNC_(compute_coupling_el_ac_cuda,
               COMPUTE_COUPLING_EL_AC_CUDA)(long* Mesh_pointer,
diff --git a/src/gpu/kernels/Kernel_2_acoustic_impl.cu b/src/gpu/kernels/Kernel_2_acoustic_impl.cu
index 9938c290e..9ec177e32 100644
--- a/src/gpu/kernels/Kernel_2_acoustic_impl.cu
+++ b/src/gpu/kernels/Kernel_2_acoustic_impl.cu
@@ -110,8 +110,8 @@ Kernel_2_acoustic_impl(const int nb_blocks_to_compute,
                        realw* d_gammax,realw* d_gammay,realw* d_gammaz,
                        const realw xix_regular, const realw jacobian_regular,
                        realw_const_p d_hprime_xx,
-                       realw_const_p hprimewgll_xx,
-                       realw_const_p wgllwgll_xy,realw_const_p wgllwgll_xz,realw_const_p wgllwgll_yz,
+                       realw_const_p d_hprimewgll_xx,
+                       realw_const_p d_wgllwgll_xy,realw_const_p d_wgllwgll_xz,realw_const_p d_wgllwgll_yz,
                        realw* d_rhostore,
                        const int use_mesh_coloring_gpu,
                        const int gravity,
@@ -184,7 +184,7 @@ Kernel_2_acoustic_impl(const int nb_blocks_to_compute,
   working_element = bx;
 #else
   //mesh coloring
-  if (use_mesh_coloring_gpu ){
+  if (use_mesh_coloring_gpu){
     working_element = bx;
   }else{
     // iphase-1 and working_element-1 for Fortran->C array conventions
@@ -273,7 +273,7 @@ Kernel_2_acoustic_impl(const int nb_blocks_to_compute,
     sh_hprime_xx[tx] = d_hprime_xx[tx];
 #endif
     // loads hprimewgll into shared memory
-    sh_hprimewgll_xx[tx] = hprimewgll_xx[tx];
+    sh_hprimewgll_xx[tx] = d_hprimewgll_xx[tx];
   }
 
 // counts:
@@ -287,9 +287,9 @@ Kernel_2_acoustic_impl(const int nb_blocks_to_compute,
   __syncthreads();
 
   // summed terms with added gll weights
-  fac1 = wgllwgll_yz[K*NGLLX+J];
-  fac2 = wgllwgll_xz[K*NGLLX+I];
-  fac3 = wgllwgll_xy[J*NGLLX+I];
+  fac1 = d_wgllwgll_yz[K*NGLLX+J];
+  fac2 = d_wgllwgll_xz[K*NGLLX+I];
+  fac3 = d_wgllwgll_xy[J*NGLLX+I];
 
   // We make a loop over direct and adjoint wavefields inside the GPU kernel to increase arithmetic intensity
   for (int k = 0 ; k < nb_field ; k++){
@@ -402,7 +402,7 @@ Kernel_2_acoustic_impl(const int nb_blocks_to_compute,
 #endif // USE_TEXTURES_FIELDS
 #else  // MESH_COLORING
     //mesh coloring
-    if (use_mesh_coloring_gpu ){
+    if (use_mesh_coloring_gpu){
       // no atomic operation needed, colors don't share global points between elements
 #ifdef USE_TEXTURES_FIELDS
     if (k==0) d_potential_dot_dot_acoustic[iglob] = texfetch_potential_dot_dot<FORWARD_OR_ADJOINT>(iglob) + sum_terms;
@@ -525,8 +525,8 @@ template __global__ void Kernel_2_acoustic_impl<1>(const int nb_blocks_to_comput
                                                    realw* d_gammax,realw* d_gammay,realw* d_gammaz,
                                                    const realw xix_regular, const realw jacobian_regular,
                                                    realw_const_p d_hprime_xx,
-                                                   realw_const_p hprimewgll_xx,
-                                                   realw_const_p wgllwgll_xy,realw_const_p wgllwgll_xz,realw_const_p wgllwgll_yz,
+                                                   realw_const_p d_hprimewgll_xx,
+                                                   realw_const_p d_wgllwgll_xy,realw_const_p d_wgllwgll_xz,realw_const_p d_wgllwgll_yz,
                                                    realw* d_rhostore,
                                                    const int use_mesh_coloring_gpu,
                                                    const int gravity,
@@ -558,8 +558,8 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute,
                               const int* d_irregular_element_number,
                               const realw xix_regular, const realw jacobian_regular,
                               realw_const_p d_hprime_xx,
-                              realw_const_p hprimewgll_xx,
-                              realw_const_p wgllwgll_xy,realw_const_p wgllwgll_xz,realw_const_p wgllwgll_yz,
+                              realw_const_p d_hprimewgll_xx,
+                              realw_const_p d_wgllwgll_xy,realw_const_p d_wgllwgll_xz,realw_const_p d_wgllwgll_yz,
                               realw* d_rhostore,
                               const int use_mesh_coloring_gpu,
                               const int gravity,
@@ -571,14 +571,23 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute,
   // block-id == number of local element id in phase_ispec array
   int bx = blockIdx.y*gridDim.x+blockIdx.x;
 
+  // checks if anything to do
+  if (bx >= nb_blocks_to_compute) return;
+
   // thread-id == GLL node id
   // note: use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
   //       because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses;
   //       to avoid execution branching and the need of registers to store an active state variable,
   //       the thread ids are put in valid range
   int tx = threadIdx.x;
+  // limits thread ids to range [0,125-1]
+  if (tx >= NGLL3) tx = NGLL3-1;
+
+  // local index
+  int K = (tx/NGLL2);
+  int J = ((tx-K*NGLL2)/NGLLX);
+  int I = (tx-K*NGLL2-J*NGLLX);
 
-  int I,J,K;
   int iglob,offset;
   int working_element,ispec_irreg;
 
@@ -602,28 +611,24 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute,
   __shared__ realw sh_hprime_xx[NGLL2];
   __shared__ realw sh_hprimewgll_xx[NGLL2];
 
-  // checks if anything to do
-  if (bx >= nb_blocks_to_compute) return;
-
-  // limits thread ids to range [0,125-1]
-  if (tx >= NGLL3) tx = NGLL3-1;
-
   // spectral-element id
 #ifdef USE_MESH_COLORING_GPU
   working_element = bx;
 #else
   //mesh coloring
-  if (use_mesh_coloring_gpu ){
+  if (use_mesh_coloring_gpu){
     working_element = bx;
   }else{
     // iphase-1 and working_element-1 for Fortran->C array conventions
-    working_element = d_phase_ispec_inner_acoustic[bx + num_phase_ispec_acoustic*(d_iphase-1)]-1;
+    working_element = d_phase_ispec_inner_acoustic[bx + num_phase_ispec_acoustic*(d_iphase-1)] - 1;
   }
 #endif
 
+  ispec_irreg = d_irregular_element_number[working_element] - 1;
+
   // local padded index
   offset = working_element*NGLL3_PADDED + tx;
-  ispec_irreg = d_irregular_element_number[working_element] - 1;
+
   // global index
   iglob = d_ibool[offset] - 1;
 
@@ -644,11 +649,6 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute,
   // gravity
   if (gravity) kappa_invl = 1.f / d_kappastore[working_element*NGLL3 + tx];
 
-  // local index
-  K = (tx/NGLL2);
-  J = ((tx-K*NGLL2)/NGLLX);
-  I = (tx-K*NGLL2-J*NGLLX);
-
   // calculates laplacian
   if (ispec_irreg >= 0){
     //irregular_element
@@ -680,7 +680,7 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute,
     sh_hprime_xx[tx] = d_hprime_xx[tx];
 #endif
     // loads hprimewgll into shared memory
-    sh_hprimewgll_xx[tx] = hprimewgll_xx[tx];
+    sh_hprimewgll_xx[tx] = d_hprimewgll_xx[tx];
   }
 
   // synchronize all the threads (one thread for each of the NGLL grid points of the
@@ -753,9 +753,9 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute,
   }
 
   // summed terms with added gll weights
-  fac1 = wgllwgll_yz[K*NGLLX+J];
-  fac2 = wgllwgll_xz[K*NGLLX+I];
-  fac3 = wgllwgll_xy[J*NGLLX+I];
+  fac1 = d_wgllwgll_yz[K*NGLLX+J];
+  fac2 = d_wgllwgll_xz[K*NGLLX+I];
+  fac3 = d_wgllwgll_xy[J*NGLLX+I];
 
   sum_terms = -(fac1*temp1l + fac2*temp2l + fac3*temp3l);
 
@@ -777,7 +777,7 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute,
 #endif // USE_TEXTURES_FIELDS
 #else  // MESH_COLORING
     //mesh coloring
-    if (use_mesh_coloring_gpu ){
+    if (use_mesh_coloring_gpu){
       // no atomic operation needed, colors don't share global points between elements
 #ifdef USE_TEXTURES_FIELDS
       if (FORWARD_OR_ADJOINT == 3){
@@ -817,8 +817,8 @@ Kernel_2_acoustic_perf_impl(const int nb_blocks_to_compute,
                             realw* d_etax,realw* d_etay,realw* d_etaz,
                             realw* d_gammax,realw* d_gammay,realw* d_gammaz,
                             realw_const_p d_hprime_xx,
-                            realw_const_p hprimewgll_xx,
-                            realw_const_p wgllwgll_xy,realw_const_p wgllwgll_xz,realw_const_p wgllwgll_yz,
+                            realw_const_p d_hprimewgll_xx,
+                            realw_const_p d_wgllwgll_xy,realw_const_p d_wgllwgll_xz,realw_const_p d_wgllwgll_yz,
                             realw* d_rhostore,
                             const int use_mesh_coloring_gpu,
                             const int gravity,
@@ -927,7 +927,7 @@ Kernel_2_acoustic_perf_impl(const int nb_blocks_to_compute,
     sh_hprime_xx[tx] = d_hprime_xx[tx];
 #endif
     // loads hprimewgll into shared memory
-    sh_hprimewgll_xx[tx] = hprimewgll_xx[tx];
+    sh_hprimewgll_xx[tx] = d_hprimewgll_xx[tx];
   }
 
   // synchronize all the threads (one thread for each of the NGLL grid points of the
@@ -984,9 +984,9 @@ Kernel_2_acoustic_perf_impl(const int nb_blocks_to_compute,
   }
 
   // summed terms with added gll weights
-  fac1 = wgllwgll_yz[K*NGLLX+J];
-  fac2 = wgllwgll_xz[K*NGLLX+I];
-  fac3 = wgllwgll_xy[J*NGLLX+I];
+  fac1 = d_wgllwgll_yz[K*NGLLX+J];
+  fac2 = d_wgllwgll_xz[K*NGLLX+I];
+  fac3 = d_wgllwgll_xy[J*NGLLX+I];
 
   sum_terms = -(fac1*temp1l + fac2*temp2l + fac3*temp3l);
 
diff --git a/src/gpu/kernels/kernel_3_acoustic_cuda_device.cu b/src/gpu/kernels/kernel_3_acoustic_cuda_device.cu
index a724c1cb1..ab742b9f0 100644
--- a/src/gpu/kernels/kernel_3_acoustic_cuda_device.cu
+++ b/src/gpu/kernels/kernel_3_acoustic_cuda_device.cu
@@ -40,18 +40,18 @@ __global__ void kernel_3_acoustic_cuda_device(field* potential_dot_acoustic,
 
   int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x;
 
-  realw rmass;
-  field p_dot_dot;
   // because of block and grid sizing problems, there is a small
   // amount of buffer at the end of the calculation
   if (id < size) {
-    rmass = rmass_acoustic[id];
+    realw rmass = rmass_acoustic[id];
     // multiplies pressure with the inverse of the mass matrix
-    p_dot_dot = rmass*potential_dot_dot_acoustic[id];
+    field p_dot_dot = rmass * potential_dot_dot_acoustic[id];
+
     potential_dot_dot_acoustic[id] = p_dot_dot;
     potential_dot_acoustic[id] += deltatover2*p_dot_dot;
-    if (simulation_type==3) {
-      p_dot_dot = rmass*b_potential_dot_dot_acoustic[id];
+
+    if (simulation_type == 3) {
+      p_dot_dot = rmass * b_potential_dot_dot_acoustic[id];
       b_potential_dot_dot_acoustic[id] = p_dot_dot;
       b_potential_dot_acoustic[id] += b_deltatover2*p_dot_dot;
     }
diff --git a/src/gpu/mesh_constants_gpu.h b/src/gpu/mesh_constants_gpu.h
index 402edb266..414a38901 100644
--- a/src/gpu/mesh_constants_gpu.h
+++ b/src/gpu/mesh_constants_gpu.h
@@ -124,9 +124,27 @@ typedef double realw;
 #endif
 
 // maximum function
+#if !defined(MAX)
 #define MAX(x,y)     (((x) < (y)) ? (y) : (x))
+#endif
 // minimum function
+#if !defined(MIN)
 #define MIN(a,b)     (((a) > (b)) ? (b) : (a))
+#endif
+
+// HIP
+#ifdef USE_HIP
+// for HIP-CPU installation
+#if defined(__HIP_CPU_RT__)
+// forces __forceinline__ keyword to be inline to avoid "duplicate symbol.." linking errors
+#if defined(__forceinline__)
+#undef __forceinline__
+#endif
+#define __forceinline__ inline
+// or
+//#define __forceinline__ __attribute__((always_inline)) inline
+#endif
+#endif
 
 /* ----------------------------------------------------------------------------------------------- */
 
@@ -260,15 +278,11 @@ typedef double realw;
 
 /* ----------------------------------------------------------------------------------------------- */
 
-// type of "working" variables: see also CUSTOM_REAL
-// double precision temporary variables leads to 10% performance decrease
-// in Kernel_2_impl (not very much..)
-typedef float realw;
-
 // textures
 // note: texture templates are supported only for CUDA versions <= 11.x
 //       since CUDA 12.x, these are deprecated and texture objects should be used instead
 //       see: https://developer.nvidia.com/blog/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
+#if CUSTOM_REAL == 4
 #if defined(USE_TEXTURES_FIELDS) || defined(USE_TEXTURES_CONSTANTS)
 #ifdef USE_CUDA
 typedef texture<float, cudaTextureType1D, cudaReadModeElementType> realw_texture;
@@ -277,6 +291,16 @@ typedef texture<float, cudaTextureType1D, cudaReadModeElementType> realw_texture
 typedef texture<float, hipTextureType1D, hipReadModeElementType> realw_texture;
 #endif
 #endif
+#elif CUSTOM_REAL == 8
+#if defined(USE_TEXTURES_FIELDS) || defined(USE_TEXTURES_CONSTANTS)
+#ifdef USE_CUDA
+typedef texture<double, cudaTextureType1D, cudaReadModeElementType> realw_texture;
+#endif
+#ifdef USE_HIP
+typedef texture<double, hipTextureType1D, hipReadModeElementType> realw_texture;
+#endif
+#endif
+#endif
 
 // pointer declarations
 // restricted pointers: can improve performance on Kepler ~ 10%
diff --git a/src/gpu/update_displacement_cuda.cu b/src/gpu/update_displacement_cuda.cu
index 8156f87d7..c42acd9be 100644
--- a/src/gpu/update_displacement_cuda.cu
+++ b/src/gpu/update_displacement_cuda.cu
@@ -455,6 +455,7 @@ void FC_FUNC_(kernel_3_acoustic_cuda,
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
   int FORWARD_OR_ADJOINT = *FORWARD_OR_ADJOINT_f;
+
   // safety check
   if (FORWARD_OR_ADJOINT != 0 && FORWARD_OR_ADJOINT != 1 && FORWARD_OR_ADJOINT != 3) {
     exit_on_error("Error invalid FORWARD_OR_ADJOINT in Kernel_2_acoustic() routine");