diff --git a/src/gpu/compute_coupling_cuda.cu b/src/gpu/compute_coupling_cuda.cu index 926ab2aef..defe340bc 100644 --- a/src/gpu/compute_coupling_cuda.cu +++ b/src/gpu/compute_coupling_cuda.cu @@ -35,6 +35,9 @@ /* ----------------------------------------------------------------------------------------------- */ +// coupling direction: elastic wavefield/domain -> acoustic wavefield/domain +// (updates acoustic potential_dot_dot wavefield) + extern EXTERN_LANG void FC_FUNC_(compute_coupling_ac_el_cuda, COMPUTE_COUPLING_AC_EL_CUDA)(long* Mesh_pointer, @@ -131,6 +134,9 @@ void FC_FUNC_(compute_coupling_ac_el_cuda, /* ----------------------------------------------------------------------------------------------- */ +// coupling direction: acoustic wavefield/domain -> elastic wavefield/domain +// (updates elastic acceleration wavefield) + extern EXTERN_LANG void FC_FUNC_(compute_coupling_el_ac_cuda, COMPUTE_COUPLING_EL_AC_CUDA)(long* Mesh_pointer, diff --git a/src/gpu/kernels/Kernel_2_acoustic_impl.cu b/src/gpu/kernels/Kernel_2_acoustic_impl.cu index 9938c290e..9ec177e32 100644 --- a/src/gpu/kernels/Kernel_2_acoustic_impl.cu +++ b/src/gpu/kernels/Kernel_2_acoustic_impl.cu @@ -110,8 +110,8 @@ Kernel_2_acoustic_impl(const int nb_blocks_to_compute, realw* d_gammax,realw* d_gammay,realw* d_gammaz, const realw xix_regular, const realw jacobian_regular, realw_const_p d_hprime_xx, - realw_const_p hprimewgll_xx, - realw_const_p wgllwgll_xy,realw_const_p wgllwgll_xz,realw_const_p wgllwgll_yz, + realw_const_p d_hprimewgll_xx, + realw_const_p d_wgllwgll_xy,realw_const_p d_wgllwgll_xz,realw_const_p d_wgllwgll_yz, realw* d_rhostore, const int use_mesh_coloring_gpu, const int gravity, @@ -184,7 +184,7 @@ Kernel_2_acoustic_impl(const int nb_blocks_to_compute, working_element = bx; #else //mesh coloring - if (use_mesh_coloring_gpu ){ + if (use_mesh_coloring_gpu){ working_element = bx; }else{ // iphase-1 and working_element-1 for Fortran->C array conventions @@ -273,7 +273,7 @@ Kernel_2_acoustic_impl(const int nb_blocks_to_compute, sh_hprime_xx[tx] = d_hprime_xx[tx]; #endif // loads hprimewgll into shared memory - sh_hprimewgll_xx[tx] = hprimewgll_xx[tx]; + sh_hprimewgll_xx[tx] = d_hprimewgll_xx[tx]; } // counts: @@ -287,9 +287,9 @@ Kernel_2_acoustic_impl(const int nb_blocks_to_compute, __syncthreads(); // summed terms with added gll weights - fac1 = wgllwgll_yz[K*NGLLX+J]; - fac2 = wgllwgll_xz[K*NGLLX+I]; - fac3 = wgllwgll_xy[J*NGLLX+I]; + fac1 = d_wgllwgll_yz[K*NGLLX+J]; + fac2 = d_wgllwgll_xz[K*NGLLX+I]; + fac3 = d_wgllwgll_xy[J*NGLLX+I]; // We make a loop over direct and adjoint wavefields inside the GPU kernel to increase arithmetic intensity for (int k = 0 ; k < nb_field ; k++){ @@ -402,7 +402,7 @@ Kernel_2_acoustic_impl(const int nb_blocks_to_compute, #endif // USE_TEXTURES_FIELDS #else // MESH_COLORING //mesh coloring - if (use_mesh_coloring_gpu ){ + if (use_mesh_coloring_gpu){ // no atomic operation needed, colors don't share global points between elements #ifdef USE_TEXTURES_FIELDS if (k==0) d_potential_dot_dot_acoustic[iglob] = texfetch_potential_dot_dot(iglob) + sum_terms; @@ -525,8 +525,8 @@ template __global__ void Kernel_2_acoustic_impl<1>(const int nb_blocks_to_comput realw* d_gammax,realw* d_gammay,realw* d_gammaz, const realw xix_regular, const realw jacobian_regular, realw_const_p d_hprime_xx, - realw_const_p hprimewgll_xx, - realw_const_p wgllwgll_xy,realw_const_p wgllwgll_xz,realw_const_p wgllwgll_yz, + realw_const_p d_hprimewgll_xx, + realw_const_p d_wgllwgll_xy,realw_const_p d_wgllwgll_xz,realw_const_p d_wgllwgll_yz, realw* d_rhostore, const int use_mesh_coloring_gpu, const int gravity, @@ -558,8 +558,8 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute, const int* d_irregular_element_number, const realw xix_regular, const realw jacobian_regular, realw_const_p d_hprime_xx, - realw_const_p hprimewgll_xx, - realw_const_p wgllwgll_xy,realw_const_p wgllwgll_xz,realw_const_p wgllwgll_yz, + realw_const_p d_hprimewgll_xx, + realw_const_p d_wgllwgll_xy,realw_const_p d_wgllwgll_xz,realw_const_p d_wgllwgll_yz, realw* d_rhostore, const int use_mesh_coloring_gpu, const int gravity, @@ -571,14 +571,23 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute, // block-id == number of local element id in phase_ispec array int bx = blockIdx.y*gridDim.x+blockIdx.x; + // checks if anything to do + if (bx >= nb_blocks_to_compute) return; + // thread-id == GLL node id // note: use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads, // because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses; // to avoid execution branching and the need of registers to store an active state variable, // the thread ids are put in valid range int tx = threadIdx.x; + // limits thread ids to range [0,125-1] + if (tx >= NGLL3) tx = NGLL3-1; + + // local index + int K = (tx/NGLL2); + int J = ((tx-K*NGLL2)/NGLLX); + int I = (tx-K*NGLL2-J*NGLLX); - int I,J,K; int iglob,offset; int working_element,ispec_irreg; @@ -602,28 +611,24 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute, __shared__ realw sh_hprime_xx[NGLL2]; __shared__ realw sh_hprimewgll_xx[NGLL2]; - // checks if anything to do - if (bx >= nb_blocks_to_compute) return; - - // limits thread ids to range [0,125-1] - if (tx >= NGLL3) tx = NGLL3-1; - // spectral-element id #ifdef USE_MESH_COLORING_GPU working_element = bx; #else //mesh coloring - if (use_mesh_coloring_gpu ){ + if (use_mesh_coloring_gpu){ working_element = bx; }else{ // iphase-1 and working_element-1 for Fortran->C array conventions - working_element = d_phase_ispec_inner_acoustic[bx + num_phase_ispec_acoustic*(d_iphase-1)]-1; + working_element = d_phase_ispec_inner_acoustic[bx + num_phase_ispec_acoustic*(d_iphase-1)] - 1; } #endif + ispec_irreg = d_irregular_element_number[working_element] - 1; + // local padded index offset = working_element*NGLL3_PADDED + tx; - ispec_irreg = d_irregular_element_number[working_element] - 1; + // global index iglob = d_ibool[offset] - 1; @@ -644,11 +649,6 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute, // gravity if (gravity) kappa_invl = 1.f / d_kappastore[working_element*NGLL3 + tx]; - // local index - K = (tx/NGLL2); - J = ((tx-K*NGLL2)/NGLLX); - I = (tx-K*NGLL2-J*NGLLX); - // calculates laplacian if (ispec_irreg >= 0){ //irregular_element @@ -680,7 +680,7 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute, sh_hprime_xx[tx] = d_hprime_xx[tx]; #endif // loads hprimewgll into shared memory - sh_hprimewgll_xx[tx] = hprimewgll_xx[tx]; + sh_hprimewgll_xx[tx] = d_hprimewgll_xx[tx]; } // synchronize all the threads (one thread for each of the NGLL grid points of the @@ -753,9 +753,9 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute, } // summed terms with added gll weights - fac1 = wgllwgll_yz[K*NGLLX+J]; - fac2 = wgllwgll_xz[K*NGLLX+I]; - fac3 = wgllwgll_xy[J*NGLLX+I]; + fac1 = d_wgllwgll_yz[K*NGLLX+J]; + fac2 = d_wgllwgll_xz[K*NGLLX+I]; + fac3 = d_wgllwgll_xy[J*NGLLX+I]; sum_terms = -(fac1*temp1l + fac2*temp2l + fac3*temp3l); @@ -777,7 +777,7 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute, #endif // USE_TEXTURES_FIELDS #else // MESH_COLORING //mesh coloring - if (use_mesh_coloring_gpu ){ + if (use_mesh_coloring_gpu){ // no atomic operation needed, colors don't share global points between elements #ifdef USE_TEXTURES_FIELDS if (FORWARD_OR_ADJOINT == 3){ @@ -817,8 +817,8 @@ Kernel_2_acoustic_perf_impl(const int nb_blocks_to_compute, realw* d_etax,realw* d_etay,realw* d_etaz, realw* d_gammax,realw* d_gammay,realw* d_gammaz, realw_const_p d_hprime_xx, - realw_const_p hprimewgll_xx, - realw_const_p wgllwgll_xy,realw_const_p wgllwgll_xz,realw_const_p wgllwgll_yz, + realw_const_p d_hprimewgll_xx, + realw_const_p d_wgllwgll_xy,realw_const_p d_wgllwgll_xz,realw_const_p d_wgllwgll_yz, realw* d_rhostore, const int use_mesh_coloring_gpu, const int gravity, @@ -927,7 +927,7 @@ Kernel_2_acoustic_perf_impl(const int nb_blocks_to_compute, sh_hprime_xx[tx] = d_hprime_xx[tx]; #endif // loads hprimewgll into shared memory - sh_hprimewgll_xx[tx] = hprimewgll_xx[tx]; + sh_hprimewgll_xx[tx] = d_hprimewgll_xx[tx]; } // synchronize all the threads (one thread for each of the NGLL grid points of the @@ -984,9 +984,9 @@ Kernel_2_acoustic_perf_impl(const int nb_blocks_to_compute, } // summed terms with added gll weights - fac1 = wgllwgll_yz[K*NGLLX+J]; - fac2 = wgllwgll_xz[K*NGLLX+I]; - fac3 = wgllwgll_xy[J*NGLLX+I]; + fac1 = d_wgllwgll_yz[K*NGLLX+J]; + fac2 = d_wgllwgll_xz[K*NGLLX+I]; + fac3 = d_wgllwgll_xy[J*NGLLX+I]; sum_terms = -(fac1*temp1l + fac2*temp2l + fac3*temp3l); diff --git a/src/gpu/kernels/kernel_3_acoustic_cuda_device.cu b/src/gpu/kernels/kernel_3_acoustic_cuda_device.cu index a724c1cb1..ab742b9f0 100644 --- a/src/gpu/kernels/kernel_3_acoustic_cuda_device.cu +++ b/src/gpu/kernels/kernel_3_acoustic_cuda_device.cu @@ -40,18 +40,18 @@ __global__ void kernel_3_acoustic_cuda_device(field* potential_dot_acoustic, int id = threadIdx.x + (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x; - realw rmass; - field p_dot_dot; // because of block and grid sizing problems, there is a small // amount of buffer at the end of the calculation if (id < size) { - rmass = rmass_acoustic[id]; + realw rmass = rmass_acoustic[id]; // multiplies pressure with the inverse of the mass matrix - p_dot_dot = rmass*potential_dot_dot_acoustic[id]; + field p_dot_dot = rmass * potential_dot_dot_acoustic[id]; + potential_dot_dot_acoustic[id] = p_dot_dot; potential_dot_acoustic[id] += deltatover2*p_dot_dot; - if (simulation_type==3) { - p_dot_dot = rmass*b_potential_dot_dot_acoustic[id]; + + if (simulation_type == 3) { + p_dot_dot = rmass * b_potential_dot_dot_acoustic[id]; b_potential_dot_dot_acoustic[id] = p_dot_dot; b_potential_dot_acoustic[id] += b_deltatover2*p_dot_dot; } diff --git a/src/gpu/mesh_constants_gpu.h b/src/gpu/mesh_constants_gpu.h index 402edb266..414a38901 100644 --- a/src/gpu/mesh_constants_gpu.h +++ b/src/gpu/mesh_constants_gpu.h @@ -124,9 +124,27 @@ typedef double realw; #endif // maximum function +#if !defined(MAX) #define MAX(x,y) (((x) < (y)) ? (y) : (x)) +#endif // minimum function +#if !defined(MIN) #define MIN(a,b) (((a) > (b)) ? (b) : (a)) +#endif + +// HIP +#ifdef USE_HIP +// for HIP-CPU installation +#if defined(__HIP_CPU_RT__) +// forces __forceinline__ keyword to be inline to avoid "duplicate symbol.." linking errors +#if defined(__forceinline__) +#undef __forceinline__ +#endif +#define __forceinline__ inline +// or +//#define __forceinline__ __attribute__((always_inline)) inline +#endif +#endif /* ----------------------------------------------------------------------------------------------- */ @@ -260,15 +278,11 @@ typedef double realw; /* ----------------------------------------------------------------------------------------------- */ -// type of "working" variables: see also CUSTOM_REAL -// double precision temporary variables leads to 10% performance decrease -// in Kernel_2_impl (not very much..) -typedef float realw; - // textures // note: texture templates are supported only for CUDA versions <= 11.x // since CUDA 12.x, these are deprecated and texture objects should be used instead // see: https://developer.nvidia.com/blog/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/ +#if CUSTOM_REAL == 4 #if defined(USE_TEXTURES_FIELDS) || defined(USE_TEXTURES_CONSTANTS) #ifdef USE_CUDA typedef texture realw_texture; @@ -277,6 +291,16 @@ typedef texture realw_texture typedef texture realw_texture; #endif #endif +#elif CUSTOM_REAL == 8 +#if defined(USE_TEXTURES_FIELDS) || defined(USE_TEXTURES_CONSTANTS) +#ifdef USE_CUDA +typedef texture realw_texture; +#endif +#ifdef USE_HIP +typedef texture realw_texture; +#endif +#endif +#endif // pointer declarations // restricted pointers: can improve performance on Kepler ~ 10% diff --git a/src/gpu/update_displacement_cuda.cu b/src/gpu/update_displacement_cuda.cu index 8156f87d7..c42acd9be 100644 --- a/src/gpu/update_displacement_cuda.cu +++ b/src/gpu/update_displacement_cuda.cu @@ -455,6 +455,7 @@ void FC_FUNC_(kernel_3_acoustic_cuda, Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper int FORWARD_OR_ADJOINT = *FORWARD_OR_ADJOINT_f; + // safety check if (FORWARD_OR_ADJOINT != 0 && FORWARD_OR_ADJOINT != 1 && FORWARD_OR_ADJOINT != 3) { exit_on_error("Error invalid FORWARD_OR_ADJOINT in Kernel_2_acoustic() routine");