diff --git a/README.md b/README.md index ce23d3ea..06a641e4 100644 --- a/README.md +++ b/README.md @@ -242,6 +242,8 @@ Here is the brief information on each version: `src/kokkos_src_v6_2`: is a variant of `src/kokkos_src_v6`, where inner parallel_for constructs are manually fused; the new best performing version. +`src/kokkos_src_v7`: has the same user data layouts and compute patterns as the P2R Kokkos version. + Here we have basic instructions. #### Getting started @@ -274,6 +276,9 @@ $ make COMPILER=nvcc MODE=kokkosv6 INCLUDE_DATA=1 USE_FMAD=1 USE_GPU=1 PREPIN_HO $ make COMPILER=nvcc MODE=kokkosv6 INCLUDE_DATA=0 USE_FMAD=1 USE_GPU=1 PREPIN_HOSTMEM=1 #work only for NVIDIA GPUs $ make COMPILER=nvcc MODE=kokkosv6_2 INCLUDE_DATA=0 USE_FMAD=1 USE_GPU=1 PREPIN_HOSTMEM=1 #work only for NVIDIA GPUs $ make COMPILER=nvcc MODE=kokkosv5_2 INCLUDE_DATA=0 USE_FMAD=1 USE_GPU=1 KOKKOS_ARCH=AMPERE86 +$ make COMPILER=nvcc MODE=kokkosv7 INCLUDE_DATA=0 USE_FMAD=1 USE_GPU=1 +$ make COMPILER=gcc MODE=kokkosv7 USE_GPU=0 KOKKOS_ARCH=POWER9 + $ make MODE=kokkosv6 clean ``` diff --git a/src/kokkos_src_v7/propagate-toz-test_Kokkos_v7.cpp b/src/kokkos_src_v7/propagate-toz-test_Kokkos_v7.cpp index 6eee348a..74e1b63e 100644 --- a/src/kokkos_src_v7/propagate-toz-test_Kokkos_v7.cpp +++ b/src/kokkos_src_v7/propagate-toz-test_Kokkos_v7.cpp @@ -722,26 +722,26 @@ KOKKOS_FUNCTION void propagateToZ(const MP6x6SF_ &inErr_, const MP6F_ &inP const float sCosPsina = sinf(cosP*sina); const float cCosPsina = cosf(cosP*sina); - //for (size_t i=0;i<6;++i) errorProp[N*PosInMtrx(i,i,6,N) + it] = 1.0f; - errorProp[N*PosInMtrx(0,0,6,N) + it] = 1.0f; - errorProp[N*PosInMtrx(1,1,6,N) + it] = 1.0f; - errorProp[N*PosInMtrx(2,2,6,N) + it] = 1.0f; - errorProp[N*PosInMtrx(3,3,6,N) + it] = 1.0f; - errorProp[N*PosInMtrx(4,4,6,N) + it] = 1.0f; - errorProp[N*PosInMtrx(5,5,6,N) + it] = 1.0f; + //for (size_t i=0;i<6;++i) errorProp[PosInMtrx(i,i,6,N) + it] = 1.0f; + errorProp[PosInMtrx(0,0,6,N) + it] = 1.0f; + errorProp[PosInMtrx(1,1,6,N) + it] = 1.0f; + errorProp[PosInMtrx(2,2,6,N) + it] = 1.0f; + errorProp[PosInMtrx(3,3,6,N) + it] = 1.0f; + errorProp[PosInMtrx(4,4,6,N) + it] = 1.0f; + errorProp[PosInMtrx(5,5,6,N) + it] = 1.0f; //[Dec. 21, 2022] Added to have the same pattern as the cudauvm version. - errorProp[N*PosInMtrx(0,1,6,N) + it] = 0.0f; - errorProp[N*PosInMtrx(0,2,6,N) + it] = cosP*sinT*(sinP*cosa*sCosPsina-cosa)*icosT; - errorProp[N*PosInMtrx(0,3,6,N) + it] = cosP*sinT*deltaZ*cosa*(1.0f-sinP*sCosPsina)*(icosT*pt)-k*(cosP*sina-sinP*(1.0f-cCosPsina))*(pt*pt); - errorProp[N*PosInMtrx(0,4,6,N) + it] = (k*pt)*(-sinP*sina+sinP*sinP*sina*sCosPsina-cosP*(1.0f-cCosPsina)); - errorProp[N*PosInMtrx(0,5,6,N) + it] = cosP*deltaZ*cosa*(1.0f-sinP*sCosPsina)*(icosT*icosT); - errorProp[N*PosInMtrx(1,2,6,N) + it] = cosa*sinT*(cosP*cosP*sCosPsina-sinP)*icosT; - errorProp[N*PosInMtrx(1,3,6,N) + it] = sinT*deltaZ*cosa*(cosP*cosP*sCosPsina+sinP)*(icosT*pt)-k*(sinP*sina+cosP*(1.0f-cCosPsina))*(pt*pt); - errorProp[N*PosInMtrx(1,4,6,N) + it] = (k*pt)*(-sinP*(1.0f-cCosPsina)-sinP*cosP*sina*sCosPsina+cosP*sina); - errorProp[N*PosInMtrx(1,5,6,N) + it] = deltaZ*cosa*(cosP*cosP*sCosPsina+sinP)*(icosT*icosT); - errorProp[N*PosInMtrx(4,2,6,N) + it] = -inPar_(iparIpt,it)*sinT*(icosTk); - errorProp[N*PosInMtrx(4,3,6,N) + it] = sinT*deltaZ*(icosTk); - errorProp[N*PosInMtrx(4,5,6,N) + it] = inPar_(iparIpt,it)*deltaZ*(icosT*icosTk); + errorProp[PosInMtrx(0,1,6,N) + it] = 0.0f; + errorProp[PosInMtrx(0,2,6,N) + it] = cosP*sinT*(sinP*cosa*sCosPsina-cosa)*icosT; + errorProp[PosInMtrx(0,3,6,N) + it] = cosP*sinT*deltaZ*cosa*(1.0f-sinP*sCosPsina)*(icosT*pt)-k*(cosP*sina-sinP*(1.0f-cCosPsina))*(pt*pt); + errorProp[PosInMtrx(0,4,6,N) + it] = (k*pt)*(-sinP*sina+sinP*sinP*sina*sCosPsina-cosP*(1.0f-cCosPsina)); + errorProp[PosInMtrx(0,5,6,N) + it] = cosP*deltaZ*cosa*(1.0f-sinP*sCosPsina)*(icosT*icosT); + errorProp[PosInMtrx(1,2,6,N) + it] = cosa*sinT*(cosP*cosP*sCosPsina-sinP)*icosT; + errorProp[PosInMtrx(1,3,6,N) + it] = sinT*deltaZ*cosa*(cosP*cosP*sCosPsina+sinP)*(icosT*pt)-k*(sinP*sina+cosP*(1.0f-cCosPsina))*(pt*pt); + errorProp[PosInMtrx(1,4,6,N) + it] = (k*pt)*(-sinP*(1.0f-cCosPsina)-sinP*cosP*sina*sCosPsina+cosP*sina); + errorProp[PosInMtrx(1,5,6,N) + it] = deltaZ*cosa*(cosP*cosP*sCosPsina+sinP)*(icosT*icosT); + errorProp[PosInMtrx(4,2,6,N) + it] = -inPar_(iparIpt,it)*sinT*(icosTk); + errorProp[PosInMtrx(4,3,6,N) + it] = sinT*deltaZ*(icosTk); + errorProp[PosInMtrx(4,5,6,N) + it] = inPar_(iparIpt,it)*deltaZ*(icosT*icosTk); } //}); //